readr/0000755000176200001440000000000014547623042011356 5ustar liggesusersreadr/NAMESPACE0000644000176200001440000000670214547552156012611 0ustar liggesusers# Generated by roxygen2: do not edit by hand S3method("[",spec_tbl_df) S3method(as.character,col_spec) S3method(as.col_spec,"NULL") S3method(as.col_spec,character) S3method(as.col_spec,col_spec) S3method(as.col_spec,data.frame) S3method(as.col_spec,default) S3method(as.col_spec,list) S3method(as.data.frame,spec_tbl_df) S3method(as_tibble,spec_tbl_df) S3method(format,col_spec) S3method(output_column,POSIXt) S3method(output_column,default) S3method(output_column,double) S3method(print,col_spec) S3method(print,collector) S3method(print,date_names) S3method(print,locale) S3method(str,col_spec) S3method(type_to_col,Date) S3method(type_to_col,POSIXct) S3method(type_to_col,default) S3method(type_to_col,double) S3method(type_to_col,factor) S3method(type_to_col,hms) S3method(type_to_col,integer) S3method(type_to_col,logical) export(AccumulateCallback) export(ChunkCallback) export(DataFrameCallback) export(ListCallback) export(SideEffectChunkCallback) export(as.col_spec) export(clipboard) export(col_character) export(col_date) export(col_datetime) export(col_double) export(col_factor) export(col_guess) export(col_integer) export(col_logical) export(col_number) export(col_skip) export(col_time) export(cols) export(cols_condense) export(cols_only) export(count_fields) export(datasource) export(date_names) export(date_names_lang) export(date_names_langs) export(default_locale) export(edition_get) export(format_csv) export(format_csv2) export(format_delim) export(format_tsv) export(fwf_cols) export(fwf_empty) export(fwf_positions) export(fwf_widths) export(guess_encoding) export(guess_parser) export(local_edition) export(locale) export(melt_csv) export(melt_csv2) export(melt_csv2_chunked) export(melt_csv_chunked) export(melt_delim) export(melt_delim_chunked) export(melt_fwf) export(melt_table) export(melt_table2) export(melt_tsv) export(melt_tsv_chunked) export(output_column) export(parse_character) export(parse_date) export(parse_datetime) export(parse_double) export(parse_factor) export(parse_guess) export(parse_integer) export(parse_logical) export(parse_number) export(parse_time) export(parse_vector) export(problems) export(read_builtin) export(read_csv) export(read_csv2) export(read_csv2_chunked) export(read_csv_chunked) export(read_delim) export(read_delim_chunked) export(read_file) export(read_file_raw) export(read_fwf) export(read_lines) export(read_lines_chunked) export(read_lines_raw) export(read_lines_raw_chunked) export(read_log) export(read_rds) export(read_table) export(read_table2) export(read_tsv) export(read_tsv_chunked) export(readr_example) export(readr_threads) export(should_read_lazy) export(should_show_types) export(show_progress) export(spec) export(spec_csv) export(spec_csv2) export(spec_delim) export(spec_table) export(spec_tsv) export(stop_for_problems) export(tokenize) export(tokenizer_csv) export(tokenizer_delim) export(tokenizer_fwf) export(tokenizer_line) export(tokenizer_log) export(tokenizer_tsv) export(tokenizer_ws) export(type_convert) export(with_edition) export(write_csv) export(write_csv2) export(write_delim) export(write_excel_csv) export(write_excel_csv2) export(write_file) export(write_lines) export(write_rds) export(write_tsv) importFrom(R6,R6Class) importFrom(hms,hms) importFrom(lifecycle,deprecate_soft) importFrom(lifecycle,deprecate_warn) importFrom(lifecycle,deprecated) importFrom(lifecycle,is_present) 
importFrom(methods,setOldClass) importFrom(tibble,as_tibble) importFrom(tibble,tibble) useDynLib(readr, .registration = TRUE) readr/LICENSE0000644000176200001440000000005314371264576012371 0ustar liggesusersYEAR: 2023 COPYRIGHT HOLDER: readr authors readr/README.md0000644000176200001440000002264114547547012012644 0ustar liggesusers # readr [![CRAN status](https://www.r-pkg.org/badges/version/readr)](https://CRAN.R-project.org/package=readr) [![R-CMD-check](https://github.com/tidyverse/readr/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/tidyverse/readr/actions/workflows/R-CMD-check.yaml) [![Codecov test coverage](https://codecov.io/gh/tidyverse/readr/branch/main/graph/badge.svg)](https://app.codecov.io/gh/tidyverse/readr?branch=main) ## Overview The goal of readr is to provide a fast and friendly way to read rectangular data from delimited files, such as comma-separated values (CSV) and tab-separated values (TSV). It is designed to parse many types of data found in the wild, while providing an informative problem report when parsing leads to unexpected results. If you are new to readr, the best place to start is the [data import chapter](https://r4ds.hadley.nz/data-import) in R for Data Science. ## Installation ``` r # The easiest way to get readr is to install the whole tidyverse: install.packages("tidyverse") # Alternatively, install just readr: install.packages("readr") ```
``` r # Or you can install the development version from GitHub: # install.packages("pak") pak::pak("tidyverse/readr") ```
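After installing, you can check which version you ended up with and which parsing edition it uses (see the Editions section below). A quick sanity check, assuming the installation succeeded:

``` r
packageVersion("readr")  # e.g. 2.x.y, which uses second-edition parsing by default
readr::edition_get()     # reports the parsing edition currently in use
```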
## Cheatsheet thumbnail of tidyverse data import cheatsheet ## Usage readr is part of the core tidyverse, so you can load it with: ``` r library(tidyverse) #> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ── #> ✔ dplyr 1.1.4 ✔ readr 2.1.4.9000 #> ✔ forcats 1.0.0 ✔ stringr 1.5.1 #> ✔ ggplot2 3.4.3 ✔ tibble 3.2.1 #> ✔ lubridate 1.9.3 ✔ tidyr 1.3.0 #> ✔ purrr 1.0.2 #> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ── #> ✖ dplyr::filter() masks stats::filter() #> ✖ dplyr::lag() masks stats::lag() #> ℹ Use the conflicted package () to force all conflicts to become errors ``` Of course, you can also load readr as an individual package: ``` r library(readr) ``` To read a rectangular dataset with readr, you combine two pieces: a function that parses the lines of the file into individual fields and a column specification. readr supports the following file formats with these `read_*()` functions: - `read_csv()`: comma-separated values (CSV) - `read_tsv()`: tab-separated values (TSV) - `read_csv2()`: semicolon-separated values with `,` as the decimal mark - `read_delim()`: delimited files (CSV and TSV are important special cases) - `read_fwf()`: fixed-width files - `read_table()`: whitespace-separated files - `read_log()`: web log files A column specification describes how each column should be converted from a character vector to a specific data type (e.g. character, numeric, datetime, etc.). In the absence of a column specification, readr will guess column types from the data. `vignette("column-types")` gives more detail on how readr guesses the column types. Column type guessing is very handy, especially during data exploration, but it’s important to remember these are *just guesses*. As any data analysis project matures past the exploratory phase, the best strategy is to provide explicit column types. The following example loads a sample file bundled with readr and guesses the column types: ``` r (chickens <- read_csv(readr_example("chickens.csv"))) #> Rows: 5 Columns: 4 #> ── Column specification ──────────────────────────────────────────────────────── #> Delimiter: "," #> chr (3): chicken, sex, motto #> dbl (1): eggs_laid #> #> ℹ Use `spec()` to retrieve the full column specification for this data. #> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message. #> # A tibble: 5 × 4 #> chicken sex eggs_laid motto #> #> 1 Foghorn Leghorn rooster 0 That's a joke, ah say, that's a jok… #> 2 Chicken Little hen 3 The sky is falling! #> 3 Ginger hen 12 Listen. We'll either die free chick… #> 4 Camilla the Chicken hen 7 Bawk, buck, ba-gawk. #> 5 Ernie The Giant Chicken rooster 0 Put Captain Solo in the cargo hold. ``` Note that readr prints the column types – the *guessed* column types, in this case. This is useful because it allows you to check that the columns have been read in as you expect. If they haven’t, that means you need to provide the column specification. This sounds like a lot of trouble, but luckily readr affords a nice workflow for this. Use `spec()` to retrieve the (guessed) column specification from your initial effort. ``` r spec(chickens) #> cols( #> chicken = col_character(), #> sex = col_character(), #> eggs_laid = col_double(), #> motto = col_character() #> ) ``` Now you can copy, paste, and tweak this, to create a more explicit readr call that expresses the desired column types. 
Here we express that `sex` should be a factor with levels `rooster` and `hen`, in that order, and that `eggs_laid` should be integer. ``` r chickens <- read_csv( readr_example("chickens.csv"), col_types = cols( chicken = col_character(), sex = col_factor(levels = c("rooster", "hen")), eggs_laid = col_integer(), motto = col_character() ) ) chickens #> # A tibble: 5 × 4 #> chicken sex eggs_laid motto #> #> 1 Foghorn Leghorn rooster 0 That's a joke, ah say, that's a jok… #> 2 Chicken Little hen 3 The sky is falling! #> 3 Ginger hen 12 Listen. We'll either die free chick… #> 4 Camilla the Chicken hen 7 Bawk, buck, ba-gawk. #> 5 Ernie The Giant Chicken rooster 0 Put Captain Solo in the cargo hold. ``` `vignette("readr")` gives an expanded introduction to readr. ## Editions readr got a new parsing engine in version 2.0.0 (released July 2021). In this so-called second edition, readr calls `vroom::vroom()`, by default. The parsing engine in readr versions prior to 2.0.0 is now called the first edition. If you’re using readr \>= 2.0.0, you can still access first edition parsing via the functions `with_edition(1, ...)` and `local_edition(1)`. And, obviously, if you’re using readr \< 2.0.0, you will get first edition parsing, by definition, because that’s all there is. We will continue to support the first edition for a number of releases, but the overall goal is to make the second edition uniformly better than the first. Therefore the plan is to eventually deprecate and then remove the first edition code. New code and actively-maintained code should use the second edition. The workarounds `with_edition(1, ...)` and `local_edition(1)` are offered as a pragmatic way to patch up legacy code or as a temporary solution for infelicities identified as the second edition matures. ## Alternatives There are two main alternatives to readr: base R and data.table’s `fread()`. The most important differences are discussed below. ### Base R Compared to the corresponding base functions, readr functions: - Use a consistent naming scheme for the parameters (e.g. `col_names` and `col_types` not `header` and `colClasses`). - Are generally much faster (up to 10x-100x) depending on the dataset. - Leave strings as is by default, and automatically parse common date/time formats. - Have a helpful progress bar if loading is going to take a while. - All functions work exactly the same way regardless of the current locale. To override the US-centric defaults, use `locale()`. ### data.table and `fread()` [data.table](https://github.com/Rdatatable/data.table) has a function similar to `read_csv()` called `fread()`. Compared to `fread()`, readr functions: - Are sometimes slower, particularly on numeric heavy data. - Can automatically guess some parameters, but basically encourage explicit specification of, e.g., the delimiter, skipped rows, and the header row. - Follow tidyverse-wide conventions, such as returning a tibble, a standard approach for column name repair, and a common mini-language for column selection. ## Acknowledgements Thanks to: - [Joe Cheng](https://github.com/jcheng5) for showing me the beauty of deterministic finite automata for parsing, and for teaching me why I should write a tokenizer. - [JJ Allaire](https://github.com/jjallaire) for helping me come up with a design that makes very few copies, and is easy to extend. - [Dirk Eddelbuettel](http://dirk.eddelbuettel.com) for coming up with the name! 
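As a concrete companion to the Editions section above, here is a minimal sketch of switching parsing engines. It uses only functions mentioned in this README plus `I()`, which marks a string as literal data:

``` r
library(readr)

csv <- I("x,y\n1,2\n3,4")

# Second edition (the default in readr >= 2.0.0), backed by vroom
read_csv(csv)

# First edition parsing for a single call
with_edition(1, read_csv(csv))

# Or first edition for the rest of the current scope,
# e.g. inside a function or test
local_edition(1)
read_csv(csv)
```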
readr/man/0000755000176200001440000000000014510343737012131 5ustar liggesusersreadr/man/cols.Rd0000644000176200001440000000471214174704674013373 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/col_types.R \name{cols} \alias{cols} \alias{cols_only} \title{Create column specification} \usage{ cols(..., .default = col_guess()) cols_only(...) } \arguments{ \item{...}{Either column objects created by \verb{col_*()}, or their abbreviated character names (as described in the \code{col_types} argument of \code{\link[=read_delim]{read_delim()}}). If you're only overriding a few columns, it's best to refer to columns by name. If not named, the column types must match the column names exactly.} \item{.default}{Any named columns not explicitly overridden in \code{...} will be read with this column type.} } \description{ \code{cols()} includes all columns in the input data, guessing the column types as the default. \code{cols_only()} includes only the columns you explicitly specify, skipping the rest. In general you can substitute \code{list()} for \code{cols()} without changing the behavior. } \details{ The available specifications are: (with string abbreviations in brackets) \itemize{ \item \code{col_logical()} [l], containing only \code{T}, \code{F}, \code{TRUE} or \code{FALSE}. \item \code{col_integer()} [i], integers. \item \code{col_double()} [d], doubles. \item \code{col_character()} [c], everything else. \item \code{col_factor(levels, ordered)} [f], a fixed set of values. \item \code{col_date(format = "")} [D]: with the locale's \code{date_format}. \item \code{col_time(format = "")} [t]: with the locale's \code{time_format}. \item \code{col_datetime(format = "")} [T]: ISO8601 date times \item \code{col_number()} [n], numbers containing the \code{grouping_mark} \item \code{col_skip()} [_, -], don't import this column. \item \code{col_guess()} [?], parse using the "best" type based on the input. 
} } \examples{ cols(a = col_integer()) cols_only(a = col_integer()) # You can also use the standard abbreviations cols(a = "i") cols(a = "i", b = "d", c = "_") # You can also use multiple sets of column definitions by combining # them like so: t1 <- cols( column_one = col_integer(), column_two = col_number() ) t2 <- cols( column_three = col_character() ) t3 <- t1 t3$cols <- c(t1$cols, t2$cols) t3 } \seealso{ Other parsers: \code{\link{col_skip}()}, \code{\link{cols_condense}()}, \code{\link{parse_datetime}()}, \code{\link{parse_factor}()}, \code{\link{parse_guess}()}, \code{\link{parse_logical}()}, \code{\link{parse_number}()}, \code{\link{parse_vector}()} } \concept{parsers} readr/man/melt_delim_chunked.Rd0000644000176200001440000001206514304131171016224 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/melt_delim_chunked.R \name{melt_delim_chunked} \alias{melt_delim_chunked} \alias{melt_csv_chunked} \alias{melt_csv2_chunked} \alias{melt_tsv_chunked} \title{Melt a delimited file by chunks} \usage{ melt_delim_chunked( file, callback, chunk_size = 10000, delim, quote = "\\"", escape_backslash = FALSE, escape_double = TRUE, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = FALSE, skip = 0, progress = show_progress(), skip_empty_rows = FALSE ) melt_csv_chunked( file, callback, chunk_size = 10000, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, progress = show_progress(), skip_empty_rows = FALSE ) melt_csv2_chunked( file, callback, chunk_size = 10000, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, progress = show_progress(), skip_empty_rows = FALSE ) melt_tsv_chunked( file, callback, chunk_size = 10000, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, progress = show_progress(), skip_empty_rows = FALSE ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{callback}{A callback function to call on each chunk} \item{chunk_size}{The number of rows to include in each chunk} \item{delim}{Single character used to separate fields within a record.} \item{quote}{Single character used to quote strings.} \item{escape_backslash}{Does the file use backslashes to escape special characters? This is more general than \code{escape_double} as backslashes can be used to escape the delimiter character, the quote character, or to add special characters like \verb{\\\\n}.} \item{escape_double}{Does the file escape quotes by doubling them? i.e. If this option is \code{TRUE}, the value \verb{""""} represents a single quote, \verb{\\"}.} \item{locale}{The locale controls defaults that vary from place to place. 
The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.}

\item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.}

\item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values inside quotes be treated as missing values (the default) or strings. This parameter is soft deprecated as of readr 2.0.0.}

\item{comment}{A string used to identify comments. Any text after the comment characters will be silently ignored.}

\item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?}

\item{skip}{Number of lines to skip before reading data. If \code{comment} is supplied any commented lines are ignored \emph{after} skipping.}

\item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.}

\item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.}
}
\description{
For certain non-rectangular data formats, it can be useful to parse the data into a melted format where each row represents a single token.
}
\details{
\code{melt_delim_chunked()} and the specialisations \code{melt_csv_chunked()}, \code{melt_csv2_chunked()} and \code{melt_tsv_chunked()} read files by a chunk of rows at a time, executing a given function on one chunk before reading the next.
}
\examples{
# Keep only the tokens whose guessed data type is "integer"
f <- function(x, pos) subset(x, data_type == "integer")
melt_csv_chunked(readr_example("mtcars.csv"), DataFrameCallback$new(f),
  chunk_size = 5
)
}
\seealso{
Other chunked: \code{\link{callback}}, \code{\link{read_delim_chunked}()}, \code{\link{read_lines_chunked}()}
}
\concept{chunked}
\keyword{internal}
readr/man/parse_atomic.Rd0000644000176200001440000000426414174704674015103 0ustar liggesusers% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/collectors.R
\name{parse_atomic}
\alias{parse_logical}
\alias{parse_integer}
\alias{parse_double}
\alias{parse_character}
\alias{col_logical}
\alias{col_integer}
\alias{col_double}
\alias{col_character}
\title{Parse logicals, integers, and reals}
\usage{
parse_logical(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE)

parse_integer(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE)

parse_double(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE)

parse_character(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE)

col_logical()

col_integer()

col_double()

col_character()
}
\arguments{
\item{x}{Character vector of values to parse.}

\item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.}

\item{locale}{The locale controls defaults that vary from place to place.
The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} } \description{ Use \verb{parse_*()} if you have a character vector you want to parse. Use \verb{col_*()} in conjunction with a \verb{read_*()} function to parse the values as they're read in. } \examples{ parse_integer(c("1", "2", "3")) parse_double(c("1", "2", "3.123")) parse_number("$1,123,456.00") # Use locale to override default decimal and grouping marks es_MX <- locale("es", decimal_mark = ",") parse_number("$1.123.456,00", locale = es_MX) # Invalid values are replaced with missing values with a warning. x <- c("1", "2", "3", "-") parse_double(x) # Or flag values as missing parse_double(x, na = "-") } \seealso{ Other parsers: \code{\link{col_skip}()}, \code{\link{cols_condense}()}, \code{\link{cols}()}, \code{\link{parse_datetime}()}, \code{\link{parse_factor}()}, \code{\link{parse_guess}()}, \code{\link{parse_number}()}, \code{\link{parse_vector}()} } \concept{parsers} readr/man/spec_delim.Rd0000644000176200001440000002610614510343737014531 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read_delim.R, R/read_table.R \name{spec_delim} \alias{spec_delim} \alias{spec_csv} \alias{spec_csv2} \alias{spec_tsv} \alias{spec_table} \title{Generate a column specification} \usage{ spec_delim( file, delim = NULL, quote = "\\"", escape_backslash = FALSE, escape_double = TRUE, col_names = TRUE, col_types = list(), col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = FALSE, skip = 0, n_max = 0, guess_max = 1000, name_repair = "unique", num_threads = readr_threads(), progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy() ) spec_csv( file, col_names = TRUE, col_types = list(), col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = 0, guess_max = 1000, name_repair = "unique", num_threads = readr_threads(), progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy() ) spec_csv2( file, col_names = TRUE, col_types = list(), col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = 0, guess_max = 1000, progress = show_progress(), name_repair = "unique", num_threads = readr_threads(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy() ) spec_tsv( file, col_names = TRUE, col_types = list(), col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = 0, guess_max = 1000, progress = show_progress(), name_repair = "unique", num_threads = readr_threads(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy() ) spec_table( file, col_names = TRUE, col_types = list(), locale = default_locale(), na = "NA", skip = 0, n_max = 0, guess_max = 1000, progress = show_progress(), comment = "", show_col_types = should_show_types(), skip_empty_rows = TRUE ) } \arguments{ \item{file}{Either a path to 
a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{delim}{Single character used to separate fields within a record.} \item{quote}{Single character used to quote strings.} \item{escape_backslash}{Does the file use backslashes to escape special characters? This is more general than \code{escape_double} as backslashes can be used to escape the delimiter character, the quote character, or to add special characters like \verb{\\\\n}.} \item{escape_double}{Does the file escape quotes by doubling them? i.e. If this option is \code{TRUE}, the value \verb{""""} represents a single quote, \verb{\\"}.} \item{col_names}{Either \code{TRUE}, \code{FALSE} or a character vector of column names. If \code{TRUE}, the first row of the input will be used as the column names, and will not be included in the data frame. If \code{FALSE}, column names will be generated automatically: X1, X2, X3 etc. If \code{col_names} is a character vector, the values will be used as the names of the columns, and the first row of the input will be read into the first row of the output data frame. Missing (\code{NA}) column names will generate a warning, and be filled in with dummy names \code{...1}, \code{...2} etc. Duplicate column names will generate a warning and be made unique, see \code{name_repair} to control how this is done.} \item{col_types}{One of \code{NULL}, a \code{\link[=cols]{cols()}} specification, or a string. See \code{vignette("readr")} for more details. If \code{NULL}, all column types will be inferred from \code{guess_max} rows of the input, interspersed throughout the file. This is convenient (and fast), but not robust. If the guessed types are wrong, you'll need to increase \code{guess_max} or supply the correct types yourself. Column specifications created by \code{\link[=list]{list()}} or \code{\link[=cols]{cols()}} must contain one column specification for each column. If you only want to read a subset of the columns, use \code{\link[=cols_only]{cols_only()}}. Alternatively, you can use a compact string representation where each character represents one column: \itemize{ \item c = character \item i = integer \item n = number \item d = double \item l = logical \item f = factor \item D = date \item T = date time \item t = time \item ? = guess \item _ or - = skip } By default, reading a file without a column specification will print a message showing what \code{readr} guessed they were. To remove this message, set \code{show_col_types = FALSE} or set \code{options(readr.show_col_types = FALSE)}.} \item{col_select}{Columns to include in the results. You can use the same mini-language as \code{dplyr::select()} to refer to the columns by name. Use \code{c()} to use more than one selection expression. Although this usage is less common, \code{col_select} also accepts a numeric column index. 
See \code{\link[tidyselect:language]{?tidyselect::language}} for full details on the selection language.} \item{id}{The name of a column in which to store the file path. This is useful when reading multiple input files and there is data in the file paths, such as the data collection date. If \code{NULL} (the default) no extra column is created.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values inside quotes be treated as missing values (the default) or strings. This parameter is soft deprecated as of readr 2.0.0.} \item{comment}{A string used to identify comments. Any text after the comment characters will be silently ignored.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{skip}{Number of lines to skip before reading data. If \code{comment} is supplied any commented lines are ignored \emph{after} skipping.} \item{n_max}{Maximum number of lines to read.} \item{guess_max}{Maximum number of lines to use for guessing column types. Will never use more than the number of lines read. See \code{vignette("column-types", package = "readr")} for more details.} \item{name_repair}{Handling of column names. The default behaviour is to ensure column names are \code{"unique"}. Various repair strategies are supported: \itemize{ \item \code{"minimal"}: No name repair or checks, beyond basic existence of names. \item \code{"unique"} (default value): Make sure names are unique and not empty. \item \code{"check_unique"}: No name repair, but check they are \code{unique}. \item \code{"unique_quiet"}: Repair with the \code{unique} strategy, quietly. \item \code{"universal"}: Make the names \code{unique} and syntactic. \item \code{"universal_quiet"}: Repair with the \code{universal} strategy, quietly. \item A function: Apply custom name repair (e.g., \code{name_repair = make.names} for names in the style of base R). \item A purrr-style anonymous function, see \code{\link[rlang:as_function]{rlang::as_function()}}. } This argument is passed on as \code{repair} to \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}}. See there for more details on these terms and the strategies used to enforce them.} \item{num_threads}{The number of processing threads to use for initial parsing and lazy reading of data. If your data contains newlines within fields the parser should automatically detect this and fall back to using one thread only. However if you know your file has newlines within quoted fields it is safest to set \code{num_threads = 1} explicitly.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{show_col_types}{If \code{FALSE}, do not show the guessed column types. If \code{TRUE} always show the column types, even if they are supplied. 
If \code{NULL} (the default) only show the column types if they are not explicitly supplied by the \code{col_types} argument.}

\item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.}

\item{lazy}{Read values lazily? By default, this is \code{FALSE}, because there are special considerations when reading a file lazily that have tripped up some users. Specifically, things get tricky when reading and then writing back into the same file. But, in general, lazy reading (\code{lazy = TRUE}) has many benefits, especially for interactive use and when your downstream work only involves a subset of the rows or columns. Learn more in \code{\link[=should_read_lazy]{should_read_lazy()}} and in the documentation for the \code{altrep} argument of \code{\link[vroom:vroom]{vroom::vroom()}}.}
}
\value{
The \code{col_spec} generated for the file.
}
\description{
When printed, only the first 20 columns are shown by default. To change this, set the \code{readr.num_columns} option via \code{options()} (a value of 0 turns off printing).
}
\examples{
# Input sources -------------------------------------------------------------
# Retrieve specs from a path
spec_csv(system.file("extdata/mtcars.csv", package = "readr"))
spec_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))

# Or directly from a string (must contain a newline)
spec_csv(I("x,y\n1,2\n3,4"))

# Column types --------------------------------------------------------------
# By default, readr guesses the column types, looking at 1000 rows
# throughout the file.
# You can specify the number of rows used with guess_max.
spec_csv(system.file("extdata/mtcars.csv", package = "readr"), guess_max = 20)
}
readr/man/read_lines_chunked.Rd0000644000176200001440000000445314174704674016237 0ustar liggesusers% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/read_lines_chunked.R
\name{read_lines_chunked}
\alias{read_lines_chunked}
\alias{read_lines_raw_chunked}
\title{Read lines from a file or string by chunk.}
\usage{
read_lines_chunked(
  file,
  callback,
  chunk_size = 10000,
  skip = 0,
  locale = default_locale(),
  na = character(),
  progress = show_progress()
)

read_lines_raw_chunked(
  file,
  callback,
  chunk_size = 10000,
  skip = 0,
  progress = show_progress()
)
}
\arguments{
\item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.}

\item{callback}{A callback function to call on each chunk}

\item{chunk_size}{The number of rows to include in each chunk}

\item{skip}{Number of lines to skip before reading data.}

\item{locale}{The locale controls defaults that vary from place to place.
The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.}

\item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.}

\item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.}
}
\description{
Read lines from a file or string by chunk.
}
\seealso{
Other chunked: \code{\link{callback}}, \code{\link{melt_delim_chunked}()}, \code{\link{read_delim_chunked}()}
}
\concept{chunked}
\keyword{internal}
readr/man/parse_guess.Rd0000644000176200001440000000410714174704674014751 0ustar liggesusers% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/collectors.R
\name{parse_guess}
\alias{parse_guess}
\alias{col_guess}
\alias{guess_parser}
\title{Parse using the "best" type}
\usage{
parse_guess(
  x,
  na = c("", "NA"),
  locale = default_locale(),
  trim_ws = TRUE,
  guess_integer = FALSE
)

col_guess()

guess_parser(
  x,
  locale = default_locale(),
  guess_integer = FALSE,
  na = c("", "NA")
)
}
\arguments{
\item{x}{Character vector of values to parse.}

\item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.}

\item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.}

\item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?}

\item{guess_integer}{If \code{TRUE}, guess integer types for whole numbers, if \code{FALSE} guess numeric type for all numbers.}
}
\description{
\code{parse_guess()} returns the parsed vector; \code{guess_parser()} returns the name of the parser. These functions use a number of heuristics to determine which type of vector is "best". Generally they try to err on the side of safety, as it's straightforward to override the parsing choice if needed.
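For instance, the \code{guess_integer} switch described above only changes how whole numbers are classified (a small sketch; the expected results follow from the argument's description):

\preformatted{guess_parser("123")                        # expected "double" (guess_integer = FALSE)
guess_parser("123", guess_integer = TRUE)  # expected "integer"
}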
} \examples{ # Logical vectors parse_guess(c("FALSE", "TRUE", "F", "T")) # Integers and doubles parse_guess(c("1", "2", "3")) parse_guess(c("1.6", "2.6", "3.4")) # Numbers containing grouping mark guess_parser("1,234,566") parse_guess("1,234,566") # ISO 8601 date times guess_parser(c("2010-10-10")) parse_guess(c("2010-10-10")) } \seealso{ Other parsers: \code{\link{col_skip}()}, \code{\link{cols_condense}()}, \code{\link{cols}()}, \code{\link{parse_datetime}()}, \code{\link{parse_factor}()}, \code{\link{parse_logical}()}, \code{\link{parse_number}()}, \code{\link{parse_vector}()} } \concept{parsers} readr/man/format_delim.Rd0000644000176200001440000001111214304131311015036 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/write.R \name{format_delim} \alias{format_delim} \alias{format_csv} \alias{format_csv2} \alias{format_tsv} \title{Convert a data frame to a delimited string} \usage{ format_delim( x, delim, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\\n", quote_escape = deprecated() ) format_csv( x, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\\n", quote_escape = deprecated() ) format_csv2( x, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\\n", quote_escape = deprecated() ) format_tsv( x, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\\n", quote_escape = deprecated() ) } \arguments{ \item{x}{A data frame.} \item{delim}{Delimiter used to separate values. Defaults to \code{" "} for \code{write_delim()}, \code{","} for \code{write_excel_csv()} and \code{";"} for \code{write_excel_csv2()}. Must be a single character.} \item{na}{String used for missing values. Defaults to NA. Missing values will never be quoted; strings with the same value as \code{na} will always be quoted.} \item{append}{If \code{FALSE}, will overwrite existing file. If \code{TRUE}, will append to existing file. In both cases, if the file does not exist a new file is created.} \item{col_names}{If \code{FALSE}, column names will not be included at the top of the file. If \code{TRUE}, column names will be included. If not specified, \code{col_names} will take the opposite value given to \code{append}.} \item{quote}{How to handle fields which contain characters that need to be quoted. \itemize{ \item \code{needed} - Values are only quoted if needed: if they contain a delimiter, quote, or newline. \item \code{all} - Quote all fields. \item \code{none} - Never quote fields. }} \item{escape}{The type of escape to use when quotes are in the data. \itemize{ \item \code{double} - quotes are escaped by doubling them. \item \code{backslash} - quotes are escaped by a preceding backslash. \item \code{none} - quotes are not escaped. }} \item{eol}{The end of line character to use. Most commonly either \code{"\\n"} for Unix style newlines, or \code{"\\r\\n"} for Windows style newlines.} \item{quote_escape}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Use the \code{escape} argument instead.} } \value{ A string. 
} \description{ These functions are equivalent to \code{\link[=write_csv]{write_csv()}} etc., but instead of writing to disk, they return a string. } \section{Output}{ Factors are coerced to character. Doubles are formatted to a decimal string using the grisu3 algorithm. \code{POSIXct} values are formatted as ISO8601 with a UTC timezone \emph{Note: \code{POSIXct} objects in local or non-UTC timezones will be converted to UTC time before writing.} All columns are encoded as UTF-8. \code{write_excel_csv()} and \code{write_excel_csv2()} also include a \href{https://en.wikipedia.org/wiki/Byte_order_mark}{UTF-8 Byte order mark} which indicates to Excel the csv is UTF-8 encoded. \code{write_excel_csv2()} and \code{write_csv2} were created to allow users with different locale settings to save .csv files using their default settings (e.g. \verb{;} as the column separator and \verb{,} as the decimal separator). This is common in some European countries. Values are only quoted if they contain a comma, quote or newline. The \verb{write_*()} functions will automatically compress outputs if an appropriate extension is given. Three extensions are currently supported: \code{.gz} for gzip compression, \code{.bz2} for bzip2 compression and \code{.xz} for lzma compression. See the examples for more information. } \examples{ # format_()* functions are useful for testing and reprexes cat(format_csv(mtcars)) cat(format_tsv(mtcars)) cat(format_delim(mtcars, ";")) # Specifying missing values df <- data.frame(x = c(1, NA, 3)) format_csv(df, na = "missing") # Quotes are automatically added as needed df <- data.frame(x = c("a ", '"', ",", "\n")) cat(format_csv(df)) } \references{ Florian Loitsch, Printing Floating-Point Numbers Quickly and Accurately with Integers, PLDI '10, \url{http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf} } readr/man/write_delim.Rd0000644000176200001440000001542314304131311014711 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/write.R \name{write_delim} \alias{write_delim} \alias{write_csv} \alias{write_csv2} \alias{write_excel_csv} \alias{write_excel_csv2} \alias{write_tsv} \title{Write a data frame to a delimited file} \usage{ write_delim( x, file, delim = " ", na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated() ) write_csv( x, file, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated() ) write_csv2( x, file, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated() ) write_excel_csv( x, file, na = "NA", append = FALSE, col_names = !append, delim = ",", quote = "all", escape = c("double", "backslash", "none"), eol = "\\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated() ) write_excel_csv2( x, file, na = "NA", append = FALSE, col_names = !append, delim = ";", quote = "all", escape = c("double", "backslash", "none"), eol = "\\n", num_threads = readr_threads(), progress = show_progress(), path = 
deprecated(),
  quote_escape = deprecated()
)

write_tsv(
  x,
  file,
  na = "NA",
  append = FALSE,
  col_names = !append,
  quote = "none",
  escape = c("double", "backslash", "none"),
  eol = "\\n",
  num_threads = readr_threads(),
  progress = show_progress(),
  path = deprecated(),
  quote_escape = deprecated()
)
}
\arguments{
\item{x}{A data frame or tibble to write to disk.}

\item{file}{File or connection to write to.}

\item{delim}{Delimiter used to separate values. Defaults to \code{" "} for \code{write_delim()}, \code{","} for \code{write_excel_csv()} and \code{";"} for \code{write_excel_csv2()}. Must be a single character.}

\item{na}{String used for missing values. Defaults to NA. Missing values will never be quoted; strings with the same value as \code{na} will always be quoted.}

\item{append}{If \code{FALSE}, will overwrite existing file. If \code{TRUE}, will append to existing file. In both cases, if the file does not exist a new file is created.}

\item{col_names}{If \code{FALSE}, column names will not be included at the top of the file. If \code{TRUE}, column names will be included. If not specified, \code{col_names} will take the opposite value given to \code{append}.}

\item{quote}{How to handle fields which contain characters that need to be quoted.
\itemize{
\item \code{needed} - Values are only quoted if needed: if they contain a delimiter, quote, or newline.
\item \code{all} - Quote all fields.
\item \code{none} - Never quote fields.
}}

\item{escape}{The type of escape to use when quotes are in the data.
\itemize{
\item \code{double} - quotes are escaped by doubling them.
\item \code{backslash} - quotes are escaped by a preceding backslash.
\item \code{none} - quotes are not escaped.
}}

\item{eol}{The end of line character to use. Most commonly either \code{"\\n"} for Unix style newlines, or \code{"\\r\\n"} for Windows style newlines.}

\item{num_threads}{Number of threads to use when reading and materializing vectors. If your data contains newlines within fields the parser will automatically be forced to use a single thread only.}

\item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The display is updated every 50,000 values and will only display if estimated reading time is 5 seconds or more. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.}

\item{path}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Use the \code{file} argument instead.}

\item{quote_escape}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Use the \code{escape} argument instead.}
}
\value{
\verb{write_*()} returns the input \code{x} invisibly.
}
\description{
The \verb{write_*()} family of functions are an improvement to analogous functions such as \code{\link[=write.csv]{write.csv()}} because they are approximately twice as fast. Unlike \code{\link[=write.csv]{write.csv()}}, these functions do not include row names as a column in the written file. A generic function, \code{output_column()}, is applied to each variable to coerce columns to suitable output.
}
\section{Output}{
Factors are coerced to character. Doubles are formatted to a decimal string using the grisu3 algorithm.
\code{POSIXct} values are formatted as ISO8601 with a UTC timezone \emph{Note: \code{POSIXct} objects in local or non-UTC timezones will be converted to UTC time before writing.} All columns are encoded as UTF-8. \code{write_excel_csv()} and \code{write_excel_csv2()} also include a \href{https://en.wikipedia.org/wiki/Byte_order_mark}{UTF-8 Byte order mark} which indicates to Excel the csv is UTF-8 encoded. \code{write_excel_csv2()} and \code{write_csv2} were created to allow users with different locale settings to save .csv files using their default settings (e.g. \verb{;} as the column separator and \verb{,} as the decimal separator). This is common in some European countries. Values are only quoted if they contain a comma, quote or newline. The \verb{write_*()} functions will automatically compress outputs if an appropriate extension is given. Three extensions are currently supported: \code{.gz} for gzip compression, \code{.bz2} for bzip2 compression and \code{.xz} for lzma compression. See the examples for more information. } \examples{ \dontshow{ .old_wd <- setwd(tempdir()) } # If only a file name is specified, write_()* will write # the file to the current working directory. write_csv(mtcars, "mtcars.csv") write_tsv(mtcars, "mtcars.tsv") # If you add an extension to the file name, write_()* will # automatically compress the output. write_tsv(mtcars, "mtcars.tsv.gz") write_tsv(mtcars, "mtcars.tsv.bz2") write_tsv(mtcars, "mtcars.tsv.xz") \dontshow{ setwd(.old_wd) } } \references{ Florian Loitsch, Printing Floating-Point Numbers Quickly and Accurately with Integers, PLDI '10, \url{http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf} } readr/man/problems.Rd0000644000176200001440000000215114462256076014247 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/problems.R \name{problems} \alias{problems} \alias{stop_for_problems} \title{Retrieve parsing problems} \usage{ problems(x = .Last.value) stop_for_problems(x) } \arguments{ \item{x}{A data frame (from \verb{read_*()}) or a vector (from \verb{parse_*()}).} } \value{ A data frame with one row for each problem and four columns: \item{row,col}{Row and column of problem} \item{expected}{What readr expected to find} \item{actual}{What it actually got} } \description{ Readr functions will only throw an error if parsing fails in an unrecoverable way. However, there are lots of potential problems that you might want to know about - these are stored in the \code{problems} attribute of the output, which you can easily access with this function. \code{stop_for_problems()} will throw an error if there are any parsing problems: this is useful for automated scripts where you want to throw an error as soon as you encounter a problem. 
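As a sketch of that scripted use, with the same failing input as the example below:

\preformatted{x <- parse_integer(c("1X", "blah", "3"))  # two values cannot be parsed
stop_for_problems(x)                      # throws an error instead of continuing
}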
} \examples{ x <- parse_integer(c("1X", "blah", "3")) problems(x) y <- parse_integer(c("1", "2", "3")) problems(y) } readr/man/melt_delim.Rd0000644000176200001440000001456614304131171014533 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/melt_delim.R \name{melt_delim} \alias{melt_delim} \alias{melt_csv} \alias{melt_csv2} \alias{melt_tsv} \title{Return melted data for each token in a delimited file (including csv & tsv)} \usage{ melt_delim( file, delim, quote = "\\"", escape_backslash = FALSE, escape_double = TRUE, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = FALSE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE ) melt_csv( file, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE ) melt_csv2( file, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE ) melt_tsv( file, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{delim}{Single character used to separate fields within a record.} \item{quote}{Single character used to quote strings.} \item{escape_backslash}{Does the file use backslashes to escape special characters? This is more general than \code{escape_double} as backslashes can be used to escape the delimiter character, the quote character, or to add special characters like \verb{\\\\n}.} \item{escape_double}{Does the file escape quotes by doubling them? i.e. If this option is \code{TRUE}, the value \verb{""""} represents a single quote, \verb{\\"}.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values inside quotes be treated as missing values (the default) or strings. This parameter is soft deprecated as of readr 2.0.0.} \item{comment}{A string used to identify comments. 
Any text after the comment characters will be silently ignored.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{skip}{Number of lines to skip before reading data. If \code{comment} is supplied any commented lines are ignored \emph{after} skipping.} \item{n_max}{Maximum number of lines to read.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} } \value{ A \code{\link[=tibble]{tibble()}} of four columns: \itemize{ \item \code{row}, the row that the token comes from in the original file \item \code{col}, the column that the token comes from in the original file \item \code{data_type}, the data type of the token, e.g. \code{"integer"}, \code{"character"}, \code{"date"}, guessed in a similar way to the \code{guess_parser()} function. \item \code{value}, the token itself as a character string, unchanged from its representation in the original file. } If there are parsing problems, a warning tells you how many, and you can retrieve the details with \code{\link[=problems]{problems()}}. } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} This function has been superseded in readr and moved to \href{https://r-lib.github.io/meltr/}{the meltr package}. } \details{ For certain non-rectangular data formats, it can be useful to parse the data into a melted format where each row represents a single token. \code{melt_csv()} and \code{melt_tsv()} are special cases of the general \code{melt_delim()}. They're useful for reading the most common types of flat file data, comma separated values and tab separated values, respectively. \code{melt_csv2()} uses \verb{;} for the field separator and \verb{,} for the decimal point. This is common in some European countries. } \examples{ # Input sources ------------------------------------------------------------- # Read from a path melt_csv(readr_example("mtcars.csv")) melt_csv(readr_example("mtcars.csv.zip")) melt_csv(readr_example("mtcars.csv.bz2")) \dontrun{ melt_csv("https://github.com/tidyverse/readr/raw/main/inst/extdata/mtcars.csv") } # Or directly from a string (must contain a newline) melt_csv("x,y\n1,2\n3,4") # To import empty cells as 'empty' rather than `NA` melt_csv("x,y\n,NA,\"\",''", na = "NA") # File types ---------------------------------------------------------------- melt_csv("a,b\n1.0,2.0") melt_csv2("a;b\n1,0;2,0") melt_tsv("a\tb\n1.0\t2.0") melt_delim("a|b\n1.0|2.0", delim = "|") } \seealso{ \code{\link[=read_delim]{read_delim()}} for the conventional way to read rectangular data from delimited files. 
} readr/man/read_builtin.Rd0000644000176200001440000000134014174704674015066 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read_builtin.R \name{read_builtin} \alias{read_builtin} \title{Read built-in object from package} \usage{ read_builtin(x, package = NULL) } \arguments{ \item{x}{Name (character string) of data set to read.} \item{package}{Name of package from which to find data set. By default, all attached packages are searched and then the 'data' subdirectory (if present) of the current working directory.} } \value{ An object of the built-in class of \code{x}. } \description{ Consistent wrapper around \code{\link[=data]{data()}} that forces the promise. This is also a stronger parallel to loading data from a file. } \examples{ read_builtin("mtcars", "datasets") } readr/man/date_names.Rd0000644000176200001440000000212514174704674014527 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/date-symbols.R \name{date_names} \alias{date_names} \alias{date_names_lang} \alias{date_names_langs} \title{Create or retrieve date names} \usage{ date_names(mon, mon_ab = mon, day, day_ab = day, am_pm = c("AM", "PM")) date_names_lang(language) date_names_langs() } \arguments{ \item{mon, mon_ab}{Full and abbreviated month names.} \item{day, day_ab}{Full and abbreviated week day names. Starts with Sunday.} \item{am_pm}{Names used for AM and PM.} \item{language}{A BCP 47 locale, made up of a language and a region, e.g. \code{"en"} for American English. See \code{date_names_langs()} for a complete list of available locales.} } \description{ When parsing dates, you often need to know how the days of the week and the months are represented as text. This pair of functions allows you to either create your own, or retrieve them from a standard list. The standard list is derived from ICU (\verb{http://site.icu-project.org}) via the stringi package. } \examples{ date_names_lang("en") date_names_lang("ko") date_names_lang("fr") } readr/man/parse_number.Rd0000644000176200001440000000415214315646511015103 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/collectors.R \name{parse_number} \alias{parse_number} \alias{col_number} \title{Parse numbers, flexibly} \usage{ parse_number(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) col_number() } \arguments{ \item{x}{Character vector of values to parse.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} } \value{ A numeric vector (double) of parsed numbers. } \description{ This parses the first number it finds, dropping any non-numeric characters before the first number and all characters after the first number. The grouping mark specified by the locale is ignored inside the number.
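For example, \code{parse_number("80\%")} keeps only the leading number and returns \code{80}; the trailing percent sign is dropped.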
} \examples{ ## These all return 1000 parse_number("$1,000") ## leading `$` and grouping character `,` ignored parse_number("euro1,000") ## leading non-numeric euro ignored parse_number("t1000t1000") ## only parses first number found parse_number("1,234.56") ## explicit locale specifying European grouping and decimal marks parse_number("1.234,56", locale = locale(decimal_mark = ",", grouping_mark = ".")) ## SI/ISO 31-0 standard spaces for number grouping parse_number("1 234.56", locale = locale(decimal_mark = ".", grouping_mark = " ")) ## Specifying strings for NAs parse_number(c("1", "2", "3", "NA")) parse_number(c("1", "2", "3", "NA", "Nothing"), na = c("NA", "Nothing")) } \seealso{ Other parsers: \code{\link{col_skip}()}, \code{\link{cols_condense}()}, \code{\link{cols}()}, \code{\link{parse_datetime}()}, \code{\link{parse_factor}()}, \code{\link{parse_guess}()}, \code{\link{parse_logical}()}, \code{\link{parse_vector}()} } \concept{parsers} readr/man/read_table2.Rd0000644000176200001440000000154014174704674014573 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read_table.R \name{read_table2} \alias{read_table2} \title{Read whitespace-separated columns into a tibble} \usage{ read_table2( file, col_names = TRUE, col_types = NULL, locale = default_locale(), na = "NA", skip = 0, n_max = Inf, guess_max = min(n_max, 1000), progress = show_progress(), comment = "", skip_empty_rows = TRUE ) } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This function is deprecated because we renamed it to \code{\link[=read_table]{read_table()}} and removed the old \code{read_table} function, which was too strict for most cases and was analogous to just using \code{read_fwf()}. } \keyword{internal} readr/man/melt_fwf.Rd0000644000176200001440000001042614174704674014235 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/melt_fwf.R \name{melt_fwf} \alias{melt_fwf} \title{Return melted data for each token in a fixed width file} \usage{ melt_fwf( file, col_positions, locale = default_locale(), na = c("", "NA"), comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{col_positions}{Column positions, as created by \code{\link[=fwf_empty]{fwf_empty()}}, \code{\link[=fwf_widths]{fwf_widths()}} or \code{\link[=fwf_positions]{fwf_positions()}}. To read in only selected fields, use \code{\link[=fwf_positions]{fwf_positions()}}. 
If the width of the last column is variable (a ragged fwf file), supply the last end position as NA.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{comment}{A string used to identify comments. Any text after the comment characters will be silently ignored.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{skip}{Number of lines to skip before reading data.} \item{n_max}{Maximum number of lines to read.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} This function has been superseded in readr and moved to \href{https://r-lib.github.io/meltr/}{the meltr package}. } \details{ For certain non-rectangular data formats, it can be useful to parse the data into a melted format where each row represents a single token. \code{melt_fwf()} parses each token of a fixed width file into a single row, but it still requires that each field is in the same position in every row of the source file. } \examples{ fwf_sample <- readr_example("fwf-sample.txt") cat(read_lines(fwf_sample)) # You can specify column positions in several ways: # 1. Guess based on position of empty columns melt_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) # 2. A vector of field widths melt_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) # 3. Paired vectors of start and end positions melt_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn"))) # 4. Named arguments with start and end positions melt_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42))) # 5. Named arguments with column widths melt_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) } \seealso{ \code{\link[=melt_table]{melt_table()}} to melt fixed width files where each column is separated by whitespace, and \code{\link[=read_fwf]{read_fwf()}} for the conventional way to read rectangular data from fixed width files. } readr/man/count_fields.Rd0000644000176200001440000000274214174704674015102 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/count_fields.R \name{count_fields} \alias{count_fields} \title{Count the number of fields in each line of a file} \usage{ count_fields(file, tokenizer, skip = 0, n_max = -1L) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed.
Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{tokenizer}{A tokenizer that specifies how to break the \code{file} up into fields, e.g., \code{\link[=tokenizer_csv]{tokenizer_csv()}}, \code{\link[=tokenizer_fwf]{tokenizer_fwf()}}} \item{skip}{Number of lines to skip before reading data.} \item{n_max}{Optionally, maximum number of rows to count fields for.} } \description{ This is useful for diagnosing problems with functions that fail to parse correctly. } \examples{ count_fields(readr_example("mtcars.csv"), tokenizer_csv()) } readr/man/read_log.Rd0000644000176200001440000001010114510343737014165 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read_log.R \name{read_log} \alias{read_log} \title{Read common/combined log file into a tibble} \usage{ read_log( file, col_names = FALSE, col_types = NULL, trim_ws = TRUE, skip = 0, n_max = Inf, show_col_types = should_show_types(), progress = show_progress() ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{col_names}{Either \code{TRUE}, \code{FALSE} or a character vector of column names. If \code{TRUE}, the first row of the input will be used as the column names, and will not be included in the data frame. If \code{FALSE}, column names will be generated automatically: X1, X2, X3 etc. If \code{col_names} is a character vector, the values will be used as the names of the columns, and the first row of the input will be read into the first row of the output data frame. Missing (\code{NA}) column names will generate a warning, and be filled in with dummy names \code{...1}, \code{...2} etc. Duplicate column names will generate a warning and be made unique, see \code{name_repair} to control how this is done.} \item{col_types}{One of \code{NULL}, a \code{\link[=cols]{cols()}} specification, or a string. See \code{vignette("readr")} for more details. If \code{NULL}, all column types will be inferred from \code{guess_max} rows of the input, interspersed throughout the file. This is convenient (and fast), but not robust. If the guessed types are wrong, you'll need to increase \code{guess_max} or supply the correct types yourself. Column specifications created by \code{\link[=list]{list()}} or \code{\link[=cols]{cols()}} must contain one column specification for each column. 
If you only want to read a subset of the columns, use \code{\link[=cols_only]{cols_only()}}. Alternatively, you can use a compact string representation where each character represents one column: \itemize{ \item c = character \item i = integer \item n = number \item d = double \item l = logical \item f = factor \item D = date \item T = date time \item t = time \item ? = guess \item _ or - = skip } By default, reading a file without a column specification will print a message showing what \code{readr} guessed they were. To remove this message, set \code{show_col_types = FALSE} or set \code{options(readr.show_col_types = FALSE)}.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{skip}{Number of lines to skip before reading data. If \code{comment} is supplied any commented lines are ignored \emph{after} skipping.} \item{n_max}{Maximum number of lines to read.} \item{show_col_types}{If \code{FALSE}, do not show the guessed column types. If \code{TRUE} always show the column types, even if they are supplied. If \code{NULL} (the default) only show the column types if they are not explicitly supplied by the \code{col_types} argument.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} } \description{ This is a fairly standard format for log files - it uses both quotes and square brackets for quoting, and there may be literal quotes embedded in a quoted string. The dash, "-", is used for missing values. } \examples{ read_log(readr_example("example.log")) } readr/man/read_rds.Rd0000644000176200001440000000332414304131171014173 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/rds.R \name{read_rds} \alias{read_rds} \alias{write_rds} \title{Read/write RDS files.} \usage{ read_rds(file, refhook = NULL) write_rds( x, file, compress = c("none", "gz", "bz2", "xz"), version = 2, refhook = NULL, text = FALSE, path = deprecated(), ... ) } \arguments{ \item{file}{The file path to read from/write to.} \item{refhook}{A function to handle reference objects.} \item{x}{R object to serialise.} \item{compress}{Compression method to use: "none", "gz", "bz2", or "xz".} \item{version}{Serialization format version to be used. The default value is 2 as it's compatible with R versions prior to 3.5.0. See \code{\link[base:readRDS]{base::saveRDS()}} for more details.} \item{text}{If \code{TRUE} a text representation is used, otherwise a binary representation is used.} \item{path}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Use the \code{file} argument instead.} \item{...}{Additional arguments to the connection function. For example, control the space-time trade-off of different compression methods with \code{compression}. See \code{\link[=connections]{connections()}} for more details.} } \value{ \code{write_rds()} returns \code{x}, invisibly. } \description{ Consistent wrapper around \code{\link[=saveRDS]{saveRDS()}} and \code{\link[=readRDS]{readRDS()}}. \code{write_rds()} does not compress by default as space is generally cheaper than time.
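For example, \code{write_rds(mtcars, tempfile(fileext = ".rds"), compress = "gz")} trades a slower write for a smaller file.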
} \examples{ temp <- tempfile() write_rds(mtcars, temp) read_rds(temp) \dontrun{ write_rds(mtcars, "compressed_mtc.rds", "xz", compression = 9L) } } readr/man/callback.Rd0000644000176200001440000000375514315642631014164 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/callback.R \name{callback} \alias{callback} \alias{ChunkCallback} \alias{SideEffectChunkCallback} \alias{DataFrameCallback} \alias{ListCallback} \alias{AccumulateCallback} \title{Callback classes} \description{ These classes are used to define callback behaviors. } \details{ \describe{ \item{ChunkCallback}{Callback interface definition; all callback functions should inherit from this class.} \item{SideEffectChunkCallback}{Callback function that is used only for side effects; no results are returned.} \item{DataFrameCallback}{Callback function that combines each result together at the end.} \item{AccumulateCallback}{ Callback function that accumulates a single result. Requires the parameter \code{acc} to specify the initial value of the accumulator. The parameter \code{acc} is \code{NULL} by default. } } } \examples{ ## If given a regular function it is converted to a SideEffectChunkCallback # view structure of each chunk read_lines_chunked(readr_example("mtcars.csv"), str, chunk_size = 5) # Print starting line of each chunk f <- function(x, pos) print(pos) read_lines_chunked(readr_example("mtcars.csv"), SideEffectChunkCallback$new(f), chunk_size = 5) # If combined results are desired you can use the DataFrameCallback # Cars with 3 gears f <- function(x, pos) subset(x, gear == 3) read_csv_chunked(readr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) # The ListCallback can be used for more flexible output f <- function(x, pos) x$mpg[x$hp > 100] read_csv_chunked(readr_example("mtcars.csv"), ListCallback$new(f), chunk_size = 5) # The AccumulateCallback accumulates results from each chunk f <- function(x, pos, acc) sum(x$mpg) + acc read_csv_chunked(readr_example("mtcars.csv"), AccumulateCallback$new(f, acc = 0), chunk_size = 5) } \seealso{ Other chunked: \code{\link{melt_delim_chunked}()}, \code{\link{read_delim_chunked}()}, \code{\link{read_lines_chunked}()} } \concept{chunked} \keyword{internal} readr/man/datasource.Rd0000644000176200001440000000301614174704674014561 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/source.R \name{datasource} \alias{datasource} \title{Create a source object.} \usage{ datasource( file, skip = 0, skip_empty_rows = FALSE, comment = "", skip_quote = TRUE ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{skip}{Number of lines to skip before reading data.} } \description{ Create a source object.
} \examples{ # Literal csv datasource("a,b,c\n1,2,3") datasource(charToRaw("a,b,c\n1,2,3")) # Strings datasource(readr_example("mtcars.csv")) datasource(readr_example("mtcars.csv.bz2")) datasource(readr_example("mtcars.csv.zip")) \dontrun{ datasource("https://github.com/tidyverse/readr/raw/main/inst/extdata/mtcars.csv") } # Connection con <- rawConnection(charToRaw("abc\n123")) datasource(con) close(con) } \keyword{internal} readr/man/read_table.Rd0000644000176200001440000001257314510343737014512 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read_table.R \name{read_table} \alias{read_table} \title{Read whitespace-separated columns into a tibble} \usage{ read_table( file, col_names = TRUE, col_types = NULL, locale = default_locale(), na = "NA", skip = 0, n_max = Inf, guess_max = min(n_max, 1000), progress = show_progress(), comment = "", show_col_types = should_show_types(), skip_empty_rows = TRUE ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{col_names}{Either \code{TRUE}, \code{FALSE} or a character vector of column names. If \code{TRUE}, the first row of the input will be used as the column names, and will not be included in the data frame. If \code{FALSE}, column names will be generated automatically: X1, X2, X3 etc. If \code{col_names} is a character vector, the values will be used as the names of the columns, and the first row of the input will be read into the first row of the output data frame. Missing (\code{NA}) column names will generate a warning, and be filled in with dummy names \code{...1}, \code{...2} etc. Duplicate column names will generate a warning and be made unique, see \code{name_repair} to control how this is done.} \item{col_types}{One of \code{NULL}, a \code{\link[=cols]{cols()}} specification, or a string. See \code{vignette("readr")} for more details. If \code{NULL}, all column types will be inferred from \code{guess_max} rows of the input, interspersed throughout the file. This is convenient (and fast), but not robust. If the guessed types are wrong, you'll need to increase \code{guess_max} or supply the correct types yourself. Column specifications created by \code{\link[=list]{list()}} or \code{\link[=cols]{cols()}} must contain one column specification for each column. If you only want to read a subset of the columns, use \code{\link[=cols_only]{cols_only()}}. Alternatively, you can use a compact string representation where each character represents one column: \itemize{ \item c = character \item i = integer \item n = number \item d = double \item l = logical \item f = factor \item D = date \item T = date time \item t = time \item ? = guess \item _ or - = skip } By default, reading a file without a column specification will print a message showing what \code{readr} guessed they were. 
To remove this message, set \code{show_col_types = FALSE} or set \code{options(readr.show_col_types = FALSE)}.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{skip}{Number of lines to skip before reading data.} \item{n_max}{Maximum number of lines to read.} \item{guess_max}{Maximum number of lines to use for guessing column types. Will never use more than the number of lines read. See \code{vignette("column-types", package = "readr")} for more details.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{comment}{A string used to identify comments. Any text after the comment characters will be silently ignored.} \item{show_col_types}{If \code{FALSE}, do not show the guessed column types. If \code{TRUE} always show the column types, even if they are supplied. If \code{NULL} (the default) only show the column types if they are not explicitly supplied by the \code{col_types} argument.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} } \description{ \code{read_table()} is designed to read the type of textual data where each column is separated by one (or more) columns of space. \code{read_table()} is like \code{\link[=read.table]{read.table()}}: it allows any number of whitespace characters between columns, and the lines can be of different lengths. \code{spec_table()} returns the column specifications rather than a data frame. } \examples{ ws <- readr_example("whitespace-sample.txt") writeLines(read_lines(ws)) read_table(ws) } \seealso{ \code{\link[=read_fwf]{read_fwf()}} to read fixed width files where each column is not separated by whitespace. \code{read_fwf()} is also useful for reading tabular data with non-standard formatting. } readr/man/spec.Rd0000644000176200001440000000172514174357220013355 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/col_types.R \name{cols_condense} \alias{cols_condense} \alias{spec} \title{Examine the column specifications for a data frame} \usage{ cols_condense(x) spec(x) } \arguments{ \item{x}{The data frame object to extract from} } \value{ A col_spec object. } \description{ \code{cols_condense()} takes a spec object and condenses its definition by setting the default column type to the most frequent type and only listing columns with a different type. \code{spec()} extracts the full column specification from a tibble created by readr.
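For example, a specification in which most columns are \code{col_double()} condenses to \code{cols(.default = col_double())} plus entries for only the exceptional columns.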
} \examples{ df <- read_csv(readr_example("mtcars.csv")) s <- spec(df) s cols_condense(s) } \seealso{ Other parsers: \code{\link{col_skip}()}, \code{\link{cols}()}, \code{\link{parse_datetime}()}, \code{\link{parse_factor}()}, \code{\link{parse_guess}()}, \code{\link{parse_logical}()}, \code{\link{parse_number}()}, \code{\link{parse_vector}()} } \concept{parsers} readr/man/parse_factor.Rd0000644000176200001440000000475514304131171015067 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/collectors.R \name{parse_factor} \alias{parse_factor} \alias{col_factor} \title{Parse factors} \usage{ parse_factor( x, levels = NULL, ordered = FALSE, na = c("", "NA"), locale = default_locale(), include_na = TRUE, trim_ws = TRUE ) col_factor(levels = NULL, ordered = FALSE, include_na = FALSE) } \arguments{ \item{x}{Character vector of values to parse.} \item{levels}{Character vector of the allowed levels. When \code{levels = NULL} (the default), \code{levels} are discovered from the unique values of \code{x}, in the order in which they appear in \code{x}.} \item{ordered}{Is it an ordered factor?} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{include_na}{If \code{TRUE} and \code{x} contains at least one \code{NA}, then \code{NA} is included in the levels of the constructed factor.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} } \description{ \code{parse_factor()} is similar to \code{\link[=factor]{factor()}}, but generates a warning if \code{levels} have been specified and some elements of \code{x} are not found in those \code{levels}. } \examples{ # discover the levels from the data parse_factor(c("a", "b")) parse_factor(c("a", "b", "-99")) parse_factor(c("a", "b", "-99"), na = c("", "NA", "-99")) parse_factor(c("a", "b", "-99"), na = c("", "NA", "-99"), include_na = FALSE) # provide the levels explicitly parse_factor(c("a", "b"), levels = letters[1:5]) x <- c("cat", "dog", "caw") animals <- c("cat", "dog", "cow") # base::factor() silently converts elements that do not match any levels to # NA factor(x, levels = animals) # parse_factor() generates the same factor as base::factor() but throws a warning # and reports problems parse_factor(x, levels = animals) } \seealso{ Other parsers: \code{\link{col_skip}()}, \code{\link{cols_condense}()}, \code{\link{cols}()}, \code{\link{parse_datetime}()}, \code{\link{parse_guess}()}, \code{\link{parse_logical}()}, \code{\link{parse_number}()}, \code{\link{parse_vector}()} } \concept{parsers} readr/man/output_column.Rd0000644000176200001440000000104714174357220015335 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/write.R \name{output_column} \alias{output_column} \title{Preprocess column for output} \usage{ output_column(x, name) } \arguments{ \item{x}{A vector} } \description{ This is a generic function that is applied to each column before it is saved to disk. It provides a hook for S3 classes that need special handling. } \examples{ # Most columns are not altered, but POSIXct are converted to ISO8601.
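# A minimal illustration (assuming the default method, which returns its
# input unchanged): character vectors pass through as-is
str(output_column(letters[1:3]))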
x <- parse_datetime("2016-01-01") str(output_column(x)) } \keyword{internal} readr/man/type_convert.Rd0000644000176200001440000000433414174704674015154 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/type_convert.R \name{type_convert} \alias{type_convert} \title{Re-convert character columns in existing data frame} \usage{ type_convert( df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE, locale = default_locale(), guess_integer = FALSE ) } \arguments{ \item{df}{A data frame.} \item{col_types}{One of \code{NULL}, a \code{\link[=cols]{cols()}} specification, or a string. See \code{vignette("readr")} for more details. If \code{NULL}, column types will be imputed using all rows.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{guess_integer}{If \code{TRUE}, guess integer types for whole numbers, if \code{FALSE} guess numeric type for all numbers.} } \description{ This is useful if you need to do some manual munging - you can read the columns in as character, clean it up with (e.g.) regular expressions and then let readr take another stab at parsing it. The name is a homage to the base \code{\link[utils:type.convert]{utils::type.convert()}}. } \note{ \code{type_convert()} removes a 'spec' attribute, because it likely modifies the column data types. (see \code{\link[=spec]{spec()}} for more information about column specifications). } \examples{ df <- data.frame( x = as.character(runif(10)), y = as.character(sample(10)), stringsAsFactors = FALSE ) str(df) str(type_convert(df)) df <- data.frame(x = c("NA", "10"), stringsAsFactors = FALSE) str(type_convert(df)) # Type convert can be used to infer types from an entire dataset # first read the data as character data <- read_csv(readr_example("mtcars.csv"), col_types = list(.default = col_character()) ) str(data) # Then convert it with type_convert type_convert(data) } readr/man/show_progress.Rd0000644000176200001440000000127514304131171015315 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R \name{show_progress} \alias{show_progress} \title{Determine whether progress bars should be shown} \usage{ show_progress() } \description{ By default, readr shows progress bars. However, progress reporting is suppressed if any of the following conditions hold: \itemize{ \item The bar is explicitly disabled by setting \code{options(readr.show_progress = FALSE)}. \item The code is run in a non-interactive session, as determined by \code{\link[rlang:is_interactive]{rlang::is_interactive()}}. \item The code is run in an RStudio notebook chunk, as determined by \code{getOption("rstudio.notebook.executing")}. 
} } readr/man/tokenize.Rd0000644000176200001440000000303214174704674014255 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tokenizer.R \name{tokenize} \alias{tokenize} \title{Tokenize a file/string.} \usage{ tokenize(file, tokenizer = tokenizer_csv(), skip = 0, n_max = -1L) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{tokenizer}{A tokenizer specification.} \item{skip}{Number of lines to skip before reading data.} \item{n_max}{Optionally, maximum number of rows to tokenize.} } \description{ Turns input into a character vector. Usually the tokenization is done purely in C++, and never exposed to R (because that requires a copy). This function is useful for testing, or when a file doesn't parse correctly and you want to see the underlying tokens. } \examples{ tokenize("1,2\n3,4,5\n\n6") # Only tokenize first two lines tokenize("1,2\n3,4,5\n\n6", n_max = 2) } \keyword{internal} readr/man/encoding.Rd0000644000176200001440000000162714174357220014212 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/encoding.R \name{guess_encoding} \alias{guess_encoding} \title{Guess encoding of file} \usage{ guess_encoding(file, n_max = 10000, threshold = 0.2) } \arguments{ \item{file}{A character string specifying an input as specified in \code{\link[=datasource]{datasource()}}, a raw vector, or a list of raw vectors.} \item{n_max}{Number of lines to read. If \code{n_max} is -1, all lines in file will be read.} \item{threshold}{Only report guesses above this threshold of certainty.} } \value{ A tibble } \description{ Uses \code{\link[stringi:stri_enc_detect]{stringi::stri_enc_detect()}}: see the documentation there for caveats. } \examples{ guess_encoding(readr_example("mtcars.csv")) guess_encoding(read_lines_raw(readr_example("mtcars.csv"))) guess_encoding(read_file_raw(readr_example("mtcars.csv"))) guess_encoding("a\n\u00b5\u00b5") } readr/man/melt_table.Rd0000644000176200001440000001004214371264576014535 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/melt_table.R \name{melt_table} \alias{melt_table} \alias{melt_table2} \title{Return melted data for each token in a whitespace-separated file} \usage{ melt_table( file, locale = default_locale(), na = "NA", skip = 0, n_max = Inf, guess_max = min(n_max, 1000), progress = show_progress(), comment = "", skip_empty_rows = FALSE ) melt_table2( file, locale = default_locale(), na = "NA", skip = 0, n_max = Inf, progress = show_progress(), comment = "", skip_empty_rows = FALSE ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed.
Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{skip}{Number of lines to skip before reading data.} \item{n_max}{Maximum number of lines to read.} \item{guess_max}{Maximum number of lines to use for guessing column types. Will never use more than the number of lines read. See \code{vignette("column-types", package = "readr")} for more details.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{comment}{A string used to identify comments. Any text after the comment characters will be silently ignored.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} } \description{ \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#superseded}{\figure{lifecycle-superseded.svg}{options: alt='[Superseded]'}}}{\strong{[Superseded]}} This function has been superseded in readr and moved to \href{https://r-lib.github.io/meltr/}{the meltr package}. For certain non-rectangular data formats, it can be useful to parse the data into a melted format where each row represents a single token. \code{melt_table()} and \code{melt_table2()} are designed to read the type of textual data where each column is separated by one (or more) columns of space. \code{melt_table2()} allows any number of whitespace characters between columns, and the lines can be of different lengths. \code{melt_table()} is more strict: each line must be the same length, and each field is in the same position in every line. It first finds empty columns and then parses like a fixed width file. } \examples{ fwf <- readr_example("fwf-sample.txt") writeLines(read_lines(fwf)) melt_table(fwf) ws <- readr_example("whitespace-sample.txt") writeLines(read_lines(ws)) melt_table2(ws) } \seealso{ \code{\link[=melt_fwf]{melt_fwf()}} to melt fixed width files where each column is not separated by whitespace. \code{melt_fwf()} is also useful for reading tabular data with non-standard formatting. \code{\link[=read_table]{read_table()}} is the conventional way to read tabular data from whitespace-separated files.
} readr/man/figures/0000755000176200001440000000000014403212750013564 5ustar liggesusersreadr/man/figures/lifecycle-defunct.svg0000644000176200001440000000242414403212750017674 0ustar liggesusers [SVG badge: "lifecycle: defunct"] readr/man/figures/lifecycle-maturing.svg0000644000176200001440000000243014403212750020067 0ustar liggesusers [SVG badge: "lifecycle: maturing"] readr/man/figures/logo.png0000644000176200001440000003765114403211600015237 0ustar liggesusers [binary PNG image data omitted: readr hex sticker logo]
readr/man/figures/lifecycle-archived.svg [SVG badge: "lifecycle: archived"] readr/man/figures/lifecycle-soft-deprecated.svg0000644000176200001440000000246614403212750021323 0ustar liggesusers [SVG badge: "lifecycle: soft-deprecated"] readr/man/figures/lifecycle-questioning.svg0000644000176200001440000000244414403212750020613 0ustar liggesusers [SVG badge: "lifecycle: questioning"] readr/man/figures/lifecycle-superseded.svg0000644000176200001440000000244014403212750020405 0ustar liggesusers [SVG badge: "lifecycle: superseded"] readr/man/figures/lifecycle-stable.svg0000644000176200001440000000247214403212750017521 0ustar liggesusers [SVG badge: "lifecycle: stable"] readr/man/figures/lifecycle-experimental.svg0000644000176200001440000000245014403212750020740 0ustar liggesusers [SVG badge: "lifecycle: experimental"] readr/man/figures/lifecycle-deprecated.svg0000644000176200001440000000244014403212750020342 0ustar liggesusers [SVG badge: "lifecycle: deprecated"]
readr/man/edition_get.Rd0000644000176200001440000000053214315646511014711 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/edition.R \name{edition_get} \alias{edition_get} \title{Retrieve the currently active edition} \usage{ edition_get() } \value{ An integer corresponding to the currently active edition. } \description{ Retrieve the currently active edition } \examples{ edition_get() } readr/man/with_edition.Rd0000644000176200001440000000260614315646511015111 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/edition.R \name{with_edition} \alias{with_edition} \alias{local_edition} \title{Temporarily change the active readr edition} \usage{ with_edition(edition, code) local_edition(edition, env = parent.frame()) } \arguments{ \item{edition}{Should be a single integer, such as \code{1} or \code{2}.} \item{code}{Code to run with the changed edition.} \item{env}{Environment that controls scope of changes. For expert use only.} } \description{ \code{with_edition()} allows you to change the active edition of readr for a given block of code. \code{local_edition()} allows you to change the active edition of readr until the end of the current function or file. } \examples{ with_edition(1, edition_get()) with_edition(2, edition_get()) # readr 1e and 2e behave differently when input rows have different numbers of fields with_edition(1, read_csv("1,2\n3,4,5", col_names = c("X", "Y", "Z"))) with_edition(2, read_csv("1,2\n3,4,5", col_names = c("X", "Y", "Z"))) # local_edition() applies in a specific scope, for example, inside a function read_csv_1e <- function(...) { local_edition(1) read_csv(...) } read_csv("1,2\n3,4,5", col_names = c("X", "Y", "Z")) # 2e behaviour read_csv_1e("1,2\n3,4,5", col_names = c("X", "Y", "Z")) # 1e behaviour read_csv("1,2\n3,4,5", col_names = c("X", "Y", "Z")) # 2e behaviour } readr/man/readr-package.Rd0000644000176200001440000000255214403211400015070 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/readr-package.R \docType{package} \name{readr-package} \alias{readr} \alias{readr-package} \title{readr: Read Rectangular Text Data} \description{ \if{html}{\figure{logo.png}{options: style='float: right' alt='logo' width='120'}} The goal of 'readr' is to provide a fast and friendly way to read rectangular data (like 'csv', 'tsv', and 'fwf'). It is designed to flexibly parse many types of data found in the wild, while still cleanly failing when data unexpectedly changes.
} \seealso{ Useful links: \itemize{ \item \url{https://readr.tidyverse.org} \item \url{https://github.com/tidyverse/readr} \item Report bugs at \url{https://github.com/tidyverse/readr/issues} } } \author{ \strong{Maintainer}: Jennifer Bryan \email{jenny@posit.co} (\href{https://orcid.org/0000-0002-6983-2759}{ORCID}) Authors: \itemize{ \item Hadley Wickham \email{hadley@posit.co} \item Jim Hester } Other contributors: \itemize{ \item Romain Francois [contributor] \item Shelby Bearrows [contributor] \item Posit Software, PBC [copyright holder, funder] \item https://github.com/mandreyel/ (mio library) [copyright holder] \item Jukka Jylänki (grisu3 implementation) [contributor, copyright holder] \item Mikkel Jørgensen (grisu3 implementation) [contributor, copyright holder] } } \keyword{internal} readr/man/should_show_types.Rd0000644000176200001440000000137714174704674016221 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R \name{should_show_types} \alias{should_show_types} \title{Determine whether column types should be shown} \usage{ should_show_types() } \description{ Wrapper around \code{getOption("readr.show_col_types")} that implements some fall back logic if the option is unset. This returns: \itemize{ \item \code{TRUE} if the option is set to \code{TRUE} \item \code{FALSE} if the option is set to \code{FALSE} \item \code{FALSE} if the option is unset and we appear to be running tests \item \code{NULL} otherwise, in which case the caller determines whether to show column types based on context, e.g. whether \code{show_col_types} or actual \code{col_types} were explicitly specified } } readr/man/read_fwf.Rd0000644000176200001440000002377514510343737014213 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read_fwf.R \name{read_fwf} \alias{read_fwf} \alias{fwf_empty} \alias{fwf_widths} \alias{fwf_positions} \alias{fwf_cols} \title{Read a fixed width file into a tibble} \usage{ read_fwf( file, col_positions = fwf_empty(file, skip, n = guess_max), col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(n_max, 1000), progress = show_progress(), name_repair = "unique", num_threads = readr_threads(), show_col_types = should_show_types(), lazy = should_read_lazy(), skip_empty_rows = TRUE ) fwf_empty( file, skip = 0, skip_empty_rows = FALSE, col_names = NULL, comment = "", n = 100L ) fwf_widths(widths, col_names = NULL) fwf_positions(start, end = NULL, col_names = NULL) fwf_cols(...) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. 
Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{col_positions}{Column positions, as created by \code{\link[=fwf_empty]{fwf_empty()}}, \code{\link[=fwf_widths]{fwf_widths()}} or \code{\link[=fwf_positions]{fwf_positions()}}. To read in only selected fields, use \code{\link[=fwf_positions]{fwf_positions()}}. If the width of the last column is variable (a ragged fwf file), supply the last end position as NA.} \item{col_types}{One of \code{NULL}, a \code{\link[=cols]{cols()}} specification, or a string. See \code{vignette("readr")} for more details. If \code{NULL}, all column types will be inferred from \code{guess_max} rows of the input, interspersed throughout the file. This is convenient (and fast), but not robust. If the guessed types are wrong, you'll need to increase \code{guess_max} or supply the correct types yourself. Column specifications created by \code{\link[=list]{list()}} or \code{\link[=cols]{cols()}} must contain one column specification for each column. If you only want to read a subset of the columns, use \code{\link[=cols_only]{cols_only()}}. Alternatively, you can use a compact string representation where each character represents one column: \itemize{ \item c = character \item i = integer \item n = number \item d = double \item l = logical \item f = factor \item D = date \item T = date time \item t = time \item ? = guess \item _ or - = skip } By default, reading a file without a column specification will print a message showing what \code{readr} guessed they were. To remove this message, set \code{show_col_types = FALSE} or set \code{options(readr.show_col_types = FALSE)}.} \item{col_select}{Columns to include in the results. You can use the same mini-language as \code{dplyr::select()} to refer to the columns by name. Use \code{c()} to use more than one selection expression. Although this usage is less common, \code{col_select} also accepts a numeric column index. See \code{\link[tidyselect:language]{?tidyselect::language}} for full details on the selection language.} \item{id}{The name of a column in which to store the file path. This is useful when reading multiple input files and there is data in the file paths, such as the data collection date. If \code{NULL} (the default) no extra column is created.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{comment}{A string used to identify comments. Any text after the comment characters will be silently ignored.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{skip}{Number of lines to skip before reading data.} \item{n_max}{Maximum number of lines to read.} \item{guess_max}{Maximum number of lines to use for guessing column types. Will never use more than the number of lines read. See \code{vignette("column-types", package = "readr")} for more details.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. 
The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{name_repair}{Handling of column names. The default behaviour is to ensure column names are \code{"unique"}. Various repair strategies are supported: \itemize{ \item \code{"minimal"}: No name repair or checks, beyond basic existence of names. \item \code{"unique"} (default value): Make sure names are unique and not empty. \item \code{"check_unique"}: No name repair, but check they are \code{unique}. \item \code{"unique_quiet"}: Repair with the \code{unique} strategy, quietly. \item \code{"universal"}: Make the names \code{unique} and syntactic. \item \code{"universal_quiet"}: Repair with the \code{universal} strategy, quietly. \item A function: Apply custom name repair (e.g., \code{name_repair = make.names} for names in the style of base R). \item A purrr-style anonymous function, see \code{\link[rlang:as_function]{rlang::as_function()}}. } This argument is passed on as \code{repair} to \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}}. See there for more details on these terms and the strategies used to enforce them.} \item{num_threads}{The number of processing threads to use for initial parsing and lazy reading of data. If your data contains newlines within fields the parser should automatically detect this and fall back to using one thread only. However if you know your file has newlines within quoted fields it is safest to set \code{num_threads = 1} explicitly.} \item{show_col_types}{If \code{FALSE}, do not show the guessed column types. If \code{TRUE} always show the column types, even if they are supplied. If \code{NULL} (the default) only show the column types if they are not explicitly supplied by the \code{col_types} argument.} \item{lazy}{Read values lazily? By default, this is \code{FALSE}, because there are special considerations when reading a file lazily that have tripped up some users. Specifically, things get tricky when reading and then writing back into the same file. But, in general, lazy reading (\code{lazy = TRUE}) has many benefits, especially for interactive use and when your downstream work only involves a subset of the rows or columns. Learn more in \code{\link[=should_read_lazy]{should_read_lazy()}} and in the documentation for the \code{altrep} argument of \code{\link[vroom:vroom]{vroom::vroom()}}.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} \item{col_names}{Either NULL, or a character vector of column names.} \item{n}{Number of lines the tokenizer will read to determine file structure. By default it is set to 100.} \item{widths}{Width of each field. Use NA as width of last field when reading a ragged fwf file.} \item{start, end}{Starting and ending (inclusive) positions of each field. Use NA as last end field when reading a ragged fwf file.} \item{...}{If the first element is a data frame, then it must have all numeric columns and either one or two rows. The column names are the variable names. The column values are the variable widths if a length one vector, and if length two, variable start and end positions. The elements of \code{...} are used to construct a data frame with one or two rows as above.} } \description{ A fixed width file can be a very compact representation of numeric data.
It's also very fast to parse, because every field is in the same place in every line. Unfortunately, it's painful to parse because you need to describe the length of every field. Readr aims to make it as easy as possible by providing a number of different ways to describe the field structure. \itemize{ \item \code{\link[=fwf_empty]{fwf_empty()}} - Guesses based on the positions of empty columns. \item \code{\link[=fwf_widths]{fwf_widths()}} - Supply the widths of the columns. \item \code{\link[=fwf_positions]{fwf_positions()}} - Supply paired vectors of start and end positions. \item \code{\link[=fwf_cols]{fwf_cols()}} - Supply named arguments of paired start and end positions or column widths. } } \section{Second edition changes}{ Comments are no longer looked for anywhere in the file. They are now only ignored at the start of a line. } \examples{ fwf_sample <- readr_example("fwf-sample.txt") writeLines(read_lines(fwf_sample)) # You can specify column positions in several ways: # 1. Guess based on position of empty columns read_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) # 2. A vector of field widths read_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) # 3. Paired vectors of start and end positions read_fwf(fwf_sample, fwf_positions(c(1, 30), c(20, 42), c("name", "ssn"))) # 4. Named arguments with start and end positions read_fwf(fwf_sample, fwf_cols(name = c(1, 20), ssn = c(30, 42))) # 5. Named arguments with column widths read_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) } \seealso{ \code{\link[=read_table]{read_table()}} to read fixed width files where each column is separated by whitespace. } readr/man/clipboard.Rd0000644000176200001440000000050514152512262014350 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/source.R \name{clipboard} \alias{clipboard} \title{Returns values from the clipboard} \usage{ clipboard() } \description{ This is useful in the \code{\link[=read_delim]{read_delim()}} functions to read from the clipboard. } \seealso{ read_delim } readr/man/locale.Rd0000644000176200001440000000447614174357220013670 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/locale.R \name{locale} \alias{locale} \alias{default_locale} \title{Create locales} \usage{ locale( date_names = "en", date_format = "\%AD", time_format = "\%AT", decimal_mark = ".", grouping_mark = ",", tz = "UTC", encoding = "UTF-8", asciify = FALSE ) default_locale() } \arguments{ \item{date_names}{Character representations of day and month names. Either the language code as string (passed on to \code{\link[=date_names_lang]{date_names_lang()}}) or an object created by \code{\link[=date_names]{date_names()}}.} \item{date_format, time_format}{Default date and time formats.} \item{decimal_mark, grouping_mark}{Symbols used to indicate the decimal place, and to chunk larger numbers. Decimal mark can only be \verb{,} or \code{.}.} \item{tz}{Default tz. This is used both for input (if the time zone isn't present in individual strings), and for output (to control the default display). The default is to use "UTC", a time zone that does not use daylight savings time (DST) and hence is typically most useful for data. The absence of time zones makes it approximately 50x faster to generate UTC times than any other time zone. Use \code{""} to use the system default time zone, but beware that this will not be reproducible across systems. 
For a complete list of possible time zones, see \code{\link[=OlsonNames]{OlsonNames()}}. Americans, note that "EST" is a Canadian time zone that does not have DST. It is \emph{not} Eastern Standard Time. It's better to use "US/Eastern", "US/Central" etc.} \item{encoding}{Default encoding. This only affects how the file is read - readr always converts the output to UTF-8.} \item{asciify}{Should diacritics be stripped from date names and converted to ASCII? This is useful if you're dealing with ASCII data where the correct spellings have been lost. Requires the \pkg{stringi} package.} } \description{ A locale object tries to capture all the defaults that can vary between countries. You set the locale once, and the details are automatically passed down to the column parsers. The defaults have been chosen to match R (i.e. US English) as closely as possible. See \code{vignette("locales")} for more details. } \examples{ locale() locale("fr") # South American locale locale("es", decimal_mark = ",") } readr/man/readr_example.Rd0000644000176200001440000000076014174357220015231 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/example.R \name{readr_example} \alias{readr_example} \title{Get path to readr example} \usage{ readr_example(file = NULL) } \arguments{ \item{file}{Name of file. If \code{NULL}, the example files will be listed.} } \description{ readr comes bundled with a number of sample files in its \code{inst/extdata} directory. This function makes them easy to access. } \examples{ readr_example() readr_example("challenge.csv") } readr/man/parse_vector.Rd0000644000176200001440000000254714174704674015127 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/collectors.R \name{parse_vector} \alias{parse_vector} \title{Parse a character vector.} \usage{ parse_vector( x, collector, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE ) } \arguments{ \item{x}{Character vector of elements to parse.} \item{collector}{Column specification.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} } \description{ Parse a character vector.
} \examples{ x <- c("1", "2", "3", "NA") parse_vector(x, col_integer()) parse_vector(x, col_double()) } \seealso{ Other parsers: \code{\link{col_skip}()}, \code{\link{cols_condense}()}, \code{\link{cols}()}, \code{\link{parse_datetime}()}, \code{\link{parse_factor}()}, \code{\link{parse_guess}()}, \code{\link{parse_logical}()}, \code{\link{parse_number}()} } \concept{parsers} \keyword{internal} readr/man/read_delim.Rd0000644000176200001440000003200014510343737014500 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read_delim.R \name{read_delim} \alias{read_delim} \alias{read_csv} \alias{read_csv2} \alias{read_tsv} \title{Read a delimited file (including CSV and TSV) into a tibble} \usage{ read_delim( file, delim = NULL, quote = "\\"", escape_backslash = FALSE, escape_double = TRUE, col_names = TRUE, col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = FALSE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), name_repair = "unique", num_threads = readr_threads(), progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy() ) read_csv( file, col_names = TRUE, col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), name_repair = "unique", num_threads = readr_threads(), progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy() ) read_csv2( file, col_names = TRUE, col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), progress = show_progress(), name_repair = "unique", num_threads = readr_threads(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy() ) read_tsv( file, col_names = TRUE, col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), progress = show_progress(), name_repair = "unique", num_threads = readr_threads(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy() ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{delim}{Single character used to separate fields within a record.} \item{quote}{Single character used to quote strings.} \item{escape_backslash}{Does the file use backslashes to escape special characters? 
This is more general than \code{escape_double} as backslashes can be used to escape the delimiter character, the quote character, or to add special characters like \verb{\\\\n}.} \item{escape_double}{Does the file escape quotes by doubling them? i.e. If this option is \code{TRUE}, the value \verb{""""} represents a single quote, \verb{\\"}.} \item{col_names}{Either \code{TRUE}, \code{FALSE} or a character vector of column names. If \code{TRUE}, the first row of the input will be used as the column names, and will not be included in the data frame. If \code{FALSE}, column names will be generated automatically: X1, X2, X3 etc. If \code{col_names} is a character vector, the values will be used as the names of the columns, and the first row of the input will be read into the first row of the output data frame. Missing (\code{NA}) column names will generate a warning, and be filled in with dummy names \code{...1}, \code{...2} etc. Duplicate column names will generate a warning and be made unique, see \code{name_repair} to control how this is done.} \item{col_types}{One of \code{NULL}, a \code{\link[=cols]{cols()}} specification, or a string. See \code{vignette("readr")} for more details. If \code{NULL}, all column types will be inferred from \code{guess_max} rows of the input, interspersed throughout the file. This is convenient (and fast), but not robust. If the guessed types are wrong, you'll need to increase \code{guess_max} or supply the correct types yourself. Column specifications created by \code{\link[=list]{list()}} or \code{\link[=cols]{cols()}} must contain one column specification for each column. If you only want to read a subset of the columns, use \code{\link[=cols_only]{cols_only()}}. Alternatively, you can use a compact string representation where each character represents one column: \itemize{ \item c = character \item i = integer \item n = number \item d = double \item l = logical \item f = factor \item D = date \item T = date time \item t = time \item ? = guess \item _ or - = skip } By default, reading a file without a column specification will print a message showing what \code{readr} guessed they were. To remove this message, set \code{show_col_types = FALSE} or set \code{options(readr.show_col_types = FALSE)}.} \item{col_select}{Columns to include in the results. You can use the same mini-language as \code{dplyr::select()} to refer to the columns by name. Use \code{c()} to use more than one selection expression. Although this usage is less common, \code{col_select} also accepts a numeric column index. See \code{\link[tidyselect:language]{?tidyselect::language}} for full details on the selection language.} \item{id}{The name of a column in which to store the file path. This is useful when reading multiple input files and there is data in the file paths, such as the data collection date. If \code{NULL} (the default) no extra column is created.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. 
Set this option to \code{character()} to indicate no missing values.} \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values inside quotes be treated as missing values (the default) or strings. This parameter is soft deprecated as of readr 2.0.0.} \item{comment}{A string used to identify comments. Any text after the comment characters will be silently ignored.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{skip}{Number of lines to skip before reading data. If \code{comment} is supplied any commented lines are ignored \emph{after} skipping.} \item{n_max}{Maximum number of lines to read.} \item{guess_max}{Maximum number of lines to use for guessing column types. Will never use more than the number of lines read. See \code{vignette("column-types", package = "readr")} for more details.} \item{name_repair}{Handling of column names. The default behaviour is to ensure column names are \code{"unique"}. Various repair strategies are supported: \itemize{ \item \code{"minimal"}: No name repair or checks, beyond basic existence of names. \item \code{"unique"} (default value): Make sure names are unique and not empty. \item \code{"check_unique"}: No name repair, but check they are \code{unique}. \item \code{"unique_quiet"}: Repair with the \code{unique} strategy, quietly. \item \code{"universal"}: Make the names \code{unique} and syntactic. \item \code{"universal_quiet"}: Repair with the \code{universal} strategy, quietly. \item A function: Apply custom name repair (e.g., \code{name_repair = make.names} for names in the style of base R). \item A purrr-style anonymous function, see \code{\link[rlang:as_function]{rlang::as_function()}}. } This argument is passed on as \code{repair} to \code{\link[vctrs:vec_as_names]{vctrs::vec_as_names()}}. See there for more details on these terms and the strategies used to enforce them.} \item{num_threads}{The number of processing threads to use for initial parsing and lazy reading of data. If your data contains newlines within fields the parser should automatically detect this and fall back to using one thread only. However if you know your file has newlines within quoted fields it is safest to set \code{num_threads = 1} explicitly.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{show_col_types}{If \code{FALSE}, do not show the guessed column types. If \code{TRUE} always show the column types, even if they are supplied. If \code{NULL} (the default) only show the column types if they are not explicitly supplied by the \code{col_types} argument.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} \item{lazy}{Read values lazily? By default, this is \code{FALSE}, because there are special considerations when reading a file lazily that have tripped up some users. Specifically, things get tricky when reading and then writing back into the same file. 
But, in general, lazy reading (\code{lazy = TRUE}) has many benefits, especially for interactive use and when your downstream work only involves a subset of the rows or columns. Learn more in \code{\link[=should_read_lazy]{should_read_lazy()}} and in the documentation for the \code{altrep} argument of \code{\link[vroom:vroom]{vroom::vroom()}}.} } \value{ A \code{\link[=tibble]{tibble()}}. If there are parsing problems, a warning will alert you. You can retrieve the full details by calling \code{\link[=problems]{problems()}} on your dataset. } \description{ \code{read_csv()} and \code{read_tsv()} are special cases of the more general \code{read_delim()}. They're useful for reading the most common types of flat file data, comma separated values and tab separated values, respectively. \code{read_csv2()} uses \verb{;} for the field separator and \verb{,} for the decimal point. This format is common in some European countries. } \examples{ # Input sources ------------------------------------------------------------- # Read from a path read_csv(readr_example("mtcars.csv")) read_csv(readr_example("mtcars.csv.zip")) read_csv(readr_example("mtcars.csv.bz2")) \dontrun{ # Including remote paths read_csv("https://github.com/tidyverse/readr/raw/main/inst/extdata/mtcars.csv") } # Read from multiple file paths at once continents <- c("africa", "americas", "asia", "europe", "oceania") filepaths <- vapply( paste0("mini-gapminder-", continents, ".csv"), FUN = readr_example, FUN.VALUE = character(1) ) read_csv(filepaths, id = "file") # Or directly from a string with `I()` read_csv(I("x,y\n1,2\n3,4")) # Column selection----------------------------------------------------------- # Pass column names or indexes directly to select them read_csv(readr_example("chickens.csv"), col_select = c(chicken, eggs_laid)) read_csv(readr_example("chickens.csv"), col_select = c(1, 3:4)) # Or use the selection helpers read_csv( readr_example("chickens.csv"), col_select = c(starts_with("c"), last_col()) ) # You can also rename specific columns read_csv( readr_example("chickens.csv"), col_select = c(egg_yield = eggs_laid, everything()) ) # Column types -------------------------------------------------------------- # By default, readr guesses the column types, looking at `guess_max` rows.
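# If that guess is wrong, one remedy (an illustrative sketch; the
# `guess_max` values here are arbitrary) is to widen the guessing window.
# With `guess_max = 1` only the first row informs the guess, so `x` is
# read as a double and the later "b" becomes NA with a recorded problem;
# raising `guess_max` lets readr see the "b" and guess character instead:
read_csv(I("x\n1\n2\nb"), guess_max = 1)
read_csv(I("x\n1\n2\nb"), guess_max = 3)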
# You can override with a compact specification: read_csv(I("x,y\n1,2\n3,4"), col_types = "dc") # Or with a list of column types: read_csv(I("x,y\n1,2\n3,4"), col_types = list(col_double(), col_character())) # If there are parsing problems, you get a warning, and can extract # more details with problems() y <- read_csv(I("x\n1\n2\nb"), col_types = list(col_double())) y problems(y) # Column names -------------------------------------------------------------- # By default, readr duplicate name repair is noisy read_csv(I("x,x\n1,2\n3,4")) # Same default repair strategy, but quiet read_csv(I("x,x\n1,2\n3,4"), name_repair = "unique_quiet") # There's also a global option that controls verbosity of name repair withr::with_options( list(rlib_name_repair_verbosity = "quiet"), read_csv(I("x,x\n1,2\n3,4")) ) # Or use "minimal" to turn off name repair read_csv(I("x,x\n1,2\n3,4"), name_repair = "minimal") # File types ---------------------------------------------------------------- read_csv(I("a,b\n1.0,2.0")) read_csv2(I("a;b\n1,0;2,0")) read_tsv(I("a\tb\n1.0\t2.0")) read_delim(I("a|b\n1.0|2.0"), delim = "|") } readr/man/col_skip.Rd0000644000176200001440000000114414174357220014221 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/collectors.R \name{col_skip} \alias{col_skip} \title{Skip a column} \usage{ col_skip() } \description{ Use this function to ignore a column when reading in a file. To skip all columns not otherwise specified, use \code{\link[=cols_only]{cols_only()}}. } \seealso{ Other parsers: \code{\link{cols_condense}()}, \code{\link{cols}()}, \code{\link{parse_datetime}()}, \code{\link{parse_factor}()}, \code{\link{parse_guess}()}, \code{\link{parse_logical}()}, \code{\link{parse_number}()}, \code{\link{parse_vector}()} } \concept{parsers} readr/man/as.col_spec.Rd0000644000176200001440000000056514152512262014610 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/col_types.R \name{as.col_spec} \alias{as.col_spec} \title{Generate a column specification} \usage{ as.col_spec(x) } \arguments{ \item{x}{Input object} } \description{ This is most useful for generating a specification using the short form } \examples{ as.col_spec("cccnnn") } \keyword{internal} readr/man/read_delim_chunked.Rd0000644000176200001440000001707414510343737016217 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/read_delim_chunked.R \name{read_delim_chunked} \alias{read_delim_chunked} \alias{read_csv_chunked} \alias{read_csv2_chunked} \alias{read_tsv_chunked} \title{Read a delimited file by chunks} \usage{ read_delim_chunked( file, callback, delim = NULL, chunk_size = 10000, quote = "\\"", escape_backslash = FALSE, escape_double = TRUE, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = FALSE, skip = 0, guess_max = chunk_size, progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE ) read_csv_chunked( file, callback, chunk_size = 10000, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, guess_max = chunk_size, progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE ) read_csv2_chunked( file, callback, chunk_size = 10000, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", 
trim_ws = TRUE, skip = 0, guess_max = chunk_size, progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE ) read_tsv_chunked( file, callback, chunk_size = 10000, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip = 0, guess_max = chunk_size, progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{callback}{A callback function to call on each chunk} \item{delim}{Single character used to separate fields within a record.} \item{chunk_size}{The number of rows to include in each chunk} \item{quote}{Single character used to quote strings.} \item{escape_backslash}{Does the file use backslashes to escape special characters? This is more general than \code{escape_double} as backslashes can be used to escape the delimiter character, the quote character, or to add special characters like \verb{\\\\n}.} \item{escape_double}{Does the file escape quotes by doubling them? i.e. If this option is \code{TRUE}, the value \verb{""""} represents a single quote, \verb{\\"}.} \item{col_names}{Either \code{TRUE}, \code{FALSE} or a character vector of column names. If \code{TRUE}, the first row of the input will be used as the column names, and will not be included in the data frame. If \code{FALSE}, column names will be generated automatically: X1, X2, X3 etc. If \code{col_names} is a character vector, the values will be used as the names of the columns, and the first row of the input will be read into the first row of the output data frame. Missing (\code{NA}) column names will generate a warning, and be filled in with dummy names \code{...1}, \code{...2} etc. Duplicate column names will generate a warning and be made unique, see \code{name_repair} to control how this is done.} \item{col_types}{One of \code{NULL}, a \code{\link[=cols]{cols()}} specification, or a string. See \code{vignette("readr")} for more details. If \code{NULL}, all column types will be inferred from \code{guess_max} rows of the input, interspersed throughout the file. This is convenient (and fast), but not robust. If the guessed types are wrong, you'll need to increase \code{guess_max} or supply the correct types yourself. Column specifications created by \code{\link[=list]{list()}} or \code{\link[=cols]{cols()}} must contain one column specification for each column. If you only want to read a subset of the columns, use \code{\link[=cols_only]{cols_only()}}. Alternatively, you can use a compact string representation where each character represents one column: \itemize{ \item c = character \item i = integer \item n = number \item d = double \item l = logical \item f = factor \item D = date \item T = date time \item t = time \item ? 
= guess \item _ or - = skip } By default, reading a file without a column specification will print a message showing what \code{readr} guessed they were. To remove this message, set \code{show_col_types = FALSE} or set \code{options(readr.show_col_types = FALSE)}.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values inside quotes be treated as missing values (the default) or strings. This parameter is soft deprecated as of readr 2.0.0.} \item{comment}{A string used to identify comments. Any text after the comment characters will be silently ignored.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{skip}{Number of lines to skip before reading data. If \code{comment} is supplied any commented lines are ignored \emph{after} skipping.} \item{guess_max}{Maximum number of lines to use for guessing column types. Will never use more than the number of lines read. See \code{vignette("column-types", package = "readr")} for more details.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{show_col_types}{If \code{FALSE}, do not show the guessed column types. If \code{TRUE} always show the column types, even if they are supplied. If \code{NULL} (the default) only show the column types if they are not explicitly supplied by the \code{col_types} argument.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} } \description{ Read a delimited file by chunks } \details{ The number of lines in \code{file} can exceed the maximum integer value in R (~2 billion). 
} \examples{ # Cars with 3 gears f <- function(x, pos) subset(x, gear == 3) read_csv_chunked(readr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) } \seealso{ Other chunked: \code{\link{callback}}, \code{\link{melt_delim_chunked}()}, \code{\link{read_lines_chunked}()} } \concept{chunked} \keyword{internal} readr/man/read_lines.Rd0000644000176200001440000001155414315646511014532 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/lines.R \name{read_lines} \alias{read_lines} \alias{read_lines_raw} \alias{write_lines} \title{Read/write lines to/from a file} \usage{ read_lines( file, skip = 0, skip_empty_rows = FALSE, n_max = Inf, locale = default_locale(), na = character(), lazy = should_read_lazy(), num_threads = readr_threads(), progress = show_progress() ) read_lines_raw( file, skip = 0, n_max = -1L, num_threads = readr_threads(), progress = show_progress() ) write_lines( x, file, sep = "\\n", na = "NA", append = FALSE, num_threads = readr_threads(), path = deprecated() ) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{skip}{Number of lines to skip before reading data.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} \item{n_max}{Number of lines to read. If \code{n_max} is -1, all lines in file will be read.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{lazy}{Read values lazily? By default, this is \code{FALSE}, because there are special considerations when reading a file lazily that have tripped up some users. Specifically, things get tricky when reading and then writing back into the same file. But, in general, lazy reading (\code{lazy = TRUE}) has many benefits, especially for interactive use and when your downstream work only involves a subset of the rows or columns. Learn more in \code{\link[=should_read_lazy]{should_read_lazy()}} and in the documentation for the \code{altrep} argument of \code{\link[vroom:vroom]{vroom::vroom()}}.} \item{num_threads}{The number of processing threads to use for initial parsing and lazy reading of data. If your data contains newlines within fields the parser should automatically detect this and fall back to using one thread only. 
However if you know your file has newlines within quoted fields it is safest to set \code{num_threads = 1} explicitly.} \item{progress}{Display a progress bar? By default it will only display in an interactive session and not while knitting a document. The automatic progress bar can be disabled by setting option \code{readr.show_progress} to \code{FALSE}.} \item{x}{A character vector or list of raw vectors to write to disk.} \item{sep}{The line separator. Defaults to \verb{\\\\n}, commonly used on POSIX systems like macOS and Linux. For native Windows (CRLF) separators use \verb{\\\\r\\\\n}.} \item{append}{If \code{FALSE}, will overwrite existing file. If \code{TRUE}, will append to existing file. In both cases, if the file does not exist a new file is created.} \item{path}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Use the \code{file} argument instead.} } \value{ \code{read_lines()}: A character vector with one element for each line. \code{read_lines_raw()}: A list containing a raw vector for each line. \code{write_lines()} returns \code{x}, invisibly. } \description{ \code{read_lines()} reads up to \code{n_max} lines from a file. New lines are not included in the output. \code{read_lines_raw()} produces a list of raw vectors, and is useful for handling data with unknown encoding. \code{write_lines()} takes a character vector or list of raw vectors, appending a new line after each entry. } \examples{ read_lines(file.path(R.home("doc"), "AUTHORS"), n_max = 10) read_lines_raw(file.path(R.home("doc"), "AUTHORS"), n_max = 10) tmp <- tempfile() write_lines(rownames(mtcars), tmp) read_lines(tmp, lazy = FALSE) read_file(tmp) # note trailing \n write_lines(airquality$Ozone, tmp, na = "-1") read_lines(tmp) } readr/man/Tokenizers.Rd0000644000176200001440000000560514304131171014553 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/tokenizer.R \name{Tokenizers} \alias{Tokenizers} \alias{tokenizer_delim} \alias{tokenizer_csv} \alias{tokenizer_tsv} \alias{tokenizer_line} \alias{tokenizer_log} \alias{tokenizer_fwf} \alias{tokenizer_ws} \title{Tokenizers.} \usage{ tokenizer_delim( delim, quote = "\\"", na = "NA", quoted_na = TRUE, comment = "", trim_ws = TRUE, escape_double = TRUE, escape_backslash = FALSE, skip_empty_rows = TRUE ) tokenizer_csv( na = "NA", quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip_empty_rows = TRUE ) tokenizer_tsv( na = "NA", quoted_na = TRUE, quote = "\\"", comment = "", trim_ws = TRUE, skip_empty_rows = TRUE ) tokenizer_line(na = character(), skip_empty_rows = TRUE) tokenizer_log(trim_ws) tokenizer_fwf( begin, end, na = "NA", comment = "", trim_ws = TRUE, skip_empty_rows = TRUE ) tokenizer_ws(na = "NA", comment = "", skip_empty_rows = TRUE) } \arguments{ \item{delim}{Single character used to separate fields within a record.} \item{quote}{Single character used to quote strings.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{quoted_na}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Should missing values inside quotes be treated as missing values (the default) or strings. This parameter is soft deprecated as of readr 2.0.0.} \item{comment}{A string used to identify comments.
Any text after the comment characters will be silently ignored.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} \item{escape_double}{Does the file escape quotes by doubling them? i.e. If this option is \code{TRUE}, the value \verb{""""} represents a single quote, \verb{\\"}.} \item{escape_backslash}{Does the file use backslashes to escape special characters? This is more general than \code{escape_double} as backslashes can be used to escape the delimiter character, the quote character, or to add special characters like \verb{\\\\n}.} \item{skip_empty_rows}{Should blank rows be ignored altogether? i.e. If this option is \code{TRUE} then blank rows will not be represented at all. If it is \code{FALSE} then they will be represented by \code{NA} values in all the columns.} \item{begin, end}{Begin and end offsets for each file. These are C++ offsets so the first column is column zero, and the ranges are [begin, end) (i.e. inclusive-exclusive).} } \description{ Explicitly create tokenizer objects. Usually you will not call these functions, but will instead use one of the user-friendly wrappers like \code{\link[=read_csv]{read_csv()}}. } \examples{ tokenizer_csv() } \keyword{internal} readr/man/should_read_lazy.Rd0000644000176200001440000000203214304131576015742 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R \name{should_read_lazy} \alias{should_read_lazy} \title{Determine whether to read a file lazily} \usage{ should_read_lazy() } \description{ This function consults the option \code{readr.read_lazy} to figure out whether to do lazy reading or not. If the option is unset, the default is \code{FALSE}, meaning readr will read files eagerly, not lazily. If you want to use this option to express a preference for lazy reading, do this: \if{html}{\out{
}}\preformatted{options(readr.read_lazy = TRUE) }\if{html}{\out{
}} Typically, one would use the option to control lazy reading at the session, file, or user level. The \code{lazy} argument of functions like \code{\link[=read_csv]{read_csv()}} can be used to control laziness in an individual call. } \seealso{ The blog post \href{https://www.tidyverse.org/blog/2021/11/readr-2-1-0-lazy/}{"Eager vs lazy reading in readr 2.1.0"} explains the benefits (and downsides) of lazy reading. } readr/man/read_file.Rd0000644000176200001440000000467414304131171014331 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/file.R \name{read_file} \alias{read_file} \alias{read_file_raw} \alias{write_file} \title{Read/write a complete file} \usage{ read_file(file, locale = default_locale()) read_file_raw(file) write_file(x, file, append = FALSE, path = deprecated()) } \arguments{ \item{file}{Either a path to a file, a connection, or literal data (either a single string or a raw vector). Files ending in \code{.gz}, \code{.bz2}, \code{.xz}, or \code{.zip} will be automatically uncompressed. Files starting with \verb{http://}, \verb{https://}, \verb{ftp://}, or \verb{ftps://} will be automatically downloaded. Remote gz files can also be automatically downloaded and decompressed. Literal data is most useful for examples and tests. To be recognised as literal data, the input must be either wrapped with \code{I()}, be a string containing at least one new line, or be a vector containing at least one string with a new line. Using a value of \code{\link[=clipboard]{clipboard()}} will read from the system clipboard.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{x}{A single string, or a raw vector to write to disk.} \item{append}{If \code{FALSE}, will overwrite existing file. If \code{TRUE}, will append to existing file. In both cases, if the file does not exist a new file is created.} \item{path}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} Use the \code{file} argument instead.} } \value{ \code{read_file}: A length 1 character vector. \code{read_file_raw}: A raw vector. } \description{ \code{read_file()} reads a complete file into a single object: either a character vector of length one, or a raw vector. \code{write_file()} takes a single string, or a raw vector, and writes it exactly as is. Raw vectors are useful when dealing with binary data, or if you have text data with unknown encoding.
} \examples{ read_file(file.path(R.home("doc"), "AUTHORS")) read_file_raw(file.path(R.home("doc"), "AUTHORS")) tmp <- tempfile() x <- format_csv(mtcars[1:6, ]) write_file(x, tmp) identical(x, read_file(tmp)) read_lines(I(x)) } readr/man/parse_datetime.Rd0000644000176200001440000001527214174704674015414 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/collectors.R \name{parse_datetime} \alias{parse_datetime} \alias{parse_date} \alias{parse_time} \alias{col_datetime} \alias{col_date} \alias{col_time} \title{Parse date/times} \usage{ parse_datetime( x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE ) parse_date( x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE ) parse_time( x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE ) col_datetime(format = "") col_date(format = "") col_time(format = "") } \arguments{ \item{x}{A character vector of dates to parse.} \item{format}{A format specification, as described below. If set to "", date times are parsed as ISO8601, dates and times use the date and time formats specified in the \code{\link[=locale]{locale()}}. Unlike \code{\link[=strptime]{strptime()}}, the format specification must match the complete string.} \item{na}{Character vector of strings to interpret as missing values. Set this option to \code{character()} to indicate no missing values.} \item{locale}{The locale controls defaults that vary from place to place. The default locale is US-centric (like R), but you can use \code{\link[=locale]{locale()}} to create your own locale that controls things like the default time zone, encoding, decimal mark, big mark, and day/month names.} \item{trim_ws}{Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from each field before parsing it?} } \value{ A \code{\link[=POSIXct]{POSIXct()}} vector with \code{tzone} attribute set to \code{tz}. Elements that could not be parsed (or did not generate valid dates) will be set to \code{NA}, and a warning message will inform you of the total number of failures. } \description{ Parse date/times } \section{Format specification}{ \code{readr} uses a format specification similar to \code{\link[=strptime]{strptime()}}. There are three types of element: \enumerate{ \item Date components are specified with "\%" followed by a letter. For example "\%Y" matches a 4 digit year, "\%m" matches a 2 digit month and "\%d" matches a 2 digit day. Month and day default to \code{1}, (i.e. Jan 1st) if not present, for example if only a year is given. \item Whitespace is any sequence of zero or more whitespace characters. \item Any other character is matched exactly. } \code{parse_datetime()} recognises the following format specifications: \itemize{ \item Year: "\%Y" (4 digits). "\%y" (2 digits); 00-69 -> 2000-2069, 70-99 -> 1970-1999. \item Month: "\%m" (2 digits), "\%b" (abbreviated name in current locale), "\%B" (full name in current locale). \item Day: "\%d" (2 digits), "\%e" (optional leading space), "\%a" (abbreviated name in current locale). \item Hour: "\%H" or "\%I" or "\%h", use I (and not H) with AM/PM, use h (and not H) if your times represent durations longer than one day. \item Minutes: "\%M" \item Seconds: "\%S" (integer seconds), "\%OS" (partial seconds) \item Time zone: "\%Z" (as name, e.g. "America/Chicago"), "\%z" (as offset from UTC, e.g. "+0800") \item AM/PM indicator: "\%p". \item Non-digits: "\%."
skips one non-digit character, "\%+" skips one or more non-digit characters, "\%*" skips any number of non-digits characters. \item Automatic parsers: "\%AD" parses with a flexible YMD parser, "\%AT" parses with a flexible HMS parser. \item Time since the Unix epoch: "\%s" decimal seconds since the Unix epoch. \item Shortcuts: "\%D" = "\%m/\%d/\%y", "\%F" = "\%Y-\%m-\%d", "\%R" = "\%H:\%M", "\%T" = "\%H:\%M:\%S", "\%x" = "\%y/\%m/\%d". } } \section{ISO8601 support}{ Currently, readr does not support all of ISO8601. Missing features: \itemize{ \item Week & weekday specifications, e.g. "2013-W05", "2013-W05-10". \item Ordinal dates, e.g. "2013-095". \item Using commas instead of a period for decimal separator. } The parser is also a little laxer than ISO8601: \itemize{ \item Dates and times can be separated with a space, not just T. \item Mostly correct specifications like "2009-05-19 14:" and "200912-01" work. } } \examples{ # Format strings -------------------------------------------------------- parse_datetime("01/02/2010", "\%d/\%m/\%Y") parse_datetime("01/02/2010", "\%m/\%d/\%Y") # Handle any separator parse_datetime("01/02/2010", "\%m\%.\%d\%.\%Y") # Dates look the same, but internally they use the number of days since # 1970-01-01 instead of the number of seconds. This avoids a whole lot # of troubles related to time zones, so use if you can. parse_date("01/02/2010", "\%d/\%m/\%Y") parse_date("01/02/2010", "\%m/\%d/\%Y") # You can parse timezones from strings (as listed in OlsonNames()) parse_datetime("2010/01/01 12:00 US/Central", "\%Y/\%m/\%d \%H:\%M \%Z") # Or from offsets parse_datetime("2010/01/01 12:00 -0600", "\%Y/\%m/\%d \%H:\%M \%z") # Use the locale parameter to control the default time zone # (but note UTC is considerably faster than other options) parse_datetime("2010/01/01 12:00", "\%Y/\%m/\%d \%H:\%M", locale = locale(tz = "US/Central") ) parse_datetime("2010/01/01 12:00", "\%Y/\%m/\%d \%H:\%M", locale = locale(tz = "US/Eastern") ) # Unlike strptime, the format specification must match the complete # string (ignoring leading and trailing whitespace). 
This avoids common # errors: strptime("01/02/2010", "\%d/\%m/\%y") parse_datetime("01/02/2010", "\%d/\%m/\%y") # Failures ------------------------------------------------------------- parse_datetime("01/01/2010", "\%d/\%m/\%Y") parse_datetime(c("01/ab/2010", "32/01/2010"), "\%d/\%m/\%Y") # Locales -------------------------------------------------------------- # By default, readr expects English date/times, but that's easy to change parse_datetime("1 janvier 2015", "\%d \%B \%Y", locale = locale("fr")) parse_datetime("1 enero 2015", "\%d \%B \%Y", locale = locale("es")) # ISO8601 -------------------------------------------------------------- # With separators parse_datetime("1979-10-14") parse_datetime("1979-10-14T10") parse_datetime("1979-10-14T10:11") parse_datetime("1979-10-14T10:11:12") parse_datetime("1979-10-14T10:11:12.12345") # Without separators parse_datetime("19791014") parse_datetime("19791014T101112") # Time zones us_central <- locale(tz = "US/Central") parse_datetime("1979-10-14T1010", locale = us_central) parse_datetime("1979-10-14T1010-0500", locale = us_central) parse_datetime("1979-10-14T1010Z", locale = us_central) # Your current time zone parse_datetime("1979-10-14T1010", locale = locale(tz = "")) } \seealso{ Other parsers: \code{\link{col_skip}()}, \code{\link{cols_condense}()}, \code{\link{cols}()}, \code{\link{parse_factor}()}, \code{\link{parse_guess}()}, \code{\link{parse_logical}()}, \code{\link{parse_number}()}, \code{\link{parse_vector}()} } \concept{parsers} readr/man/readr_threads.Rd0000644000176200001440000000073414174704674015232 0ustar liggesusers% Generated by roxygen2: do not edit by hand % Please edit documentation in R/utils.R \name{readr_threads} \alias{readr_threads} \title{Determine how many threads readr should use when processing} \usage{ readr_threads() } \description{ The number of threads returned can be set by \itemize{ \item The global option \code{readr.num_threads} \item The environment variable \code{VROOM_THREADS} \item The value of \code{\link[parallel:detectCores]{parallel::detectCores()}} } } readr/DESCRIPTION0000644000176200001440000000431514547623042013067 0ustar liggesusersPackage: readr Title: Read Rectangular Text Data Version: 2.1.5 Authors@R: c( person("Hadley", "Wickham", , "hadley@posit.co", role = "aut"), person("Jim", "Hester", role = "aut"), person("Romain", "Francois", role = "ctb"), person("Jennifer", "Bryan", , "jenny@posit.co", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-6983-2759")), person("Shelby", "Bearrows", role = "ctb"), person("Posit Software, PBC", role = c("cph", "fnd")), person("https://github.com/mandreyel/", role = "cph", comment = "mio library"), person("Jukka", "Jylänki", role = c("ctb", "cph"), comment = "grisu3 implementation"), person("Mikkel", "Jørgensen", role = c("ctb", "cph"), comment = "grisu3 implementation") ) Description: The goal of 'readr' is to provide a fast and friendly way to read rectangular data (like 'csv', 'tsv', and 'fwf'). It is designed to flexibly parse many types of data found in the wild, while still cleanly failing when data unexpectedly changes.
License: MIT + file LICENSE URL: https://readr.tidyverse.org, https://github.com/tidyverse/readr BugReports: https://github.com/tidyverse/readr/issues Depends: R (>= 3.6) Imports: cli (>= 3.2.0), clipr, crayon, hms (>= 0.4.1), lifecycle (>= 0.2.0), methods, R6, rlang, tibble, utils, vroom (>= 1.6.0) Suggests: covr, curl, datasets, knitr, rmarkdown, spelling, stringi, testthat (>= 3.2.0), tzdb (>= 0.1.1), waldo, withr, xml2 LinkingTo: cpp11, tzdb (>= 0.1.1) VignetteBuilder: knitr Config/Needs/website: tidyverse, tidyverse/tidytemplate Config/testthat/edition: 3 Config/testthat/parallel: false Encoding: UTF-8 Language: en-US RoxygenNote: 7.2.3 NeedsCompilation: yes Packaged: 2024-01-10 21:03:49 UTC; jenny Author: Hadley Wickham [aut], Jim Hester [aut], Romain Francois [ctb], Jennifer Bryan [aut, cre] (<https://orcid.org/0000-0002-6983-2759>), Shelby Bearrows [ctb], Posit Software, PBC [cph, fnd], https://github.com/mandreyel/ [cph] (mio library), Jukka Jylänki [ctb, cph] (grisu3 implementation), Mikkel Jørgensen [ctb, cph] (grisu3 implementation) Maintainer: Jennifer Bryan <jenny@posit.co> Repository: CRAN Date/Publication: 2024-01-10 23:20:02 UTC readr/build/0000755000176200001440000000000014547603063012456 5ustar liggesusersreadr/build/vignette.rds0000644000176200001440000000040214547603063015011 0ustar liggesuserstest_that("melt_csv_chunked can be stopped early", { withr::local_options(lifecycle_verbosity = "quiet") file <- readr_example("mtcars.csv") get_dims_stop <- function(data, pos) { dims[[length(dims) + 1]] <<- dim(data) if (pos >= 5) { return(FALSE) } } dims <- list() melt_csv_chunked(file, get_dims_stop, chunk_size = 5) expect_true(length(dims) == 2) expect_true(all(vapply(dims[1:2], identical, logical(1), c(55L, 4L)))) }) test_that("DataFrameCallback works as intended", { withr::local_options(lifecycle_verbosity = "quiet") f <- readr_example("mtcars.csv") out0 <- subset(melt_csv(f), data_type == "integer") fun3 <- DataFrameCallback$new(function(x, pos) { subset(x, data_type == "integer") }) out1 <- melt_csv_chunked(f, fun3) out2 <- melt_csv_chunked(f, fun3, chunk_size = 1) out3 <- melt_csv_chunked(f, fun3, chunk_size = 10) expect_true(all.equal(out0, out1)) expect_true(all.equal(out0, out2)) expect_true(all.equal(out0, out3)) # No matching rows out0 <- subset(melt_csv(f), data_type == "integer") fun5 <- DataFrameCallback$new(function(x, pos) subset(x, data_type == "integer")) out1 <- melt_csv_chunked(f, fun5) # Need to set guess_max higher than 1 to guess correct column types out2 <- melt_csv_chunked(f, fun5, chunk_size = 1) out3 <- melt_csv_chunked(f, fun5, chunk_size = 10) expect_true(all.equal(out0, out1)) expect_true(all.equal(out0, out2)) expect_true(all.equal(out0, out3)) }) test_that("ListCallback works as intended", { withr::local_options(lifecycle_verbosity = "quiet") f <- readr_example("mtcars.csv") out0 <- melt_csv(f) fun <- ListCallback$new(function(x, pos) x[["value"]]) out1 <- melt_csv_chunked(f, fun, chunk_size = 10) expect_equal(out0[["value"]], unlist(out1)) }) readr/tests/testthat/test-source.R0000644000176200001440000000020614174357220016754 0ustar liggesuserstest_that("standardise_path works", { expect_error(standardise_path("https://foo/bar.bz2"), "compressed files is not supported") }) readr/tests/testthat/test-tokenizer-delim.R0000644000176200001440000000515014174704674020570 0ustar liggesusersparse_b <- function(x, ...) { tok <- tokenizer_delim(",", escape_double = FALSE, escape_backslash = TRUE, ...) tokenize(datasource_string(x, 0), tok) } parse_d <- function(x, ...) { tok <- tokenizer_delim(",", escape_double = TRUE, escape_backslash = FALSE, ...)
tokenize(datasource_string(x, 0), tok) } test_that("simple sequence parsed correctly", { expect_equal(parse_d("1,2,3"), list(c("1", "2", "3"))) }) test_that("newlines are not tokenised", { expect_equal(parse_d("1\n2"), list("1", "2")) }) test_that("quotes in strings are dropped", { expect_equal(parse_d('"abc",abc'), list(c("abc", "abc"))) expect_equal(parse_b('"abc",abc'), list(c("abc", "abc"))) expect_equal(parse_b("'abc',abc", quote = "'"), list(c("abc", "abc"))) expect_equal(parse_d("'abc',abc", quote = "'"), list(c("abc", "abc"))) }) test_that("problems if unterminated string", { p1 <- problems(parse_d('1,2,"3')) p2 <- problems(parse_b('1,2,"3')) expect_equal(p1$col, 3) expect_equal(p2$col, 3) expect_equal(p1$expected, "closing quote at end of file") expect_equal(p2$expected, "closing quote at end of file") }) test_that("problem if unterminated escape", { p <- problems(parse_b("1\\")) expect_equal(p$row, 1) expect_equal(p$col, 1) }) test_that("empty fields become empty strings", { expect_equal(parse_d(",\n,"), list(c("[EMPTY]", "[EMPTY]"), c("[EMPTY]", "[EMPTY]"))) expect_equal(parse_d(",\n,\n"), list(c("[EMPTY]", "[EMPTY]"), c("[EMPTY]", "[EMPTY]"))) expect_equal(parse_d('""'), list("[EMPTY]")) }) test_that("bare NA becomes missing value", { expect_equal(parse_b('NA,"NA"', quoted_na = FALSE), list(c("[MISSING]", "NA"))) expect_equal(parse_d('NA,"NA"', quoted_na = FALSE), list(c("[MISSING]", "NA"))) }) test_that("quoted NA also becomes missing value", { expect_equal(parse_b('NA,"NA"', quoted_na = TRUE), list(c("[MISSING]", "[MISSING]"))) expect_equal(parse_d('NA,"NA"', quoted_na = TRUE), list(c("[MISSING]", "[MISSING]"))) }) test_that("empty strings become missing values", { expect_equal(parse_b('NA,""', na = ""), list(c("NA", "[MISSING]"))) }) test_that("NA with spaces becomes missing value", { expect_equal(parse_b(" NA "), list(c("[MISSING]"))) }) test_that("string can be ended by new line", { expect_equal(parse_d('123,"a"\n'), list(c("123", "a"))) }) test_that("can escape delimiter with backslash", { expect_equal(parse_b("1\\,2"), list("1,2")) }) test_that("doubled quote becomes single quote (with d-escaping)", { expect_equal(parse_d('""""'), list('"')) }) test_that("escaped quote doesn't terminate string (with b-escaping)", { expect_equal(parse_b('"\\""'), list('"')) }) readr/tests/testthat/empty-file0000644000176200001440000000000014152512262016335 0ustar liggesusersreadr/tests/testthat/null-file0000644000176200001440000000002214152512262016155 0ustar liggesusersa,b,c 1,2, 3,4,5 readr/tests/testthat/test-write.R0000644000176200001440000002022714361335003016604 0ustar liggesuserstest_that("strings are only quoted if needed", { x <- c("a", ",") csv <- format_delim(data.frame(x), delim = ",", col_names = FALSE) expect_equal(csv, 'a\n\",\"\n') ssv <- format_delim(data.frame(x), delim = " ", col_names = FALSE) expect_equal(ssv, "a\n,\n") }) test_that("a literal NA is quoted", { skip_if_edition_second() expect_equal(format_csv(data.frame(x = "NA")), "x\n\"NA\"\n") }) test_that("na argument modifies how missing values are written", { df <- data.frame(x = c(NA, "a", "b"), y = c(1, 2, NA)) expect_equal(format_csv(df, na = "None"), "x,y\nNone,1\na,2\nb,None\n") }) test_that("read_delim/csv/tsv and write_delim round trip special chars", { x <- c("a", '"', ",", "\n", "at\t") output <- data.frame(x) input <- read_delim(I(format_delim(output, delim = " ")), delim = " ", trim_ws = FALSE) input_csv <- read_csv(I(format_delim(output, delim = ",")), trim_ws = FALSE) input_tsv <-
read_tsv(I(format_delim(output, delim = "\t")), trim_ws = FALSE) expect_equal(input$x, input_csv$x) expect_equal(input_tsv$x, x) }) test_that("special floating point values translated to text", { df <- data.frame(x = c(NaN, NA, Inf, -Inf)) expect_equal(format_csv(df), "x\nNA\nNA\nInf\n-Inf\n") }) test_that("logical values give long names", { df <- data.frame(x = c(NA, FALSE, TRUE)) expect_equal(format_csv(df), "x\nNA\nFALSE\nTRUE\n") }) test_that("roundtrip preserves floating point numbers", { input <- data.frame(x = runif(100)) output <- read_delim(I(format_delim(input, delim = " ")), delim = " ") expect_equal(input$x, output$x) }) test_that("roundtrip preserves dates and datetimes", { x <- as.Date("2010-01-01") + 1:10 y <- as.POSIXct(x) attr(y, "tzone") <- "UTC" input <- data.frame(x, y) output <- read_delim(I(format_delim(input, delim = ",")), delim = ",") expect_equal(output$x, x) expect_equal(output$y, y) }) test_that("fails to create file in non-existent directory", { expect_error( expect_warning( write_csv(mtcars, file.path(tempdir(), "/x/y")), "open" ) ) }) test_that("write_excel_csv/csv2 includes a byte order mark", { skip_if_edition_first() tmp <- tempfile() on.exit(unlink(tmp)) tmp2 <- tempfile() on.exit(unlink(tmp2)) write_excel_csv(mtcars, tmp) write_excel_csv2(mtcars, tmp2) output <- readBin(tmp, "raw", file.info(tmp)$size) output2 <- readBin(tmp2, "raw", file.info(tmp2)$size) # BOM is there expect_equal(output[1:3], charToRaw("\xEF\xBB\xBF")) expect_equal(output2[1:3], charToRaw("\xEF\xBB\xBF")) # Rest of file also there expect_equal(output[4:8], charToRaw('"mpg"')) expect_equal(output2[4:8], charToRaw('"mpg"')) }) test_that("write_excel_csv/csv2 includes a byte order mark, but not when appending", { skip_if_edition_first() tmp <- tempfile() on.exit(unlink(tmp)) write_excel_csv(data.frame(a = 1), tmp) write_excel_csv(data.frame(a = 2), tmp, append = TRUE) output <- readBin(tmp, "raw", file.info(tmp)$size) expect_equal(output[1:3], charToRaw("\xEF\xBB\xBF")) # But not in the rest of the file expect_equal(output[-1:-3], charToRaw('"a"\n1\n2\n')) }) test_that("does not write a trailing .0 for whole number doubles", { expect_equal(format_tsv(tibble::tibble(x = 1)), "x\n1\n") expect_equal(format_tsv(tibble::tibble(x = 0)), "x\n0\n") expect_equal(format_tsv(tibble::tibble(x = -1)), "x\n-1\n") expect_equal(format_tsv(tibble::tibble(x = 999)), "x\n999\n") expect_equal(format_tsv(tibble::tibble(x = -999)), "x\n-999\n") expect_equal(format_tsv(tibble::tibble(x = 123456789)), "x\n123456789\n") expect_equal(format_tsv(tibble::tibble(x = -123456789)), "x\n-123456789\n") }) test_that("write_csv can write to compressed files", { mt <- read_csv(readr_example("mtcars.csv.bz2")) filename <- file.path(tempdir(), "mtcars.csv.bz2") on.exit(unlink(filename)) write_csv(mt, filename) expect_true(is_bz2_file(filename)) expect_equal(mt, read_csv(filename)) }) test_that("write_csv writes large integers without scientific notation #671", { x <- data.frame(a = c(60150001022000, 60150001022001)) filename <- file.path(tempdir(), "test_large_integers.csv") on.exit(unlink(filename)) write_csv(x, filename) content <- read_file(filename) expect_equal(content, "a\n60150001022000\n60150001022001\n") }) test_that("write_csv writes large integers without scientific notation up to 1E15 #671", { x <- data.frame(a = c(1E13, 1E14, 1E15, 1E16)) filename <- file.path(tempdir(), "test_large_integers2.csv") on.exit(unlink(filename)) write_csv(x, filename) content <- read_file(filename) expect_equal(content,
"a\n10000000000000\n100000000000000\n1e15\n1e16\n") x_exp <- read_csv(filename, col_types = "d") expect_equal(x$a, x_exp$a) }) test_that("write_csv2 and format_csv2 writes ; sep and , decimal mark", { df <- tibble::tibble(x = c(0.5, 2, 1.2), y = c("a", "b", "c")) expect_equal(format_csv2(df), "x;y\n0,5;a\n2,0;b\n1,2;c\n") filename <- tempfile(pattern = "readr", fileext = ".csv") on.exit(unlink(filename)) write_csv2(df, filename) expect_equal(c(df), suppressMessages(c(read_csv2(filename)))) }) test_that("write_csv2 and format_csv2 writes NA appropriately", { df <- tibble::tibble(x = c(0.5, NA, 1.2), y = c("a", "b", NA)) expect_equal(format_csv2(df), "x;y\n0,5;a\nNA;b\n1,2;NA\n") }) test_that("write_csv2 and format_csv2 produce no leading whitespace for numbers", { df <- tibble::tibble(x = c(6, 66)) expect_equal(format_csv2(df), "x\n6\n66\n") }) test_that("write_csv2 and format_csv2 use same precision as write.csv2 (#1087)", { tmp <- tempfile() on.exit(unlink(tmp), add = TRUE) df <- tibble::tibble(x = c(1234567.1), y = 5) con <- file(tmp, "wb") write.csv2(df, con, row.names = FALSE, quote = FALSE, eol = "\n") close(con) expect_equal(format_csv2(df), "x;y\n1234567,1;5\n") expect_equal(format_csv2(df), read_file(tmp)) }) test_that("Can change the escape behavior for quotes", { df <- data.frame(x = c("a", '"', ",", "\n")) expect_error(format_delim(df, "\t", escape = "invalid"), "should be one of") expect_equal(format_delim(df, "\t"), "x\na\n\"\"\"\"\n,\n\"\n\"\n") expect_equal(format_delim(df, "\t", escape = "double"), "x\na\n\"\"\"\"\n,\n\"\n\"\n") expect_equal(format_delim(df, "\t", escape = "backslash"), "x\na\n\"\\\"\"\n,\n\"\n\"\n") expect_equal(format_delim(df, "\t", escape = "none"), "x\na\n\"\"\"\n,\n\"\n\"\n") }) test_that("hms NAs are written without padding (#930)", { df <- data.frame(x = hms::as_hms(c(NA, 34.234))) expect_equal(format_tsv(df), "x\nNA\n00:00:34.234\n") }) test_that("Error when writing list columns or matrix columns", { df <- data.frame(x = LETTERS[1:4], y = I(list(1, "foo", 2:9, iris)), z = I(matrix(1:16, nrow = 4))) expect_error( write_csv(df, tempfile()), "`x` must not contain list or matrix columns" ) df2 <- data.frame(x = LETTERS[1:4], y = I(matrix(1:40, nrow = 4))) expect_error( write_csv(df2, tempfile()), "`x` must not contain list or matrix columns" ) }) test_that("duplicate columns data is duplicated (#1169)", { df <- tibble::tibble(x = 1, x2 = 2) names(df) <- c("x", "x") f <- tempfile() on.exit(unlink(f)) write_csv(df, f) expect_equal(readLines(f), c("x,x", "1,2")) }) test_that("write_ family of functions return input data frame without changes", { tmp <- tempfile() on.exit(unlink(tmp)) time_strings <- c("2019-11-14 15:44:00", "2019-11-14 15:47:00") times <- as.POSIXlt(time_strings, tz = "America/Los_Angeles") df <- data.frame(time = times, class = factor(c("a", "b"))) df_delim <- write_delim(df, tmp) expect_identical(df, df_delim) df_csv <- write_csv(df, tmp) expect_identical(df, df_csv) df_csv2 <- write_csv2(df, tmp) expect_identical(df, df_csv2) df_excel_csv <- write_excel_csv(df, tmp) expect_identical(df, df_excel_csv) df_excel_csv2 <- write_excel_csv2(df, tmp) expect_identical(df, df_excel_csv2) df_tsv <- write_tsv(df, tmp) expect_identical(df, df_tsv) }) test_that("write_*() supports writing with windows newlines", { tmp <- tempfile() on.exit(unlink(tmp)) write_delim(data.frame(x = 1:3), tmp, eol = "\r\n") expect_identical( readBin(tmp, file.info(tmp)$size, what = "raw"), charToRaw("x\r\n1\r\n2\r\n3\r\n") ) }) 
readr/tests/testthat/test-parsing-factors.R0000644000176200001440000000715414174704674020600 0ustar liggesuserstest_that("strings mapped to levels", { x <- parse_factor(c("a", "b"), levels = c("a", "b")) expect_equal(x, factor(c("a", "b"))) }) test_that("can generate ordered factor", { x <- parse_factor(c("a", "b"), levels = c("a", "b"), ordered = TRUE) expect_equal(x, ordered(c("a", "b"))) }) test_that("warning if value not in levels", { expect_warning(x <- parse_factor(c("a", "b", "c"), levels = c("a", "b"))) expect_equal(n_problems(x), 1) expect_equal(is.na(x), c(FALSE, FALSE, TRUE)) }) test_that("NAs silently passed along", { x <- parse_factor(c("a", "b", "NA"), levels = c("a", "b"), include_na = FALSE) expect_equal(n_problems(x), 0) expect_equal(x, factor(c("a", "b", NA))) }) test_that("levels = NULL (497)", { x <- parse_factor(c("a", "b", "c", "b"), levels = NULL) expect_equal(n_problems(x), 0) expect_equal(x, factor(c("a", "b", "c", "b"))) }) test_that("levels = NULL orders by data", { x <- parse_factor(c("b", "a", "c", "b"), levels = NULL) expect_equal(levels(x), c("b", "a", "c")) }) test_that("levels = NULL default (#862)", { x <- c("a", "b", "c", "b") expect_equal(parse_factor(x), parse_factor(x, levels = NULL)) }) test_that("NAs included in levels if desired", { x <- parse_factor(c("NA", "b", "a"), levels = c("a", "b", NA)) expect_equal(x, factor(c(NA, "b", "a"), levels = c("a", "b", NA), exclude = NULL)) x <- parse_factor(c("NA", "b", "a"), levels = c("a", "b"), include_na = TRUE) expect_equal(x, factor(c(NA, "b", "a"), levels = c("a", "b", NA), exclude = NULL)) x <- parse_factor(c("NA", "b", "a"), levels = c("a", "b"), include_na = FALSE) expect_equal(x, factor(c(NA, "b", "a"))) x <- parse_factor(c("NA", "b", "a"), levels = NULL, include_na = FALSE) expect_equal(x, factor(c(NA, "b", "a"), levels = c("b", "a"))) x <- parse_factor(c("NA", "b", "a"), levels = NULL, include_na = TRUE) expect_equal(x, factor(c(NA, "b", "a"), levels = c(NA, "b", "a"), exclude = NULL)) }) test_that("Factors handle encodings properly (#615)", { f <- tempfile() on.exit(unlink(f)) writeBin(charToRaw(encoded("test\nA\n\xC4\n", "latin1")), f) x <- read_csv(f, col_types = cols(col_factor(c("A", "\uC4"))), locale = locale(encoding = "latin1") ) expect_s3_class(x$test, "factor") expect_equal(x$test, factor(c("A", "\uC4"))) }) test_that("factors parse like factor if trim_ws = FALSE (735)", { expect_warning( regexp = "1 parsing failure", expect_equal( as.integer(parse_factor(c("a", "a "), levels = c("a"), trim_ws = FALSE)), as.integer(factor(c("a", "a "), levels = c("a"))) ) ) expect_warning( regexp = "1 parsing failure", expect_equal( as.integer(parse_factor(c("a", "a "), levels = c("a "), trim_ws = FALSE)), as.integer(factor(c("a", "a "), levels = c("a "))) ) ) expect_equal( as.integer(parse_factor(c("a", "a "), levels = c("a", "a "), trim_ws = FALSE)), as.integer(factor(c("a", "a "), levels = c("a", "a "))) ) expect_equal( as.integer(parse_factor(c("a", "a "), levels = c("a ", "a"), trim_ws = FALSE)), as.integer(factor(c("a", "a "), levels = c("a ", "a"))) ) }) test_that("Can parse a factor with levels of NA and empty string", { x <- c( "", "NC", "NC", "NC", "", "", "NB", "NA", "", "", "NB", "NA", "NA", "NC", "NB", "NB", "NC", "NB", "NA", "NA" ) expect_equal( as.integer(parse_factor(x, levels = c("NA", "NB", "NC", ""), na = character())), as.integer(factor(x, levels = c("NA", "NB", "NC", ""))) ) }) test_that("factor levels must be null or a character vector (#1140)", { expect_error(col_factor(levels = 
1:10), "must be `NULL` or a character vector") }) readr/tests/testthat/test-parsing-count-fields.R0000644000176200001440000000070314174357220021513 0ustar liggesuserstest_that("counts correct number of fields based on supplied tokenizer", { string <- "1,a,NA\n2,b,NA\n" res_csv <- count_fields(string, tokenizer_csv()) res_tsv <- count_fields(string, tokenizer_tsv()) expect_equal(res_csv, c(3, 3)) expect_equal(res_tsv, c(1, 1)) }) test_that("maximum lines counted is respected", { string <- "1,a,NA\n2,b,NA\n" res_csv <- count_fields(string, tokenizer_csv(), n_max = 1) expect_equal(res_csv, 3) }) readr/tests/testthat/eol-cr.txt.gz0000644000176200001440000000005714152512262016714 0ustar liggesusersWUeol-cr.txtP2TH2RH2VHkxygreadr/tests/testthat/test-read-lines.R0000644000176200001440000000566614174704674017527 0ustar liggesuserstest_that("read_lines respects encoding", { skip_on_os("solaris") x <- read_lines("enc-iso-8859-1.txt", locale = locale(encoding = "ISO-8859-1")) expect_equal(x, c("fran\u00e7ais", "\u00e9l\u00e8ve")) }) test_that("read_lines returns an empty character vector on an empty file", { expect_equal(read_lines("empty-file"), character()) }) test_that("read_lines handles embedded nuls", { skip_if_edition_first() expect_warning(res <- read_lines(test_path("null-file"), lazy = FALSE)) expect_equal(res, c("a,b,c", "1,2,", "3,4,5")) }) test_that("read_lines uses na argument", { expect_equal(read_lines(I("abc\n123")), c("abc", "123")) expect_equal(read_lines(I("abc\n123"), na = "abc"), c(NA_character_, "123")) expect_equal(read_lines(I("abc\n123"), na = "123"), c("abc", NA_character_)) expect_equal(read_lines(I("abc\n123"), na = c("abc", "123")), c(NA_character_, NA_character_)) }) test_that("blank lines are passed unchanged", { expect_equal(read_lines(I("abc\n\n123")), c("abc", "", "123")) expect_equal(read_lines(I("abc\n\n123"), na = ""), c("abc", NA, "123")) }) test_that("read_lines can skip blank lines (#923)", { x <- I("1 2 3 foo bar baz ") expect_equal(read_lines(x), c("1", "2", "3", "", "foo", "bar", "baz")) expect_equal(read_lines(x, skip_empty_rows = TRUE), c("1", "2", "3", "foo", "bar", "baz")) expect_equal(read_lines(x, skip = 1), c("2", "3", "", "foo", "bar", "baz")) expect_equal(read_lines(x, skip = 2), c("3", "", "foo", "bar", "baz")) expect_equal(read_lines(x, skip = 3), c("", "foo", "bar", "baz")) expect_equal(read_lines(x, skip = 4), c("foo", "bar", "baz")) expect_equal(read_lines(x, skip = 5), c("bar", "baz")) expect_equal(read_lines(x, skip = 6), c("baz")) expect_equal(read_lines(x, skip = 7), character()) }) test_that("allocation works as expected", { tmp <- tempfile(fileext = ".gz") on.exit(unlink(tmp)) x <- rep(paste(rep("a", 2^10), collapse = ""), 2^11) writeLines(x, tmp) expect_equal(length(read_lines(tmp)), 2^11) }) test_that("read_lines(skip_empty_rows) works when blank lines are at the end of the file (#968)", { skip_on_os("windows") tmp <- tempfile() on.exit(unlink(tmp)) writeLines( con = tmp, "test " ) expect_equal(read_lines(tmp, skip_empty_rows = TRUE), "test") }) test_that("read_lines(skip_empty_rows) works if there are double quotes in the lines (#991)", { # TODO: turn on test skip_if_edition_second() data <- "a\"b cde f\"g hij" expect_equal( read_lines(data, skip = 1), c( "cde", "f\"g", "hij" ) ) }) # These tests are slow so are commented out # test_that("long vectors are supported", { # tmp <- tempfile(fileext = ".gz") # on.exit(unlink(tmp)) # x <- rep(paste(rep("a", 2 ^ 16), collapse = ''), 2 ^ 15) # con <- gzfile(tmp, open = "w", compression = 0) # 
writeLines(x, con) # close(con) # expect_equal(length(read_lines(tmp)), 2^15) # expect_equal(length(read_lines_raw(tmp)), 2^15) # }) readr/tests/testthat/eol-lf.csv0000644000176200001440000000003214152512262016237 0ustar liggesusers"x","y" 1,"a" 2,"b" 3,"c" readr/tests/testthat/test-parsing-numeric.R0000644000176200001440000001050014174704674020566 0ustar liggesuserses_MX <- locale("es", decimal_mark = ",") test_that("non-numeric integer/double matches fail", { expect_equal(n_problems(parse_double("d")), 1) expect_equal(n_problems(parse_integer("d")), 1) }) test_that("partial integer/double matches fail", { expect_equal(n_problems(parse_double("3d")), 1) expect_equal(n_problems(parse_integer("3d")), 1) }) test_that("parse functions converts NAs", { expect_equal(parse_double(c("1.5", "NA")), c(1.5, NA)) }) test_that("leading/trailing ws ignored when parsing", { expect_equal(parse_double(c(" 1.5", "1.5", "1.5 ")), rep(1.5, 3)) expect_equal(read_csv(I("x\n 1.5\n1.5\n1.5 \n"))$x, rep(1.5, 3)) }) test_that("lone - or decimal marks are not numbers", { expect_equal(guess_parser("-"), "character") expect_equal(guess_parser("."), "character") expect_equal(guess_parser(",", locale = es_MX), "character") expect_equal(n_problems(parse_number(c(".", "-"))), 2) }) test_that("Numbers with trailing characters are parsed as characters", { expect_equal(guess_parser("13T"), "character") expect_equal(guess_parser(c("13T", "13T", "10N")), "character") }) test_that("problems() returns the full failed string if parsing fails (548)", { skip_if_edition_first() probs <- problems(read_tsv("x\n1\nx", na = "", col_types = "n", lazy = FALSE)) expect_equal(probs$row, 3) expect_equal(probs$expected, "a number") expect_equal(probs$actual, "x") }) # Leading zeros ----------------------------------------------------------- test_that("leading zeros are not numbers", { expect_equal(guess_parser("0"), "double") expect_equal(guess_parser("0."), "double") expect_equal(guess_parser("0001"), "character") }) # Flexible number parsing ------------------------------------------------- test_that("col_number only takes first number", { expect_equal(parse_number("XYZ 123,000 BLAH 456"), 123000) }) test_that("col_number helps with currency", { expect_equal(parse_number("$1,000,000.00"), 1e6) expect_equal(parse_number("$1.000.000,00", locale = es_MX), 1e6) }) test_that("invalid numbers don't parse", { expect_warning(x <- parse_number(c("..", "--", "3.3.3", "4-1"))) expect_equal(n_problems(x), 2) expect_equal(c(x), c(NA, NA, 3.3, 4.0)) }) test_that("number not guess if leading/trailing", { expect_equal(guess_parser("X1"), "character") expect_equal(parse_number("X1"), 1) expect_equal(guess_parser("1X"), "character") expect_equal(parse_number("1X"), 1) }) # Decimal comma ----------------------------------------------------------- test_that("parse_vector passes along decimal_mark", { expect_equal(parse_double("1,5", locale = es_MX), 1.5) }) test_that("type_convert passes along decimal_mark", { df <- data.frame(x = "1,5", stringsAsFactors = FALSE) out <- type_convert(df, locale = es_MX) expect_equal(out$x, 1.5) }) test_that("read_tsv passes on decimal_mark", { out <- read_tsv(I("x\n1,5"), locale = es_MX) expect_equal(out$x, 1.5) }) # Negative numbers ----------------------------------------------------------- test_that("negative numbers return negative values", { expect_equal(parse_number("-2"), -2) expect_equal(parse_number("-100,000.00"), -100000) }) # Large numbers ------------------------------------------------------------- 
test_that("large numbers are parsed properly", { expect_equal(parse_double("100000000000000000000"), 1e20) expect_equal(parse_double("1267650600228229401496703205376"), 1.267650600228229401496703205376e+30) expect_equal(parse_double("100000000000000000000", locale = es_MX), 1e20) expect_equal(parse_double("1267650600228229401496703205376", locale = es_MX), 1.267650600228229401496703205376e+30) }) # Scientific Notation ------------------------------------------------------- test_that("scientific notation is parsed properly", { expect_equal(parse_number("1e20"), 1e20) expect_equal(parse_number("3e2"), 300) expect_equal(parse_number("1e+20"), 1e20) expect_equal(parse_number("3e+2"), 300) expect_equal(parse_number("3e0"), 3) expect_equal(parse_number("ignore17e4ignore"), 170000) expect_equal(parse_number("1.2345e4"), 12345) expect_equal(parse_number("-5.4e3"), -5400) expect_equal(parse_number("0E12"), 0) expect_equal(parse_number("17E-5"), 0.00017) expect_equal(parse_number("-17E-5"), -0.00017) expect_equal(parse_number("-17E-5-5"), -0.00017) expect_equal(parse_number("1.2E-3"), 0.0012) }) readr/tests/testthat/sample_text.txt0000644000176200001440000000000714152512262017434 0ustar liggesusersabc 123readr/tests/testthat/eol-crlf.txt0000644000176200001440000000002014152512262016605 0ustar liggesusersx y 1 a 2 b 3 c readr/tests/testthat/test-non-ascii-1152.rds0000644000176200001440000000511014174704674020321 0ustar liggesusers[binary RDS payload omitted; not representable as text]
readr/tests/testthat/test-type-convert.R0000644000176200001440000000422714174704674020127 0ustar liggesuserstest_that("missing values removed before guessing col type", { df1 <- data.frame(x = c("NA", "10"), stringsAsFactors = FALSE) df2 <- type_convert(df1) expect_equal(df2$x, c(NA, 10L)) }) test_that("requires data.frame input", { not_df <- matrix(letters[1:4], nrow = 2) expect_error(type_convert(not_df), "is.data.frame") }) test_that("col_types accepts character specifications", { df <- data.frame(x = 1:3, y = "3", z = "a", stringsAsFactors = FALSE) df_conv <- data.frame(x = 1:3, y = 3L, z = "a", stringsAsFactors = FALSE) expect_error(type_convert(df, col_types = "i"), "must have consistent lengths") expect_error(type_convert(df, col_types = c("i", "b")), "must be a single string") expect_equal(type_convert(df, col_types = "_ic"), df_conv) }) test_that("col_types accepts cols specifications", { df <- data.frame(x = 1:3, y = "3", z = "a", stringsAsFactors = FALSE) df_conv <- data.frame(x = 1:3, y = 3L, z = "a", stringsAsFactors = FALSE) expect_equal(type_convert(df, col_types = cols(y = "i")), df_conv) expect_equal(type_convert(df, col_types = cols(y = col_integer(), z = col_character())), df_conv) # non-character cols silently ignored expect_equal(type_convert(df, col_types = cols(x = "c", y = "i")), df_conv) }) test_that("spec attribute is removed", { df1 <- read_csv( readr_example("mtcars.csv"), col_types = cols(.default = col_character()) ) df2 <- type_convert(df1) # The spec attribute should exist initially (b/c it's set by `read_csv()`). expect_false(is.null(attr(df1, "spec"))) # The spec attribute should be cleared by `type_convert()`.
expect_null(attr(df2, "spec")) }) test_that("warning is thrown if there are no character columns (1020)", { expect_warning(type_convert(mtcars), "only converts columns") }) test_that("guess_integer is implemented", { df <- data.frame( a = c("a", "b", "c"), b = c("1", "0", "-12"), c = c("1", "0", ".00001"), stringsAsFactors = FALSE ) exp <- data.frame( a = c("a", "b", "c"), b = c(1L, 0L, -12L), c = c(1, 0, .00001), stringsAsFactors = FALSE ) expect_identical(type_convert(df, guess_integer = TRUE), exp) }) readr/tests/testthat/setup.R0000644000176200001440000000013514174704674015651 0ustar liggesuserspre_test_options <- options( readr.show_progress = FALSE, readr.show_col_types = FALSE ) readr/tests/testthat/test-col-spec.R0000644000176200001440000001742014304131171017155 0ustar liggesuserstest_that("supplied col names must match non-skipped col types", { out <- col_spec_standardise(col_types = "c_c", col_names = c("a", "c")) expect_equal(names(out[[1]]), c("a", "", "c")) }) test_that("supplied col names matches to non-skipped col types", { out <- col_spec_standardise("a,b,c\n1,2,3", col_types = "i_i") expect_equal(names(out[[1]]), c("a", "b", "c")) }) test_that("guess col names matches all col types", { out <- col_spec_standardise("a,b,c\n", col_types = "i_i") expect_equal(names(out[[1]]), c("a", "b", "c")) expect_equal(out[[1]][[2]], col_skip()) }) test_that("col_names expanded to col_types with dummy names", { expect_warning( out <- col_spec_standardise("1,2,3,4\n", c("a", "b"), "iiii"), "Insufficient `col_names`" ) expect_equal(names(out[[1]]), c("a", "b", "X3", "X4")) }) test_that("col_names expanded to match col_types, with skipping", { expect_warning( out <- col_spec_standardise(col_types = "c_c", col_names = "a"), "Insufficient `col_names`" ) expect_equal(names(out[[1]]), c("a", "", "X2")) }) test_that("col_types expanded to col_names by guessing", { skip_if(edition_first()) expect_warning( out <- col_spec_standardise("1,2,3\n", c("a", "b", "c"), "ii"), "Insufficient `col_types`" ) expect_equal(names(out[[1]]), c("a", "b", "c")) expect_equal(out[[1]][[3]], col_double()) }) test_that("defaults expanded to match names", { out <- col_spec_standardise("a,b,c\n1,2,3", col_types = cols(.default = "c")) expect_equal(out[[1]], list( a = col_character(), b = col_character(), c = col_character() )) }) test_that("col_spec_standardise works properly with 1 row inputs and no header columns (#333)", { expect_s3_class(col_spec_standardise("1\n", col_names = FALSE)[[1]]$X1, "collector_double") }) test_that("warns about duplicated names", { expect_warning(col_spec_standardise("a,a\n1,2"), "Duplicated column names") expect_warning( col_spec_standardise("1,2\n1,2", col_names = c("X", "X")), "Duplicated column names" ) }) test_that("warn about missing col names and fill in", { expect_warning(col_spec_standardise(",\n1,2"), "Missing column names") expect_warning( col_spec_standardise("1,2\n1,2", col_names = c("X", NA)), "Missing column names" ) }) test_that("spec object attached to read data", { skip_if(edition_first()) test_data <- read_csv(test_path("basic-df.csv"), col_types = NULL, col_names = TRUE) sp <- spec(test_data) sp$skip <- NULL expect_equal( sp, cols( .delim = ",", a = col_logical(), b = col_double(), c = col_double(), d = col_character() ) ) }) test_that("guess_types errors on invalid inputs", { expect_error(col_spec_standardise("a,b,c\n", guess_max = NA), "`guess_max` must be a positive integer") expect_error(col_spec_standardise("a,b,c\n", guess_max = -1), "`guess_max` must be a positive 
integer") expect_warning(col_spec_standardise("a,b,c\n", guess_max = Inf), "`guess_max` is a very large value") }) test_that("check_guess_max errors on invalid inputs", { expect_error(check_guess_max(NULL), "`guess_max` must be a positive integer") expect_error(check_guess_max("test"), "`guess_max` must be a positive integer") expect_error(check_guess_max(letters), "`guess_max` must be a positive integer") expect_error(check_guess_max(1:2), "`guess_max` must be a positive integer") expect_error(check_guess_max(NA), "`guess_max` must be a positive integer") expect_error(check_guess_max(-1), "`guess_max` must be a positive integer") expect_warning(check_guess_max(Inf), "`guess_max` is a very large value") }) test_that("as.col_types can handle named character input", { expect_equal(as.col_spec(c(a = "c")), cols(a = col_character())) }) test_that("as.col_types can convert data.frame", { spec <- as.col_spec(iris) exp <- cols( Sepal.Length = col_double(), Sepal.Width = col_double(), Petal.Length = col_double(), Petal.Width = col_double(), Species = col_factor(levels = c("setosa", "versicolor", "virginica"), ordered = FALSE, include_na = FALSE) ) expect_equal(spec, exp) }) test_that("as.character() works on col_spec objects", { spec <- as.col_spec(iris) expect_equal(as.character(spec), "ddddf") }) # Printing ---------------------------------------------------------------- test_that("print(col_spec) with guess_parser", { expect_snapshot(col_spec_standardise("a,b,c\n1,2,3")) }) test_that("print(col_spec) with collector_skip", { expect_snapshot(cols_only(a = col_integer(), c = col_integer())) }) test_that("print(col_spec) with truncated output", { out <- col_spec_standardise("a,b,c\n1,2,3", col_types = cols(.default = "c")) expect_snapshot( print(out, n = 2, condense = FALSE) ) }) test_that("print(col_spec) works with dates", { out <- col_spec_standardise("a,b,c\n", col_types = cols( a = col_date(format = "%Y-%m-%d"), b = col_date(), c = col_date() ) ) expect_snapshot(out) }) test_that("print(col_spec) with unnamed columns", { expect_snapshot( col_spec_standardise(col_types = "c_c", col_names = c("a", "c")) ) }) test_that("print(cols_only()) prints properly", { expect_snapshot( cols_only(a = col_character(), c = col_integer()) ) }) test_that("print(col_spec) with n == 0 prints nothing", { expect_silent(print(col_spec_standardise("a,b,c\n1,2,3"), n = 0)) }) test_that("print(cols_condense(col_spec)) condenses the spec", { expect_snapshot( cols_condense(col_spec_standardise("a,b,c,d\n1,2,3,a")) ) expect_snapshot( cols_condense(col_spec_standardise("a,b,c,d\n1,2,3,4")) ) }) test_that("print(col_spec) with no columns specified", { expect_snapshot(cols()) expect_snapshot(cols(.default = col_character())) }) test_that("print(col_spec) and condense edge cases", { expect_snapshot(print( cols(a = col_integer(), b = col_integer(), c = col_double()), n = 1, condense = TRUE, colour = FALSE )) }) test_that("print(col_spec) with colors", { local_reproducible_output(crayon = TRUE) out <- col_spec_standardise( "a,b,c,d,e,f,g,h,i\n1,2,F,a,2018-01-01,2018-01-01 12:01:01,12:01:01,foo,blah", col_types = c(b = "i", h = "f", i = "_") ) expect_snapshot_output(out) }) test_that("non-syntactic names are escaped", { expect_snapshot( col_spec_standardise("a b,_c,1,a`b\n1,2,3,4") ) }) # https://github.com/tidyverse/readr/issues/597 test_that("long spec declarations can be formatted", { expect_snapshot( cols(a = col_factor( levels = c("apple", "pear", "banana", "peach", "apricot", "orange", "plum"), ordered = TRUE )) ) }) 
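# A short sketch of the compact/longhand correspondence exercised above,
# assuming as.character() condenses a spec to its one-letter codes, as the
# iris example ("ddddf") suggests; illustrative only.
test_that("as.character() condenses a spec to one-letter codes (sketch)", {
  spec <- cols(a = col_double(), b = col_integer(), c = col_character())
  expect_equal(as.character(spec), "dic")
})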
test_that("options(readr.show_col_types) controls col spec printing", { # skip is temporarily necessary as of January 29 2022, due to the very # recent release of rlang 1.0.0 # Windows binaries aren't available yet and the different rlang versions # make it challenging to form test snapshots that work for whole GHA # build matrix skip_if_edition_first_windows() withr::local_options(list(readr.show_col_types = TRUE)) expect_snapshot( out <- read_csv(readr_example("mtcars.csv")), variant = edition_variant() ) withr::local_options(list(readr.show_col_types = FALSE)) expect_silent(out <- read_csv(readr_example("mtcars.csv"))) }) test_that("`show_col_types` controls col spec printing", { # skip is temporarily necessary as of January 29 2022, due to the very # recent release of rlang 1.0.0 # Windows binaries aren't available yet and the different rlang versions # make it challenging to form test snapshots that work for whole GHA # build matrix skip_if_edition_first_windows() expect_snapshot( out <- read_csv(readr_example("mtcars.csv"), show_col_types = TRUE), variant = edition_variant() ) expect_silent(out <- read_csv(readr_example("mtcars.csv"), show_col_types = FALSE)) }) readr/tests/testthat/table-crash0000644000176200001440000001000014152512262016450 0ustar liggesusers 3.5022800E+05 2.1990000E+02 1.7455317E-03 5.0152367E+00 1.0200010E+00 0.0000000E+00 1.0360000E+03 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 1.3231179E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 6.9944140E-03 -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 -9.9920000E+02 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 2.5980995E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 7.0062219E-03 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 
0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.0000000E+00 0.00000readr/tests/testthat/eol-cr.txt.bz20000644000176200001440000000006614152512262016771 0ustar liggesusersBZh91AY&SY_̛Y@88` 10!b>+j\H  vreadr/tests/testthat/eol-lf.txt0000644000176200001440000000002014152512262016260 0ustar liggesusersx y 1 a 2 b 3 c readr/tests/testthat/test-read-fwf.R0000644000176200001440000001720014377750076017164 0ustar liggesuserstest_that("trailing spaces ommitted", { spec <- fwf_empty("fwf-trailing.txt") expect_equal(spec$begin, c(0, 4)) expect_equal(spec$end, c(3, NA)) df <- read_fwf("fwf-trailing.txt", spec) expect_equal(df$X1, df$X2) }) test_that("respects the trim_ws argument", { x <- "a11 b22 c33\nd e f " out1 <- read_fwf(I(x), fwf_empty(I(x)), trim_ws = FALSE) expect_equal(out1$X1, c("a11", "d ")) expect_equal(out1$X2, c("b22", "e ")) expect_equal(out1$X3, c("c33", "f ")) out2 <- read_fwf(I(x), fwf_empty(I(x)), trim_ws = TRUE) expect_equal(out2$X1, c("a11", "d")) expect_equal(out2$X2, c("b22", "e")) expect_equal(out2$X3, c("c33", "f")) }) test_that("respects the trim_ws argument with empty fields", { x <- "a11 b22 c33\nd f " out1 <- read_fwf(I(x), fwf_empty(I(x)), trim_ws = FALSE) expect_equal(out1$X1, c("a11", "d ")) expect_equal(out1$X2, c("b22", " ")) expect_equal(out1$X3, c("c33", "f ")) out1 <- read_fwf(I(x), fwf_empty(I(x)), trim_ws = TRUE, na = "NA") }) test_that("skipping column doesn't pad col_names", { x <- "1 2 3\n4 5 6" out1 <- read_fwf(I(x), fwf_empty(I(x)), col_types = "d-d") expect_named(out1, c("X1", "X3")) names <- c("a", "b", "c") out2 <- read_fwf(I(x), fwf_empty(I(x), col_names = names), col_types = "d-d") expect_named(out2, c("a", "c")) }) test_that("fwf_empty can skip comments", { x <- "COMMENT\n1 2 3\n4 5 6" out1 <- read_fwf(I(x), fwf_empty(I(x), comment = "COMMENT"), comment = "COMMENT") expect_equal(dim(out1), c(2, 3)) }) test_that("passing \"\" to read_fwf's 'na' option", { expect_equal( read_fwf(I("foobar\nfoo "), fwf_widths(c(3, 3)), na = "")[[2]], c("bar", NA) ) }) test_that("ragged last column expanded with NA", { x <- 
read_fwf(I("1a\n2ab\n3abc"), fwf_widths(c(1, NA))) expect_equal(x$X2, c("a", "ab", "abc")) expect_equal(n_problems(x), 0) }) test_that("ragged last column shrunk with warning", { x <- read_fwf(I("1a\n2ab\n3abc"), fwf_widths(c(1, 3))) expect_equal(x$X2, c("a", "ab", "abc")) skip_if_edition_second() expect_equal(n_problems(x), 2) }) test_that("read all columns with positions, non ragged", { col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, 6)) x <- read_fwf(I("12345A\n67890BBBBBBBBB\n54321C"), col_positions = col_pos) expect_equal(x$X3, c("A", "B", "C")) expect_equal(n_problems(x), 0) }) test_that("read subset columns with positions", { col_pos <- fwf_positions(c(1, 3), c(2, 5)) x <- read_fwf(I("12345A\n67890BBBBBBBBB\n54321C"), col_positions = col_pos) expect_equal(x$X1, c(12, 67, 54)) expect_equal(x$X2, c(345, 890, 321)) expect_equal(n_problems(x), 0) }) test_that("read columns with positions, ragged", { col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, NA)) x <- read_fwf(I("12345A\n67890BBBBBBBBB\n54321C"), col_positions = col_pos) expect_equal(x$X1, c(12, 67, 54)) expect_equal(x$X2, c(345, 890, 321)) expect_equal(x$X3, c("A", "BBBBBBBBB", "C")) expect_equal(n_problems(x), 0) }) test_that("read columns with width, ragged", { col_pos <- fwf_widths(c(2, 3, NA)) x <- read_fwf(I("12345A\n67890BBBBBBBBB\n54321C"), col_positions = col_pos) expect_equal(x$X1, c(12, 67, 54)) expect_equal(x$X2, c(345, 890, 321)) expect_equal(x$X3, c("A", "BBBBBBBBB", "C")) expect_equal(n_problems(x), 0) }) test_that("read_fwf returns an empty data.frame on an empty file", { skip_if_edition_second() expect_true(all.equal(read_fwf(test_path("empty-file"), fwf_widths(c(1, 3))), tibble::tibble())) }) test_that("check for line breaks in between widths", { skip_if_edition_second() txt1 <- paste( "1 1", "2", "1 1 ", sep = "\n" ) expect_warning(out1 <- read_fwf(I(txt1), fwf_empty(I(txt1)))) expect_equal(n_problems(out1), 2) txt2 <- paste( " 1 1", " 2", " 1 1 ", sep = "\n" ) expect_warning(out2 <- read_fwf(I(txt2), fwf_empty(txt2))) expect_equal(n_problems(out2), 2) exp <- tibble::tibble(X1 = c(1, 2, 1), X2 = c(1, NA, 1)) expect_true(all.equal(out1, exp, check.attributes = FALSE)) expect_true(all.equal(out2, exp, check.attributes = FALSE)) }) test_that("ignore commented lines anywhere in file", { skip_if_edition_second() col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, 6)) x1 <- read_fwf(I("COMMENT\n12345A\n67890BBBBBBBBB\n54321C"), col_positions = col_pos, comment = "COMMENT") x2 <- read_fwf(I("12345A\n67890BBBBBBBBB\nCOMMENT\n54321C"), col_positions = col_pos, comment = "COMMENT") x3 <- read_fwf(I("12345A\n67890BBBBBBBBB\n54321C\nCOMMENT"), col_positions = col_pos, comment = "COMMENT") x4 <- read_fwf(I("COMMENT\n12345A\nCOMMENT\n67890BBBBBBBBB\n54321C\nCOMMENT"), col_positions = col_pos, comment = "COMMENT") expect_identical(x1, x2) expect_identical(x1, x3) expect_identical(x1, x4) expect_equal(x1$X3, c("A", "B", "C")) expect_equal(n_problems(x1), 0) }) test_that("error on empty spec (#511, #519)", { skip_if_edition_second() txt <- "foo\n" pos <- fwf_positions(start = numeric(0), end = numeric(0)) expect_error(read_fwf(I(txt), pos), "Zero-length.*specifications not supported") }) test_that("error on negatives in fwf spec", { skip_if_edition_second() txt <- "foo\n" pos <- fwf_positions(start = c(1, -1), end = c(2, 3)) expect_error(read_fwf(I(txt), pos), ".*offset.*greater than 0") }) test_that("fwf spec can overlap", { x <- read_fwf(I("2015a\n2016b"), fwf_positions(c(1, 3, 5), c(4, 4, 5))) expect_equal(x$X1, c(2015, 2016)) 
expect_equal(x$X2, c(15, 16)) expect_equal(x$X3, c("a", "b")) }) # fwf_cols test_that("fwf_cols produces correct fwf_positions object with elements of length 2", { expected <- fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("a", "b", "d")) expect_equal(fwf_cols(a = c(1, 2), b = c(9, 12), d = c(4, 6)), expected, ignore_attr = TRUE) }) test_that("fwf_cols produces correct fwf_positions object with elements of length 1", { expected <- fwf_widths(c(2L, 4L, 3L), c("a", "b", "c")) expect_equal(fwf_cols(a = 2, b = 4, c = 3), expected, ignore_attr = TRUE) }) test_that("fwf_cols throws error when arguments are not length 1 or 2", { expect_error(fwf_cols(a = 1:3, b = 4:5)) expect_error(fwf_cols(a = c(), b = 4:5)) }) test_that("fwf_cols works with unnamed columns", { expect_equal( fwf_cols(c(1, 2), c(9, 12), c(4, 6)), fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("X1", "X2", "X3")), ignore_attr = TRUE ) expect_equal( fwf_cols(a = c(1, 2), c(9, 12), c(4, 6)), fwf_positions(c(1L, 9L, 4L), c(2L, 12L, 6L), c("a", "X2", "X3")), ignore_attr = TRUE ) }) # read_table ------------------------------------------------------------------- test_that("read_table silently reads ragged last column", { x <- read_table("foo bar\n1 2\n3 4\n5 6\n") expect_equal(x$foo, c(1, 3, 5)) }) test_that("read_table skips all comment lines", { x <- read_table("foo bar\n1 2\n3 4\n5 6\n") y <- read_table("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") expect_equal(x[], y[]) }) test_that("read_table can read from a pipe (552)", { x <- read_table(pipe("echo a b c && echo 1 2 3 && echo 4 5 6")) expect_equal(x$a, c(1, 4)) }) test_that("read_table does not duplicate header rows for leading whitespace (747)", { x <- read_table("\nfoo bar\n1 2") expect_equal(nrow(x), 1) expect_equal(x$foo, 1) }) # fwf_positions --------------------------------------------------------------- test_that("fwf_positions always returns col_names as character (#797)", { begin <- c(1, 2, 4, 8) end <- c(1, 3, 7, 15) # Input a factor, should return a character nms <- factor(letters[1:4]) info <- fwf_positions(begin, end, nms) expect_type(info$begin, "double") expect_type(info$end, "double") expect_type(info$col_names, "character") }) readr/tests/testthat/eol-crlf.csv0000644000176200001440000000003614152512262016570 0ustar liggesusers"x","y" 1,"a" 2,"b" 3,"c" readr/tests/testthat/teardown.R0000644000176200001440000000003214174704674016330 0ustar liggesusersoptions(pre_test_options) readr/tests/testthat/test-problems.R0000644000176200001440000000444314174704674017317 0ustar liggesuserstest_that("stop_for_problems throws error", { expect_warning(x <- parse_integer("1.234")) expect_error(stop_for_problems(x), "1 parsing failure") }) test_that("skipping columns gives incorrect problem column (#573)", { skip_if_edition_first() delim.skip0 <- problems(read_csv("aa,bb,cc\n", col_names = F, col_types = "dcc", lazy = FALSE)) delim.skip1 <- problems(read_csv("aa,bb,cc\n", col_names = F, col_types = "_dc", lazy = FALSE)) delim.skip2 <- problems(read_csv("aa,bb,cc\n", col_names = F, col_types = "--d", lazy = FALSE)) expect_equal(delim.skip0$col, 1) expect_equal(delim.skip1$col, 2) expect_equal(delim.skip2$col, 3) skip_if_edition_second() delim.sk0.2 <- problems(read_tsv("aa\tbb\tcc\n", col_names = F, col_types = "dcd")) delim.sk1.2 <- problems(read_tsv("aa\tbb\tcc\n", col_names = F, col_types = "_dd")) expect_equal(delim.sk0.2$col, c("X1", "X3")) expect_equal(delim.sk1.2$col, c("X2", "X3")) fwf.pos <- fwf_widths(c(2, 2, 2)) fwf.skip0 <- 
problems(read_fwf("aabbcc\n", fwf.pos, col_types = "dcc")) fwf.skip1 <- problems(read_fwf("aabbcc\n", fwf.pos, col_types = "_dc")) fwf.skip2 <- problems(read_fwf("aabbcc\n", fwf.pos, col_types = "--d")) fwf.sk0.2 <- problems(read_fwf("aabbcc\n", fwf.pos, col_types = "dcd")) fwf.sk1.2 <- problems(read_fwf("aabbcc\n", fwf.pos, col_types = "d-d")) expect_equal(fwf.skip0$col, "X1") expect_equal(fwf.skip1$col, "X2") expect_equal(fwf.skip2$col, "X3") expect_equal(fwf.sk0.2$col, c("X1", "X3")) expect_equal(fwf.sk1.2$col, c("X1", "X3")) }) test_that("problems returns the filename (#581)", { skip_if_edition_first() files <- problems(read_csv(readr_example("mtcars.csv"), col_types = cols(mpg = col_integer()), lazy = FALSE))$file expect_equal(length(files), 28L) expect_equal(basename(files)[[1L]], "mtcars.csv") }) test_that("problems returns full original field (#444)", { probs <- problems(read_tsv("X\n-$12,500\n$2,000\n-$5,000\n$1,000\n-$3,000\n", col_types = list(.default = col_number()), lazy = FALSE)) expect_equal(NROW(probs), 3) expect_equal(probs$actual, c("-$12,500", "-$5,000", "-$3,000")) }) test_that("warn_problems should not fail when parsing non-ASCII characters (#1152)", { expect_warning(probs <- warn_problems(readRDS("test-non-ascii-1152.rds"))) expect_equal(NROW(probs), 10) }) readr/tests/testthat/test-melt-fwf.R0000644000176200001440000001406414174704674017205 0ustar liggesuserstest_that("trailing spaces omitted", { withr::local_options(lifecycle_verbosity = "quiet") spec <- fwf_empty("fwf-trailing.txt") expect_equal(spec$begin, c(0, 4)) expect_equal(spec$end, c(3, NA)) df <- melt_fwf("fwf-trailing.txt", spec) expect_true(all(df$value == "123")) }) test_that("respects the trim_ws argument", { withr::local_options(lifecycle_verbosity = "quiet") x <- "a11 b22 c33\nd e f " out1 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = FALSE) expect_equal(out1$value, c("a11", "b22", "c33", "d ", "e ", "f ")) out2 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = TRUE) expect_equal(out2$value, c("a11", "b22", "c33", "d", "e", "f")) }) test_that("respects the trim_ws argument with empty fields", { withr::local_options(lifecycle_verbosity = "quiet") x <- "a11 b22 c33\nd f " out1 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = FALSE) expect_equal(out1$value, c("a11", "b22", "c33", "d ", " ", "f ")) out2 <- melt_fwf(x, fwf_empty(I(x)), trim_ws = TRUE, na = "NA") expect_equal(out2$value, c("a11", "b22", "c33", "d", "", "f")) }) test_that("fwf_empty can skip comments", { withr::local_options(lifecycle_verbosity = "quiet") x <- "COMMENT\n1 2 3\n4 5 6" out1 <- melt_fwf(x, fwf_empty(I(x), comment = "COMMENT"), comment = "COMMENT") expect_equal(dim(out1), c(6, 4)) }) test_that("missing lines are not skipped", { withr::local_options(lifecycle_verbosity = "quiet") # first x <- "a b\n\n\n1 2" expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 4) # middle x <- "a b\n1 2\n\n\n2 3" expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 5) # last (trailing \n is ignored) x <- "a b\n1 2\n\n\n" expect_equal(max(melt_fwf(x, fwf_empty(I(x)))$row), 4) }) test_that("passing \"\" to melt_fwf's 'na' option", { withr::local_options(lifecycle_verbosity = "quiet") expect_equal( melt_fwf("foobar\nfoo ", fwf_widths(c(3, 3)), na = "")$value, c("foo", "bar", "foo", NA) ) }) test_that("ragged last column expanded with NA", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_fwf("1a\n2ab\n3abc", fwf_widths(c(1, NA))) expect_equal(x$value[c(2, 4, 6)], c("a", "ab", "abc")) expect_equal(n_problems(x), 0) }) test_that("ragged last column shrunk 
with warning", { withr::local_options(lifecycle_verbosity = "quiet") expect_warning(x <- melt_fwf("1a\n2ab\n3abc", fwf_widths(c(1, 3)))) expect_equal(x$value[c(2, 4, 6)], c("a", "ab", "abc")) expect_equal(n_problems(x), 2) }) test_that("melt all columns with positions, non ragged", { withr::local_options(lifecycle_verbosity = "quiet") col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, 6)) x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) expect_equal(x$value[c(3, 6, 9)], c("A", "B", "C")) expect_equal(n_problems(x), 0) }) test_that("melt subset columns with positions", { withr::local_options(lifecycle_verbosity = "quiet") col_pos <- fwf_positions(c(1, 3), c(2, 5)) x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) expect_equal(x$value[c(1, 3, 5)], as.character(c(12, 67, 54))) expect_equal(x$value[c(2, 4, 6)], as.character(c(345, 890, 321))) expect_equal(n_problems(x), 0) }) test_that("melt columns with positions, ragged", { withr::local_options(lifecycle_verbosity = "quiet") col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, NA)) x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) expect_equal(x$value[c(1, 4, 7)], as.character(c(12, 67, 54))) expect_equal(x$value[c(2, 5, 8)], as.character(c(345, 890, 321))) expect_equal(x$value[c(3, 6, 9)], c("A", "BBBBBBBBB", "C")) expect_equal(n_problems(x), 0) }) test_that("melt columns with width, ragged", { withr::local_options(lifecycle_verbosity = "quiet") col_pos <- fwf_widths(c(2, 3, NA)) x <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos) expect_equal(x$value[c(1, 4, 7)], as.character(c(12, 67, 54))) expect_equal(x$value[c(2, 5, 8)], as.character(c(345, 890, 321))) expect_equal(x$value[c(3, 6, 9)], c("A", "BBBBBBBBB", "C")) expect_equal(n_problems(x), 0) }) test_that("melt_fwf returns an empty data.frame on an empty file", { withr::local_options(lifecycle_verbosity = "quiet") empty_df <- tibble::tibble( row = double(), col = double(), data_type = character(), value = character() ) expect_true(all.equal(melt_fwf("empty-file"), empty_df)) }) test_that("check for line breaks in between widths", { withr::local_options(lifecycle_verbosity = "quiet") txt1 <- paste( "1 1", "2", "1 1 ", sep = "\n" ) expect_warning(out1 <- melt_fwf(txt1, fwf_empty(I(txt1)))) expect_equal(n_problems(out1), 1) txt2 <- paste( " 1 1", " 2", " 1 1 ", sep = "\n" ) expect_warning(out2 <- melt_fwf(txt2, fwf_empty(I(txt2)))) expect_equal(n_problems(out2), 1) exp <- tibble::tibble( row = c(1, 1, 2, 3, 3), col = c(1, 2, 1, 1, 2), data_type = "integer", value = as.character(c(1, 1, 2, 1, 1)) ) expect_true(all.equal(out1, exp, check.attributes = FALSE)) expect_true(all.equal(out2, exp, check.attributes = FALSE)) }) test_that("ignore commented lines anywhere in file", { withr::local_options(lifecycle_verbosity = "quiet") col_pos <- fwf_positions(c(1, 3, 6), c(2, 5, 6)) x1 <- melt_fwf("COMMENT\n12345A\n67890BBBBBBBBB\n54321C", col_positions = col_pos, comment = "COMMENT") x2 <- melt_fwf("12345A\n67890BBBBBBBBB\nCOMMENT\n54321C", col_positions = col_pos, comment = "COMMENT") x3 <- melt_fwf("12345A\n67890BBBBBBBBB\n54321C\nCOMMENT", col_positions = col_pos, comment = "COMMENT") x4 <- melt_fwf("COMMENT\n12345A\nCOMMENT\n67890BBBBBBBBB\n54321C\nCOMMENT", col_positions = col_pos, comment = "COMMENT") expect_identical(x1, x2) expect_identical(x1, x3) expect_identical(x1, x4) expect_equal(x1$value[c(3, 6, 9)], c("A", "B", "C")) expect_equal(n_problems(x1), 0) }) test_that("error on empty spec", { 
withr::local_options(lifecycle_verbosity = "quiet") txt <- "foo\n" pos <- fwf_positions(start = numeric(0), end = numeric(0)) expect_error(melt_fwf(txt, pos), "Zero-length.*specifications not supported") }) readr/tests/testthat/fwf-trailing.txt0000644000176200001440000000002014152512262017473 0ustar liggesusers123 123 123 123 readr/tests/testthat/test-encoding.R0000644000176200001440000000060314174357220017243 0ustar liggesuserstest_that("guess_encoding() works", { skip_on_os("solaris") x <- guess_encoding(readr_example("mtcars.csv")) expect_s3_class(x, "tbl_df") expect_equal(as.character(x$encoding), "ASCII") expect_equal(x$confidence, 1) x <- guess_encoding("a\n\u00b5\u00b5") expect_s3_class(x, "tbl_df") expect_equal(as.character(x$encoding), "UTF-8") expect_equal(x$confidence, 0.8) }) readr/tests/testthat/test-read-csv.R0000644000176200001440000003055414304131171017157 0ustar liggesuserstest_that("read_csv col imputation, col_name detection and NA detection works", { test_data <- read_csv(test_path("basic-df.csv"), col_types = list(), col_names = TRUE) expect_equal( unname(unlist(lapply(test_data, class))), c("logical", "numeric", "numeric", "character") ) expect_equal(names(test_data), c("a", "b", "c", "d")) expect_equal(sum(is.na(test_data$d)), 1) test_data2 <- read_csv("basic-df.csv", col_types = list(a = "l", b = "d", c = "d", d = "c"), col_names = TRUE) expect_equal(test_data, test_data2) }) test_that("read_csv's 'NA' option genuinely changes the NA values", { expect_equal(read_csv(I("a\nz"), na = "z")$a, NA) }) test_that("read_csv's 'NA' option works with multiple NA values", { expect_equal( read_csv(I("a\nNA\nmiss\n13"), na = c("13", "miss"))$a, c("NA", NA, NA) ) }) test_that('passing character() to read_csv\'s "NA" option reads "" correctly', { expect_equal(read_csv(I("a\nfoo\n"), na = character())$a, "foo") }) test_that("passing \"\" to read_csv's 'NA' option reads \"\" correctly", { expect_equal(read_csv(I("a,b\nfoo,bar\nfoo,\n"), na = "")$b, c("bar", NA)) }) test_that("changing read_csv's 'quote' argument works correctly", { test_data <- read_csv("basic-df.csv", col_types = NULL, col_names = TRUE) test_data_singlequote <- read_csv("basic-df-singlequote.csv", quote = "'") expect_identical(test_data, test_data_singlequote) }) test_that("read_csv's 'skip' option allows for skipping'", { test_data <- read_csv("basic-df.csv", skip = 1) expect_equal(nrow(test_data), 9) }) test_that("read_csv's 'skip' option allows for skipping when no header row is present'", { test_data <- read_csv("basic-df.csv", skip = 1, col_names = FALSE) expect_equal(nrow(test_data), 10) }) test_that("read_csv's 'n_max' allows for a maximum number of records and does not corrupt any", { test_data <- read_csv("basic-df.csv", n_max = 7) expect_equal(nrow(test_data), 7) expect_equal(sum(is.na(test_data)), 0) }) test_that("n_max also affects column guessing", { df <- read_csv( n_max = 1, I("x,y,z 1,2,3 1,2,3,4"), progress = FALSE ) expect_equal(dim(df), c(1, 3)) }) test_that("can read more than 100 columns", { set.seed(2015 - 3 - 13) x <- as.data.frame(matrix(rbinom(300, 2, .5), nrow = 2)) y <- format_csv(x) expect_equal(ncol(read_csv(I(y))), 150) }) test_that("encoding affects text and headers", { skip_on_os("solaris") x <- read_csv("enc-iso-8859-1.txt", locale = locale(encoding = "ISO-8859-1")) expect_identical(names(x), "fran\u00e7ais") expect_identical(x[[1]], "\u00e9l\u00e8ve") }) test_that("nuls are dropped with a warning", { skip_if_edition_second() expect_warning(x <- read_csv("raw.csv")) 
expect_equal(n_problems(x), 1) expect_equal(x$abc, "ab") }) test_that("can read from the clipboard", { skip_on_cran() skip_if_no_clipboard() clipr::write_clip("a,b,c\n1,2,3") expect_identical(read_csv(clipboard()), read_csv(I("a,b,c\n1,2,3\n"))) }) test_that("can read from a multi-line character vector", { expect_identical(nrow(read_csv(I(c("a,b,c", "1,2,3")))), 1L) }) # Column warnings --------------------------------------------------------- test_that("warnings based on number of columns (not output columns)", { skip_if_edition_second() # FIXME: the column name when skipping is the wrong name out1 <- read_csv(I("1,2,3\n4,5,6"), "z", "__i") out2 <- read_csv(I("1,2,3\n4,5,6"), FALSE, cols_only(X3 = "i")) expect_equal(n_problems(out1), 0) expect_equal(n_problems(out2), 0) }) test_that("missing last field generates warning", { expect_warning(out <- read_csv(I("a,b\n2"))) expect_equal(problems(out)$expected, "2 columns") }) test_that("missing lines are skipped without warning", { # first expect_silent(out <- read_csv(I("a,b\n\n\n1,2"))) # middle expect_silent(out <- read_csv(I("a,b\n1,2\n\n\n2,3\n"))) # last (trailing \n is ignored) expect_silent(out <- read_csv(I("a,b\n1,2\n\n\n"))) }) test_that("warning lines are correct after skipping", { skip_if_edition_second() expect_warning(out1 <- read_csv(I("v1,v2\n1,foo"), col_types = "ii", lazy = FALSE)) expect_warning(out2 <- read_csv(I("#foo\nv1,v2\n1,foo"), col_types = "ii", comment = "#", lazy = FALSE)) expect_equal(problems(out1)$row, 1) expect_equal(problems(out2)$row, 1) expect_snapshot( out3 <- read_csv(I("v1,v2\n\n1,2\n\n3,4"), col_types = "i"), variant = edition_variant() ) expect_snapshot( out4 <- read_csv(I("v1,v2\n#foo\n1,2\n#bar\n3,4"), col_types = "i", comment = "#"), variant = edition_variant() ) expect_equal(problems(out3)$row, c(1, 2)) expect_equal(problems(out4)$row, c(1, 2)) }) test_that("extra columns generates warnings", { skip_if_edition_second() expect_warning(out1 <- read_csv(I("a,b\n1,2,3\n"), lazy = FALSE)) expect_warning(out2 <- read_csv(I("a,b\n1,2,3"), col_types = "ii", lazy = FALSE)) # expect_warning(out3 <- read_csv(I("1,2,3\n"), c("a", "b"), lazy = FALSE)) expect_warning(out4 <- read_csv(I("1,2,3\n"), c("a", "b"), "ii", lazy = FALSE)) expect_equal(problems(out1)$expected, "2 columns") expect_equal(problems(out2)$expected, "2 columns") # expect_equal(problems(out3)$expected, "2 columns") expect_equal(problems(out4)$expected, "2 columns") }) test_that("too few or extra col_types generates warnings", { skip_if_edition_second() expect_snapshot( out1 <- read_csv(I("v1,v2\n1,2"), col_types = "i", lazy = FALSE), variant = edition_variant() ) expect_equal(problems(out1)$expected, "1 columns") expect_equal(problems(out1)$actual, "2 columns") expect_warning(out2 <- read_csv(I("v1,v2\n1,2"), col_types = "iii", lazy = FALSE)) expect_equal(ncol(out2), 2) }) # read_csv2 --------------------------------------------------------------- test_that("decimal mark automatically set to ,", { expect_message( x <- read_csv2(I("x\n1,23")), if (default_locale()$decimal_mark == ".") "decimal .*grouping mark" else NA ) expect_equal(x[[1]], 1.23) }) # Zero rows --------------------------------------------------------------- test_that("header only df gets character columns", { x <- read_csv(I("a,b\n")) expect_equal(dim(x), c(0, 2)) expect_equal(class(x$a), "character") expect_equal(class(x$b), "character") }) test_that("n_max 0 gives zero row data frame", { x <- read_csv(I("a,b\n1,2"), n_max = 0) expect_equal(dim(x), c(0, 2)) 
expect_equal(class(x$a), "character") expect_equal(class(x$b), "character") }) test_that("empty file with col_names and col_types creates correct columns", { skip_if_edition_first() x <- read_csv(I(""), c("a", "b"), "ii") expect_equal(dim(x), c(0, 2)) expect_equal(class(x$a), "integer") expect_equal(class(x$b), "integer") }) test_that("empty file returns an empty tibble", { tmp <- tempfile("empty-file-", fileext = ".csv") on.exit(unlink(tmp)) file.create(tmp) expect_equal(read_csv(tmp)[], tibble::tibble()) }) # Comments ---------------------------------------------------------------- test_that("comments are ignored regardless of where they appear", { out1 <- read_csv(I("x\n1#comment"), comment = "#") out2 <- read_csv(I("x\n1#comment\n#comment"), comment = "#") out3 <- read_csv(I('x\n"1"#comment'), comment = "#") expect_equal(out1$x, 1) expect_equal(out2$x, 1) expect_equal(out3$x, 1) expect_snapshot( out4 <- read_csv(I("x,y\n1,#comment"), comment = "#", col_types = "cc"), variant = edition_variant() ) expect_equal(out4$y, NA_character_) expect_warning(out5 <- read_csv(I("x1,x2,x3\nA2,B2,C2\nA3#,B2,C2\nA4,A5,A6"), comment = "#", lazy = FALSE)) expect_warning(out6 <- read_csv(I("x1,x2,x3\nA2,B2,C2\nA3,#B2,C2\nA4,A5,A6"), comment = "#", lazy = FALSE)) expect_warning(out7 <- read_csv(I("x1,x2,x3\nA2,B2,C2\nA3,#B2,C2\n#comment\nA4,A5,A6"), comment = "#", lazy = FALSE)) chk <- tibble::tibble( x1 = c("A2", "A3", "A4"), x2 = c("B2", NA_character_, "A5"), x3 = c("C2", NA_character_, "A6") ) expect_true(all.equal(chk, out5, check.attributes = FALSE)) expect_true(all.equal(chk, out6, check.attributes = FALSE)) expect_true(all.equal(chk, out7, check.attributes = FALSE)) }) test_that("escaped/quoted comments are ignored", { out1 <- read_delim(I("x\n\\#"), comment = "#", delim = ",", escape_backslash = TRUE, escape_double = FALSE ) out2 <- read_csv(I('x\n"#"'), comment = "#") expect_equal(out1$x, "#") expect_equal(out2$x, "#") }) test_that("leading comments are ignored", { out <- read_csv(I("#a\n#b\nx\n1"), comment = "#") expect_equal(ncol(out), 1) expect_equal(out$x, 1L) }) test_that("skip respects comments", { read_x <- function(...) { read_csv(I("#a\nb\nc"), col_names = FALSE, ...)[[1]] } expect_equal(read_x(), c("#a", "b", "c")) expect_equal(read_x(skip = 1), c("b", "c")) expect_equal(read_x(comment = "#"), c("b", "c")) expect_equal(read_x(comment = "#", skip = 2), c("c")) }) test_that("skip respects newlines", { read_x <- function(...) { read_csv(I("1\n2\n3\n\na\nb\nc"), col_names = FALSE, ...)[[1]] } expect_equal(read_x(), c("1", "2", "3", "a", "b", "c")) expect_equal(read_x(skip = 3), c("a", "b", "c")) expect_equal(read_x(skip = 4), c("a", "b", "c")) expect_equal(read_x(skip = 5), c("b", "c")) expect_equal(read_x(skip_empty_rows = FALSE), c("1", "2", "3", NA, "a", "b", "c")) expect_equal(read_x(skip_empty_rows = TRUE, skip = 3), c("a", "b", "c")) expect_equal(read_x(skip_empty_rows = FALSE, skip = 3), c(NA, "a", "b", "c")) expect_equal(read_x(skip_empty_rows = FALSE, skip = 4), c("a", "b", "c")) }) test_that("read_csv returns an empty data.frame on an empty file", { expect_equal(read_csv(test_path("empty-file"))[], tibble::tibble()) }) test_that("read_delim errors on length 0 delimiter (557)", { expect_error( read_delim(I("a b\n1 2\n"), delim = ""), "`delim` must be at least one character, use `read_table\\(\\)` for whitespace delimited input\\." 
) }) test_that("read_csv does not duplicate header rows for leading whitespace (747)", { x <- read_csv(I("\nfoo,bar\n1,2"), skip = 1) expect_equal(nrow(x), 1) expect_equal(x$foo, 1) }) test_that("read_csv handles whitespace between delimiters and quoted fields (668)", { skip_if_edition_first() expect_warning(x <- read_csv(I('x,y\n1, \"hi,there\"\n3,4'), lazy = FALSE)) expect_equal(x$y, c("hi,there", "4")) }) test_that("read_csv does not throw warnings for skipped columns (750, 833)", { expect_warning(x <- read_csv(I("x,y\n1,2"), col_types = "i_"), NA) }) test_that("read_csv reads headers with embedded newlines (#784)", { x <- read_csv(I("\"Header\nLine Two\"\nValue\n")) expect_equal(names(x), "Header\nLine Two") expect_equal(x[[1]], "Value") x <- read_csv(I("\"Header\",\"Second header\nLine Two\"\nValue,Value2\n")) expect_equal(names(x), c("Header", "Second header\nLine Two")) expect_equal(x[[2]], "Value2") }) test_that("read_csv reads headers with embedded newlines 2 (#772)", { x <- read_csv(I("\"Header\nLine Two\"\n\"Another line\nto\nskip\"\nValue,Value2\n"), skip = 2, col_names = FALSE) expect_equal(names(x), c("X1", "X2")) expect_equal(x$X1, "Value") expect_equal(x$X2, "Value2") }) test_that("read_csv returns a spec_tbl_df and the spec attribute is removed once it is subset (#934)", { x <- read_csv(I("foo\n1\n2\n")) has_spec <- function(x) !is.null(attr(x, "spec")) expect_true(inherits(x, "spec_tbl_df")) expect_true(has_spec(x)) y <- x[] expect_false(inherits(y, "spec_tbl_df")) expect_false(has_spec(y)) }) test_that("read_csv returns an empty tbl if all lines are comments", { skip_if_edition_second() x <- read_csv(I("#foo\n#bar"), comment = "#", col_names = c("X")) expect_equal(nrow(x), 0) expect_equal(ncol(x), 1) }) test_that("read_csv works with single quotes inside of double quotes (#944)", { x <- read_csv(I("\"O'Henry\"\nfoo\n"), skip = 1, col_names = "x") expect_equal(x$x, "foo") x <- read_csv(I("# \"O'Henry\"\n\"foo\"\n\"bar\"\n"), comment = "#", col_names = TRUE) expect_equal(x$foo, "bar") }) test_that("read_csv works with single quotes in skipped lines (#945)", { x <- read_tsv(I("# Director's\nUSGS\t02177000\t2012-09-01\t191\tA\n"), skip = 1, col_names = FALSE) expect_equal(nrow(x), 1) expect_equal(ncol(x), 5) }) test_that("read_tsv correctly uses the quote and na arguments (#1254, #1255)", { x <- read_tsv(I("foo\tbar\n\"one baz\"\ttwo\nthree\t\n"), quote = "", na = character()) expect_equal(x[[1]], c("\"one baz\"", "three")) expect_equal(x[[2]], c("two", "")) }) readr/tests/testthat/test-read-file.R0000644000176200001440000000477414174704674017333 0ustar liggesusers# df <- dplyr::tibble(français = "élève") # write.csv(df, # "tests/testthat/enc-iso-8859-1.txt", # fileEncoding = "ISO-8859-1", # row.names = FALSE, # quote = FALSE) test_that("read_file respects encoding", { skip_on_os("solaris") x <- read_file("enc-iso-8859-1.txt", locale(encoding = "ISO-8859-1")) expect_equal(substr(x, 5, 5), "\u00e7") }) sample_text_str <- "abc\n123" # contents of sample_text.txt eol_cr_text <- "x y\n1 a\n2 b\n3 c\n" # contents of eol_cr.txt test_that("read_file works with a local text file passed as character", { expect_equal(read_file("sample_text.txt"), sample_text_str) }) test_that("read_file works with a local text file, skipping one line", { expect_equal( read_file(datasource("sample_text.txt", skip = 1)), paste(tail(strsplit(sample_text_str, "\n")[[1]], -1), collapse = "\n") ) }) test_that("read_file works with a character datasource", { expect_equal(read_file(sample_text_str), 
sample_text_str) }) test_that("read_file works with a connection to a local file", { con <- file("sample_text.txt", "rb") on.exit(close(con), add = TRUE) expect_equal(read_file(con), sample_text_str) }) test_that("read_file works with a raw datasource", { expect_equal(read_file(charToRaw(sample_text_str)), sample_text_str) }) test_that("read_file works with compressed files", { expect_equal(read_file("eol-cr.txt.gz"), eol_cr_text) expect_equal(read_file("eol-cr.txt.bz2"), eol_cr_text) expect_equal(read_file("eol-cr.txt.xz"), eol_cr_text) expect_equal(read_file("eol-cr.txt.zip"), eol_cr_text) }) test_that("read_file works via https", { skip_on_cran() url <- "https://raw.githubusercontent.com/tidyverse/readr/main/tests/testthat/eol-cr.txt" expect_equal(read_file(url), eol_cr_text) }) test_that("read_file works via https on gz file", { skip_on_cran() url <- "https://raw.githubusercontent.com/tidyverse/readr/main/tests/testthat/eol-cr.txt.gz" expect_equal(read_file(url), eol_cr_text) }) test_that("read_file returns \"\" on an empty file", { expect_equal(read_file("empty-file"), "") }) # read_file_raw --------------------------------------------------------------- test_that("read_file_raw works with a local text file", { expect_equal(read_file_raw("sample_text.txt"), charToRaw("abc\n123")) }) test_that("read_file_raw works with a character datasource", { expect_equal(read_file_raw("abc\n123"), charToRaw("abc\n123")) }) test_that("read_file_raw returns raw() on an empty file", { expect_equal(read_file_raw("empty-file"), raw()) }) readr/tests/testthat/raw.csv0000644000176200001440000000002114152512262015650 0ustar liggesusersabc,def abc,def readr/tests/testthat/eol-cr.txt.zip0000644000176200001440000000027214152512262017075 0ustar liggesusersPK o%Gkxyg eol-cr.txtUT WUnUux x y 1 a 2 b 3 c PK o%Gkxyg eol-cr.txtUTWUux PKPTreadr/tests/testthat/test-read-table.R0000644000176200001440000000436614304131171017455 0ustar liggesusers# read_table ------------------------------------------------------------------- test_that("read_table silently reads ragged last column", { x <- read_table("foo bar\n1 2\n3 4\n5 6\n") expect_equal(x$foo, c(1, 3, 5)) }) test_that("read_table skips all comment lines", { x <- read_table("foo bar\n1 2\n3 4\n5 6\n") y <- read_table("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") expect_equal(x[], y[], ignore_attr = FALSE) }) test_that("read_table can read from a pipe (552)", { x <- read_table(pipe("echo a b c && echo 1 2 3 && echo 4 5 6")) expect_equal(x$a, c(1, 4)) }) test_that("read_table can read a truncated file without crashing (740)", { expect_error( suppressWarnings( read_table(test_path("table-crash"), col_names = FALSE) ), NA ) }) # read_table2 ------------------------------------------------------------------- test_that("read_table silently reads ragged columns", { x <- read_table("foo bar\n1 2\n3 4\n5 6\n") expect_equal(x$foo, c(1, 3, 5)) }) test_that("read_table skips all comment lines", { x <- read_table("foo bar\n1 2\n3 4\n5 6\n") y <- read_table("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") expect_equal(x[], y[]) }) test_that("read_table skips even more comment lines", { x <- read_table("foo bar\n1 2\n3 4\n5 6\n") y <- read_table("#comment1\n#comment2\nfoo bar # comment\n1 2 # comment\n3 4\n5 6\n #comment \n", comment = "#") expect_equal(x[], y[]) }) test_that("read_table can read from a pipe (552)", { x <- read_table(pipe("echo a b c && echo 1 2 3 && echo 4 5 6")) expect_equal(x$a, c(1, 4)) }) test_that("read_table does not 
duplicate header rows for leading whitespace (747)", { x <- read_table("\nfoo bar\n1 2") expect_equal(nrow(x), 1) expect_equal(x$foo, 1) }) test_that("read_table ignores blank lines at the end of a file (657)", { expect_warning(x <- read_table("x y\n1 2\n\n"), NA) expect_equal(nrow(x), 1) expect_equal(x$x, 1) }) test_that("read_table ignores unpaired quotes when skipping (#1180)", { res <- tibble::as_tibble(read_table('dummy"\nvar1 var2 var3\n1 2 3\n4 5 6\n', skip = 1)) expect_equal(res, tibble::tibble(var1 = c(1, 4), var2 = c(2, 5), var3 = c(3, 6))) }) readr/tests/testthat/test-locale.R0000644000176200001440000000103014174357220016713 0ustar liggesuserstest_that("setting decimal mark overrides grouping mark", { expect_equal(locale(decimal_mark = ".")$grouping_mark, ",") expect_equal(locale(decimal_mark = ",")$grouping_mark, ".") }) test_that("setting grouping mark overrides decimal mark", { expect_equal(locale(grouping_mark = ".")$decimal_mark, ",") expect_equal(locale(grouping_mark = ",")$decimal_mark, ".") }) test_that("grouping and decimal marks must be different", { expect_error( locale(grouping_mark = ".", decimal_mark = "."), "must be different" ) }) readr/tests/testthat/test-parsing-datetime.R0000644000176200001440000002315214174704674020727 0ustar liggesuserstest_that("utctime is equivalent to R conversion", { year <- seq(0, 4000) mon <- rep(3L, length(year)) day <- rep(1L, length(year)) zero_i <- rep(0L, length(year)) zero_d <- rep(0, length(year)) expect_equal( utctime(year, mon, day, zero_i, zero_i, zero_i, zero_d), ISOdatetime(year, mon, day, zero_i, zero_i, zero_i, tz = "UTC") ) }) # Parsing ---------------------------------------------------------------------- r_parse <- function(x, fmt) as.POSIXct(strptime(x, fmt, tz = "UTC")) test_that("%d, %m and %y", { target <- utctime(2010L, 2L, 3L, 0L, 0L, 0L, 0) expect_equal(parse_datetime("10-02-03", "%y-%m-%d"), target) expect_equal(parse_datetime("10-03-02", "%y-%d-%m"), target) expect_equal(parse_datetime("03/02/10", "%d/%m/%y"), target) expect_equal(parse_datetime("02/03/10", "%m/%d/%y"), target) }) test_that("Compound formats work", { target <- utctime(2010L, 2L, 3L, 0L, 0L, 0L, 0) expect_equal(parse_datetime("02/03/10", "%D"), target) expect_equal(parse_datetime("2010-02-03", "%F"), target) expect_equal(parse_datetime("10/02/03", "%x"), target) }) test_that("%y matches R behaviour", { expect_equal( parse_datetime("01-01-69", "%d-%m-%y"), r_parse("01-01-69", "%d-%m-%y") ) expect_equal( parse_datetime("01-01-68", "%d-%m-%y"), r_parse("01-01-68", "%d-%m-%y") ) }) test_that("%e allows leading space", { expect_equal(parse_datetime("201010 1", "%Y%m%e"), utctime(2010L, 10L, 1L, 0L, 0L, 0L, 0)) }) test_that("%OS captures partial seconds", { x <- parse_datetime("2001-01-01 00:00:01.125", "%Y-%m-%d %H:%M:%OS") expect_equal(as.POSIXlt(x)$sec, 1.125) x <- parse_datetime("2001-01-01 00:00:01.333", "%Y-%m-%d %H:%M:%OS") expect_equal(as.POSIXlt(x)$sec, 1.333, tolerance = 1e-6) }) test_that("%Y requires 4 digits", { expect_warning(parse_date("003-01-01", "%Y-%m-%d"), "parsing failure") expect_warning(parse_date("03-01-01", "%Y-%m-%d"), "parsing failure") expect_warning(parse_date("00003-01-01", "%Y-%m-%d"), "parsing failure") }) test_that("invalid dates return NA", { expect_warning(x <- parse_datetime("2010-02-30", "%Y-%m-%d")) expect_true(is.na(x)) }) test_that("failed parsing returns NA", { expect_warning({ x <- parse_datetime(c("2010-02-ab", "2010-02", "2010/02/01"), "%Y-%m-%d") }) expect_equal(is.na(x), c(TRUE, TRUE, TRUE))
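  # Hedged add-on for illustration: each failing element is also recorded in
  # the problems attribute, retrievable as a data frame via problems().
  expect_s3_class(problems(x), "data.frame")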
expect_equal(n_problems(x), 3) }) test_that("invalid specs returns NA", { expect_warning(x <- parse_datetime("2010-02-20", "%Y-%m-%m")) expect_equal(is.na(x), TRUE) expect_equal(n_problems(x), 1) }) test_that("ISO8601 partial dates are not parsed", { expect_equal(n_problems(parse_datetime("20")), 1) expect_equal(n_problems(parse_datetime("2001")), 1) expect_equal(n_problems(parse_datetime("2001-01")), 1) }) test_that("Year only gets parsed", { expect_equal(parse_datetime("2010", "%Y"), ISOdate(2010, 1, 1, 0, tz = "UTC")) expect_equal(parse_datetime("2010-06", "%Y-%m"), ISOdate(2010, 6, 1, 0, tz = "UTC")) }) test_that("%p detects AM/PM", { am <- parse_datetime(c("2015-01-01 01:00 AM", "2015-01-01 01:00 am"), "%F %I:%M %p") pm <- parse_datetime(c("2015-01-01 01:00 PM", "2015-01-01 01:00 pm"), "%F %I:%M %p") expect_equal(pm, am + 12 * 3600) expect_equal( parse_datetime("12/31/1991 12:01 AM", "%m/%d/%Y %I:%M %p"), POSIXct(694137660, "UTC") ) expect_equal( parse_datetime("12/31/1991 12:01 PM", "%m/%d/%Y %I:%M %p"), POSIXct(694180860, "UTC") ) expect_equal( parse_datetime("12/31/1991 1:01 AM", "%m/%d/%Y %I:%M %p"), POSIXct(694141260, "UTC") ) expect_warning(x <- parse_datetime( c("12/31/1991 00:01 PM", "12/31/1991 13:01 PM"), "%m/%d/%Y %I:%M %p" )) expect_equal(n_problems(x), 2) }) test_that("%b and %B are case insensitive", { ref <- parse_date("2001-01-01") expect_equal(parse_date("2001 JAN 01", "%Y %b %d"), ref) expect_equal(parse_date("2001 JANUARY 01", "%Y %B %d"), ref) }) test_that("%. requires a value", { ref <- parse_date("2001-01-01") expect_equal(parse_date("2001?01?01", "%Y%.%m%.%d"), ref) expect_warning( out <- parse_date("20010101", "%Y%.%m%.%d") ) expect_equal(n_problems(out), 1) }) test_that("%Z detects named time zones", { ref <- POSIXct(1285912800, "America/Chicago") ct <- locale(tz = "America/Chicago") expect_equal(parse_datetime("2010-10-01 01:00", locale = ct), ref) expect_equal( parse_datetime("2010-10-01 01:00 America/Chicago", "%Y-%m-%d %H:%M %Z", locale = ct), ref ) }) test_that("parse_date returns a double like as.Date()", { ref <- parse_date("2001-01-01") expect_type(parse_datetime("2001-01-01"), "double") }) test_that("parses NA/empty correctly", { expect_equal(parse_datetime(""), POSIXct(NA_real_)) expect_equal(parse_date(""), as.Date(NA)) expect_equal(parse_datetime("NA"), POSIXct(NA_real_)) expect_equal(parse_date("NA"), as.Date(NA)) expect_equal(parse_datetime("TeSt", na = "TeSt"), POSIXct(NA_real_)) expect_equal(parse_date("TeSt", na = "TeSt"), as.Date(NA)) }) # Locales ----------------------------------------------------------------- test_that("locale affects months", { jan1 <- as.Date("2010-01-01") fr <- locale("fr") expect_equal(parse_date("1 janv. 2010", "%d %b %Y", locale = fr), jan1) expect_equal(parse_date("1 janvier 2010", "%d %B %Y", locale = fr), jan1) }) test_that("locale affects day of week", { a <- parse_datetime("2010-01-01") b <- parse_date("2010-01-01") fr <- locale("fr") expect_equal(parse_datetime("Ven. 1 janv. 2010", "%a %d %b %Y", locale = fr), a) expect_equal(parse_date("Ven. 1 janv. 2010", "%a %d %b %Y", locale = fr), b) expect_warning(parse_datetime("Fri 1 janv. 1020", "%a %d %b %Y", locale = fr)) expect_warning(parse_date("Fri 1 janv.
2010", "%a %d %b %Y", locale = fr)) }) test_that("locale affects am/pm", { a <- parse_time("1:30 PM", "%H:%M %p") b <- parse_time("오후 1시 30분", "%p %H시 %M분", locale = locale("ko")) expect_equal(a, b) }) test_that("locale affects both guessing and parsing", { out <- parse_guess("01/02/2013", locale = locale(date_format = "%m/%d/%Y")) expect_equal(out, as.Date("2013-01-02")) }) test_that("na affects both guessing and parsing (#1041)", { out <- parse_guess(c("123", "NA"), na = "NA") expect_equal(out, c(123, NA_real_)) }) test_that("text re-encoded before strings are parsed", { skip_on_cran() # need to figure out why this fails skip_on_os("solaris") x <- "1 f\u00e9vrier 2010" y <- iconv(x, from = "UTF-8", to = "ISO-8859-1") feb01 <- as.Date(ISOdate(2010, 02, 01)) expect_equal( parse_date(x, "%d %B %Y", locale = locale("fr")), feb01 ) expect_equal( parse_date(y, "%d %B %Y", locale = locale("fr", encoding = "ISO-8859-1")), feb01 ) }) # Time zones ------------------------------------------------------------------ test_that("same times with different offsets parsed as same time", { # From http://en.wikipedia.org/wiki/ISO_8601#Time_offsets_from_UTC same_time <- paste("2010-02-03", c("18:30Z", "22:30+04", "1130-0700", "15:00-03:30")) parsed <- parse_datetime(same_time) expect_equal(parsed, rep(utctime(2010L, 2L, 3L, 18L, 30L, 0L, 0), 4)) }) test_that("offsets can cross date boundaries", { expect_equal( parse_datetime("2015-01-31T2000-0500"), parse_datetime("2015-02-01T0100Z") ) }) test_that("unambiguous times with and without daylight savings", { skip_on_cran() # need to figure out why this fails melb <- locale(tz = "Australia/Melbourne") # Melbourne had daylight savings in 2015 that ended the morning of 2015-04-05 expect_equal( parse_datetime(c("2015-04-04 12:00:00", "2015-04-06 12:00:00"), locale = melb), POSIXct(c(1428109200, 1428285600), "Australia/Melbourne") ) # Japan didn't have daylight savings in 2015 ja <- locale(tz = "Japan") expect_equal( parse_datetime(c("2015-04-04 12:00:00", "2015-04-06 12:00:00"), locale = ja), POSIXct(c(1428116400, 1428289200), "Japan") ) }) test_that("ambiguous times always choose the earliest time", { ny <- locale(tz = "America/New_York") format <- "%Y-%m-%d %H:%M:%S%z" expected <- as.POSIXct("1970-10-25 01:30:00-0400", tz = "America/New_York", format = format) actual <- parse_datetime("1970-10-25 01:30:00", locale = ny) expect_equal(actual, expected) }) test_that("nonexistent times return NA", { ny <- locale(tz = "America/New_York") expected <- .POSIXct(NA_real_, tz = "America/New_York") actual <- parse_datetime("1970-04-26 02:30:00", locale = ny) expect_equal(actual, expected) }) test_that("can use `tz = ''` for system time zone", { withr::local_timezone("Europe/London") system <- locale(tz = "") expected <- as.POSIXct("1970-01-01 00:00:00", tz = "Europe/London") actual <- parse_datetime("1970-01-01 00:00:00", locale = system) expect_equal(actual, expected) }) test_that("can catch faulty system time zones", { withr::local_timezone("foo") expect_error(locale(tz = ""), "Unknown TZ foo") }) # Guessing --------------------------------------------------------------------- test_that("DDDD-DD not parsed as date (i.e. 
doesn't trigger partial date match)", { expect_equal(guess_parser(c("1989-90", "1990-91")), "character") }) test_that("leading zeros don't get parsed as date without explicit separator", { expect_equal(guess_parser("00010203"), "character") expect_equal(guess_parser("0001-02-03"), "date") }) test_that("must have either two - or none", { expect_equal(guess_parser("2000-10-10"), "date") expect_equal(guess_parser("2000-1010"), "character") expect_equal(guess_parser("200010-10"), "character") expect_equal(guess_parser("20001010"), "double") }) test_that("Invalid formats error", { expect_error(parse_date("2020-11-17", "%%Y-%m-%d"), "Unsupported format %%Y-%m-%d") }) readr/tests/testthat/basic-df-singlequote.csv0000644000176200001440000000046014152512262021073 0ustar liggesusersa,b,c,d TRUE,7,0.181526642525569,'m' TRUE,2,0.833227441413328,'z' TRUE,8,0.926790483295918,'r' FALSE,10,0.375270307529718,'s' TRUE,6,0.420266286935657,'g' TRUE,3,0.435449987649918,'h' TRUE,5,0.0210941969417036,'w' FALSE,9,0.0915570755023509,'u' FALSE,1,0.756106866057962,'l' FALSE,4,0.353530979715288,NA readr/tests/testthat/test-read_log.R0000644000176200001440000000143714304131171017225 0ustar liggesuserstest_that("read_log trims whitespace", { tf <- tempfile() on.exit(unlink(tf)) writeLines( 'Nov 4 00:00:55 vrpweb1 httpd: 131.161.8.219 - - [04/Nov/2017:00:00:55 -0400] "GET /wp-includes/js/jquery/jquery-migrate.min.js?ver=1.4.1 HTTP/1.1" 200 10056 "http://www.colby.edu/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36" Nov 14 00:00:55 vrpweb1 httpd: 131.161.8.216 - - [04/Nov/2017:00:00:55 -0400] "GET /wp-content/plugins/wooslider-AxZp6o/assets/js/jquery.flexslider.min.js?ver=2.4.1-20170608 HTTP/1.1" 200 22414 "http://www.colby.edu/" "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1500.72 Safari/537.36"', tf ) res <- read_log(tf) expect_equal(res[[2]], c(4, 14)) }) readr/tests/testthat/helper.R0000644000176200001440000000206414304131171015750 0ustar liggesusers# Provide helper overriding tibble::all.equal.tbl_df as it requires dplyr # https://github.com/tidyverse/readr/pull/577 # Using this helper allows us to avoid Suggesting dplyr all.equal.tbl_df <- function(target, current, ..., check.attributes = FALSE) { all.equal.list(target, current, ..., check.attributes = check.attributes) } is_bz2_file <- function(x) { # Magic number for bz2 is "BZh" in ASCII # https://en.wikipedia.org/wiki/Bzip2#File_format identical(charToRaw("BZh"), readBin(x, n = 3, what = "raw")) } encoded <- function(x, encoding) { Encoding(x) <- encoding x } skip_if_no_clipboard <- function() { if (!clipr::clipr_available()) { testthat::skip("System clipboard is not available - skipping test.") } return(invisible(TRUE)) } skip_if_edition_second <- function() { skip_if_not(edition_first()) } skip_if_edition_first <- function() { skip_if(edition_first()) } edition_variant <- function() { paste0("edition-", edition_get()) } skip_if_edition_first_windows <- function() { if (edition_first()) skip_on_os("windows") } readr/tests/testthat/_snaps/0000755000176200001440000000000014510344025015632 5ustar liggesusersreadr/tests/testthat/_snaps/col-spec.md0000644000176200001440000000606614533445467017712 0ustar liggesusers# print(col_spec) with guess_parser Code col_spec_standardise("a,b,c\n1,2,3") Output cols( a = col_double(), b = col_double(), c = col_double() ) # print(col_spec) with collector_skip Code cols_only(a = col_integer(), c = col_integer()) Output 
cols_only( a = col_integer(), c = col_integer() ) # print(col_spec) with truncated output Code print(out, n = 2, condense = FALSE) Output cols( .default = col_character(), a = col_character(), b = col_character() # ... with 1 more columns ) # print(col_spec) works with dates Code out Output cols( a = col_date(format = "%Y-%m-%d"), b = col_date(format = ""), c = col_date(format = "") ) # print(col_spec) with unnamed columns Code col_spec_standardise(col_types = "c_c", col_names = c("a", "c")) Output cols( a = col_character(), col_skip(), c = col_character() ) # print(cols_only()) prints properly Code cols_only(a = col_character(), c = col_integer()) Output cols_only( a = col_character(), c = col_integer() ) # print(cols_condense(col_spec)) condenses the spec Code cols_condense(col_spec_standardise("a,b,c,d\n1,2,3,a")) Output cols( .default = col_double(), d = col_character() ) --- Code cols_condense(col_spec_standardise("a,b,c,d\n1,2,3,4")) Output cols( .default = col_double() ) # print(col_spec) with no columns specified Code cols() Output cols() --- Code cols(.default = col_character()) Output cols( .default = col_character() ) # print(col_spec) and condense edge cases Code print(cols(a = col_integer(), b = col_integer(), c = col_double()), n = 1, condense = TRUE, colour = FALSE) Output cols( .default = col_integer(), c = col_double() ) # print(col_spec) with colors cols( a = col_double(), b = col_integer(), c = col_logical(), d = col_character(), e = col_date(format = ""), f = col_datetime(format = ""), g = col_time(format = ""), h = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE), i = col_skip() ) # non-syntatic names are escaped Code col_spec_standardise("a b,_c,1,a`b\n1,2,3,4") Output cols( `a b` = col_double(), `_c` = col_double(), `1` = col_double(), `a\`b` = col_double() ) # long spec declarations can be formatted Code cols(a = col_factor(levels = c("apple", "pear", "banana", "peach", "apricot", "orange", "plum"), ordered = TRUE)) Output cols( a = col_factor(levels = c("apple", "pear", "banana", "peach", "apricot", "orange", "plum" ), ordered = TRUE, include_na = FALSE) ) readr/tests/testthat/_snaps/edition-2/0000755000176200001440000000000014510343777017441 5ustar liggesusersreadr/tests/testthat/_snaps/edition-2/read-csv.md0000644000176200001440000000022014533445471021460 0ustar liggesusers# comments are ignored regardless of where they appear Code out4 <- read_csv(I("x,y\n1,#comment"), comment = "#", col_types = "cc") readr/tests/testthat/_snaps/edition-2/col-spec.md0000644000176200001440000000201214533445467021467 0ustar liggesusers# options(readr.show_col_types) controls col spec printing Code out <- read_csv(readr_example("mtcars.csv")) Message Rows: 32 Columns: 11 -- Column specification -------------------------------------------------------- Delimiter: "," dbl (11): mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb i Use `spec()` to retrieve the full column specification for this data. i Specify the column types or set `show_col_types = FALSE` to quiet this message. # `show_col_types` controls col spec printing Code out <- read_csv(readr_example("mtcars.csv"), show_col_types = TRUE) Message Rows: 32 Columns: 11 -- Column specification -------------------------------------------------------- Delimiter: "," dbl (11): mpg, cyl, disp, hp, drat, wt, qsec, vs, am, gear, carb i Use `spec()` to retrieve the full column specification for this data. i Specify the column types or set `show_col_types = FALSE` to quiet this message. 
readr/tests/testthat/_snaps/edition-1/0000755000176200001440000000000014510344142017423 5ustar liggesusersreadr/tests/testthat/_snaps/edition-1/read-csv.md0000644000176200001440000000275714510344200021457 0ustar liggesusers# warning lines are correct after skipping Code out3 <- read_csv(I("v1,v2\n\n1,2\n\n3,4"), col_types = "i") Condition Warning: Unnamed `col_types` should have the same length as `col_names`. Using smaller of the two. Warning: 2 parsing failures. row col expected actual file 1 -- 1 columns 2 columns literal data 2 -- 1 columns 2 columns literal data --- Code out4 <- read_csv(I("v1,v2\n#foo\n1,2\n#bar\n3,4"), col_types = "i", comment = "#") Condition Warning: Unnamed `col_types` should have the same length as `col_names`. Using smaller of the two. Warning: 2 parsing failures. row col expected actual file 1 -- 1 columns 2 columns literal data 2 -- 1 columns 2 columns literal data # too few or extra col_types generates warnings Code out1 <- read_csv(I("v1,v2\n1,2"), col_types = "i", lazy = FALSE) Condition Warning: Unnamed `col_types` should have the same length as `col_names`. Using smaller of the two. Warning: 1 parsing failure. row col expected actual file 1 -- 1 columns 2 columns literal data # comments are ignored regardless of where they appear Code out4 <- read_csv(I("x,y\n1,#comment"), comment = "#", col_types = "cc") Condition Warning: 1 parsing failure. row col expected actual file 1 -- 2 columns 1 columns literal data readr/tests/testthat/_snaps/edition-1/col-spec.md0000644000176200001440000000214014510344176021456 0ustar liggesusers# options(readr.show_col_types) controls col spec printing Code out <- read_csv(readr_example("mtcars.csv")) Message -- Column specification -------------------------------------------------------- cols( mpg = col_double(), cyl = col_double(), disp = col_double(), hp = col_double(), drat = col_double(), wt = col_double(), qsec = col_double(), vs = col_double(), am = col_double(), gear = col_double(), carb = col_double() ) # `show_col_types` controls col spec printing Code out <- read_csv(readr_example("mtcars.csv"), show_col_types = TRUE) Message -- Column specification -------------------------------------------------------- cols( mpg = col_double(), cyl = col_double(), disp = col_double(), hp = col_double(), drat = col_double(), wt = col_double(), qsec = col_double(), vs = col_double(), am = col_double(), gear = col_double(), carb = col_double() ) readr/tests/testthat/_snaps/utils.md0000644000176200001440000000036414533445473017335 0ustar liggesusers# check_string() works Code name <- 1 check_string(name) Condition Error: ! `name` must be a string. Code check_string(name, nm = "NAME!") Condition Error: ! `NAME!` must be a string. 
readr/tests/testthat/basic-df.csv0000644000176200001440000000046014152512262016536 0ustar liggesusersa,b,c,d TRUE,7,0.181526642525569,"m" TRUE,2,0.833227441413328,"z" TRUE,8,0.926790483295918,"r" FALSE,10,0.375270307529718,"s" TRUE,6,0.420266286935657,"g" TRUE,3,0.435449987649918,"h" TRUE,5,0.0210941969417036,"w" FALSE,9,0.0915570755023509,"u" FALSE,1,0.756106866057962,"l" FALSE,4,0.353530979715288,NA readr/tests/testthat/test-eol.R0000644000176200001440000000322114174704674016244 0ustar liggesusersif (FALSE) { df <- data.frame(x = 1:3, y = letters[1:3], stringsAsFactors = FALSE) write.csv(df, test_path("eol-lf.csv"), row.names = FALSE, eol = "\n") write.csv(df, test_path("eol-cr.csv"), row.names = FALSE, eol = "\r") write.csv(df, test_path("eol-crlf.csv"), row.names = FALSE, eol = "\r\n") write.fwf <- function(x, path, ...) { write.table(x, path, row.names = FALSE, quote = FALSE) } write.fwf(df, test_path("eol-lf.txt"), row.names = FALSE, eol = "\n") write.fwf(df, test_path("eol-cr.txt"), row.names = FALSE, eol = "\r") write.fwf(df, test_path("eol-crlf.txt"), row.names = FALSE, eol = "\r\n") } test_that("read_csv standardises line breaks", { expect_equal(read_csv(test_path("eol-lf.csv"))$y, letters[1:3]) # expect_equal(read_csv(test_path("eol-cr.csv"))$y, letters[1:3]) expect_equal(read_csv(test_path("eol-crlf.csv"))$y, letters[1:3]) }) test_that("read_csv skipping works with windows newlines", { expect_equal(read_csv(test_path("eol-lf.csv"), skip = 2, col_names = FALSE)[[2]], letters[2:3]) # expect_equal(read_csv("eol-cr.csv", skip = 2, col_names = FALSE)[[2]], letters[2:3]) expect_equal(read_csv("eol-crlf.csv", skip = 2, col_names = FALSE)[[2]], letters[2:3]) }) test_that("read_lines standardises line breaks", { lf <- read_lines(test_path("eol-lf.csv")) # expect_equal(read_lines(test_path("eol-cr.csv")), lf) expect_equal(read_lines(test_path("eol-crlf.csv")), lf) }) test_that("read_fwf/read_table standardises line breaks", { expect_equal(read_table("eol-lf.txt")$y, letters[1:3]) # expect_equal(read_table("eol-cr.txt")$y, letters[1:3]) expect_equal(read_table("eol-crlf.txt")$y, letters[1:3]) }) readr/tests/testthat/eol-cr.txt0000644000176200001440000000002014152512262016263 0ustar liggesusersx y 1 a 2 b 3 c readr/tests/testthat/test-collectors.R0000644000176200001440000000251414174704674017642 0ustar liggesuserstest_that("guess for empty strings is logical", { expect_equal(guess_parser(c("", "")), "logical") }) test_that("guess for missing vector is logical", { expect_equal(guess_parser(NA_character_), "logical") }) test_that("empty + NA ignored when determining type", { expect_equal(guess_parser(c("1", "")), "double") expect_equal(guess_parser(c("1", NA)), "double") }) test_that("guess decimal commas with correct locale", { expect_equal(guess_parser("1,300"), "number") expect_equal(guess_parser("1,300", locale(decimal_mark = ",")), "double") }) # Numbers ----------------------------------------------------------------- test_that("only accept numbers with grouping mark", { expect_equal(guess_parser("1,300"), "number") expect_equal(guess_parser("1,300.00"), "number") }) # Concise collectors specification ---------------------------------------- test_that("_ or - skips column", { out1 <- read_csv(I("x,y\n1,2\n3,4"), col_types = "-i") out2 <- read_csv(I("x,y\n1,2\n3,4"), col_types = "_i") expect_equal(names(out1), "y") expect_equal(names(out2), "y") }) test_that("? 
guesses column type", { out1 <- read_csv(I("x,y\n1,2\n3,4"), col_types = "?i") expect_equal(out1$x, c(1L, 3L)) }) test_that("f parses factor (#810)", { out <- read_csv(I("x,y\na,2\nb,4"), col_types = "fi") expect_s3_class(out$x, "factor") }) readr/tests/testthat/test-melt-csv.R0000644000176200001440000001576414174704674017222 0ustar liggesuserstest_that("melt_csv type imputation and NA detection works", { skip_on_os("windows") withr::local_options(lifecycle_verbosity = "quiet") melt_data <- melt_csv("non-tabular.csv", na = "NA") expect_equal( melt_data$data_type[7:11], c("missing", "empty", "character", "integer", "double") ) }) test_that("melt_tsv works on a simple file", { withr::local_options(lifecycle_verbosity = "quiet") melt_data <- melt_tsv("a\tb\n1\t2") expect_equal(melt_data$data_type, rep(c("character", "integer"), each = 2)) }) test_that("melt_csv's 'NA' option genuinely changes the NA values", { withr::local_options(lifecycle_verbosity = "quiet") expect_equal(melt_csv("z\n", na = "z")$data_type, "missing") }) test_that("melt_csv's 'NA' option works with multiple NA values", { withr::local_options(lifecycle_verbosity = "quiet") expect_equal( melt_csv("NA\nmiss\n13", na = c("13", "miss"))$data_type, c("character", "missing", "missing") ) }) test_that('passing character() to melt_csv\'s "NA" option reads "" correctly', { withr::local_options(lifecycle_verbosity = "quiet") expect_equal(melt_csv("foo\n", na = character())$value, "foo") }) test_that("passing \"\" to melt_csv's 'NA' option reads \"\" correctly", { withr::local_options(lifecycle_verbosity = "quiet") expect_equal( melt_csv("foo,bar\nfoo,\n", na = "")$value, c("foo", "bar", "foo", NA) ) }) test_that("changing melt_csv's 'quote' argument works correctly", { withr::local_options(lifecycle_verbosity = "quiet") test_data <- melt_csv("basic-df.csv") test_data_singlequote <- melt_csv("basic-df-singlequote.csv", quote = "'") expect_identical(test_data, test_data_singlequote) }) test_that("melt_csv's 'skip' option allows for skipping", { withr::local_options(lifecycle_verbosity = "quiet") test_data <- melt_csv("basic-df.csv", skip = 1) expect_equal(nrow(test_data), 40) }) test_that("melt_csv's 'n_max' allows for a maximum number of records and does not corrupt any", { withr::local_options(lifecycle_verbosity = "quiet") test_data <- melt_csv("basic-df.csv", n_max = 7) expect_equal(nrow(test_data), 28) expect_equal(sum(test_data$data_type == "missing"), 0) }) test_that("can read more than 100 columns", { withr::local_options(lifecycle_verbosity = "quiet") set.seed(2015 - 3 - 13) x <- as.data.frame(matrix(rbinom(300, 2, .5), nrow = 2)) y <- format_csv(x) expect_equal(max(melt_csv(y)$col), 150) }) test_that("encoding affects text", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_csv("enc-iso-8859-1.txt", locale = locale(encoding = "ISO-8859-1")) expect_identical(x$value[2], "\u00e9l\u00e8ve") }) test_that("nuls are dropped with a warning", { withr::local_options(lifecycle_verbosity = "quiet") expect_warning(x <- melt_csv("raw.csv")) expect_equal(readr:::n_problems(x), 1) expect_equal(x$value[3], "ab") }) test_that("can read from the clipboard", { withr::local_options(lifecycle_verbosity = "quiet") skip_on_cran() skip_if_no_clipboard() clipr::write_clip("a,b,c\n1,2,3") expect_identical(melt_csv(clipboard()), melt_csv("a,b,c\n1,2,3")) }) test_that("can read from a multi-line character vector", { withr::local_options(lifecycle_verbosity = "quiet") expect_identical(max(melt_csv(c("a,b,c", "1,2,3"))$row), 2) }) # Column
warnings --------------------------------------------------------- test_that("missing lines are not skipped", { withr::local_options(lifecycle_verbosity = "quiet") # first expect_equal(max(melt_csv("a,b\n\n\n1,2")$row), 4) # middle expect_equal(max(melt_csv("a,b\n1,2\n\n\n2,3\n")$row), 5) # last (trailing \n is ignored) expect_equal(max(melt_csv("a,b\n1,2\n\n\n")$row), 4) }) # read_csv2 --------------------------------------------------------------- test_that("decimal mark automatically set to ,", { withr::local_options(lifecycle_verbosity = "quiet") expect_message( x <- melt_csv2("x\n1,23"), if (default_locale()$decimal_mark == ".") "decimal .*grouping .*mark" else NA ) expect_equal(x$data_type[2], "double") }) # Zero rows --------------------------------------------------------------- test_that("n_max 0 gives zero row data frame", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_csv("a,b\n1,2", n_max = 0) expect_equal(dim(x), c(0, 4)) }) # Comments ---------------------------------------------------------------- test_that("comments are ignored regardless of where they appear", { withr::local_options(lifecycle_verbosity = "quiet") out1 <- melt_csv("x\n1#comment", comment = "#") out2 <- melt_csv("x\n1#comment\n#comment", comment = "#") out3 <- melt_csv('x\n"1"#comment', comment = "#") chk1 <- tibble::tibble( row = c(1, 2), col = c(1, 1), data_type = c("character", "integer"), value = c("x", "1") ) expect_true(all.equal(chk1, out1)) expect_true(all.equal(chk1, out2)) expect_true(all.equal(chk1, out3)) out5 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3#,B2,C2\nA4,A5,A6", comment = "#") out6 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3,#B2,C2\nA4,A5,A6", comment = "#") out7 <- melt_csv("x1,x2,x3\nA2,B2,C2\nA3,#B2,C2\n#comment\nA4,A5,A6", comment = "#") chk2 <- tibble::tibble( row = c(1, 1, 1, 2, 2, 2, 3, 4, 4, 4), col = c(1, 2, 3, 1, 2, 3, 1, 1, 2, 3), data_type = "character", value = c("x1", "x2", "x3", "A2", "B2", "C2", "A3", "A4", "A5", "A6") ) expect_true(all.equal(chk2, out5)) expect_true(all.equal(chk2, out6)) expect_true(all.equal(chk2, out7)) }) test_that("escaped/quoted comments are ignored", { withr::local_options(lifecycle_verbosity = "quiet") out1 <- melt_delim("x\n\\#", comment = "#", delim = ",", escape_backslash = TRUE, escape_double = FALSE ) out2 <- melt_csv('x\n"#"', comment = "#") expect_equal(out1$value[2], "#") expect_equal(out2$value[2], "#") }) test_that("leading comments are ignored", { withr::local_options(lifecycle_verbosity = "quiet") out <- melt_csv("#a\n#b\nx\n1", comment = "#") expect_equal(nrow(out), 2) expect_equal(out$value[2], "1") }) test_that("skip respects comments", { withr::local_options(lifecycle_verbosity = "quiet") melt_x <- function(...) { melt_csv("#a\nb\nc", ...)$value } expect_equal(melt_x(), c("#a", "b", "c")) expect_equal(melt_x(skip = 1), c("b", "c")) expect_equal(melt_x(comment = "#"), c("b", "c")) expect_equal(melt_x(comment = "#", skip = 2), c("c")) }) test_that("melt_csv returns a four-col zero-row data.frame on an empty file", { withr::local_options(lifecycle_verbosity = "quiet") expect_equal(dim(melt_csv("empty-file")), c(0, 4)) }) test_that("melt_delim errors on length 0 delimiter", { withr::local_options(lifecycle_verbosity = "quiet") expect_error( melt_delim("a b\n1 2\n", delim = ""), "`delim` must be at least one character, use `melt_table\\(\\)` for whitespace delimited input\\." 
) }) test_that("melt_csv handles whitespace between delimiters and quoted fields", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_csv('1, \"hi,there\"\n3,4') expect_equal(x$value[2:3], c("hi,there", "3")) }) readr/tests/testthat/enc-iso-8859-1.txt0000644000176200001440000000001714152512262017216 0ustar liggesusersfranais lve readr/tests/testthat/test-melt-table.R0000644000176200001440000000627314174704674017525 0ustar liggesusers# melt_table ------------------------------------------------------------------- test_that("melt_table silently reads ragged last column", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_table("foo bar\n1 2\n3 4\n5 6\n") expect_equal(x$value[-1:-2], as.character(1:6)) }) test_that("melt_table skips all comment lines", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_table("foo bar\n1 2\n3 4\n5 6\n") y <- melt_table("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") expect_equal(x, y) }) test_that("missing lines are not skipped", { withr::local_options(lifecycle_verbosity = "quiet") # first expect_equal(max(melt_table("a b\n\n\n12 34")$row), 4) # middle expect_equal(max(melt_table("a b\n12 34\n\n\n23 45")$row), 5) # last (trailing \n is ignored) expect_equal(max(melt_table("a b\n12 34\n\n\n")$row), 4) }) test_that("melt_table can read from a pipe", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_table(pipe("echo a b c && echo 1 2 3 && echo 4 5 6")) expect_equal(x$value[-1:-3], as.character(1:6)) }) test_that("melt_table can read a truncated file without crashing", { withr::local_options(lifecycle_verbosity = "quiet") expect_warning(expect_error(melt_table("table-crash"), NA)) }) test_that("melt_table returns an empty data.frame on an empty file", { withr::local_options(lifecycle_verbosity = "quiet") empty_df <- tibble::tibble( row = double(), col = double(), data_type = character(), value = character() ) expect_true(all.equal(melt_table("empty-file"), empty_df)) }) # melt_table2 ------------------------------------------------------------------- test_that("melt_table2 silently reads ragged columns", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_table2("foo bar\n1 2\n3 4\n5 6\n") expect_equal(x$value[-1:-2], as.character(1:6)) }) test_that("melt_table2 skips all comment lines", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_table2("foo bar\n1 2\n3 4\n5 6\n") y <- melt_table2("#comment1\n#comment2\nfoo bar\n1 2\n3 4\n5 6\n", comment = "#") expect_equal(x, y) }) test_that("melt_table2 can read from a pipe", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_table2(pipe("echo a b c&& echo 1 2 3&& echo 4 5 6")) expect_equal(x$value[-1:-3], as.character(1:6)) }) test_that("melt_table2 does not duplicate header rows for leading whitespace", { withr::local_options(lifecycle_verbosity = "quiet") x <- melt_table2("foo bar\n1 2\n") expect_equal(nrow(x), 4L) expect_equal(x$value[-1:-2], as.character(1:2)) }) test_that("melt_table2 ignores blank lines at the end of a file", { withr::local_options(lifecycle_verbosity = "quiet") expect_warning(x <- melt_table2("x y\n1 2\n\n"), NA) expect_equal(nrow(x), 5L) expect_equal(x$value[3:4], as.character(1:2)) }) test_that("melt_table2 returns an empty data.frame on an empty file", { withr::local_options(lifecycle_verbosity = "quiet") empty_df <- tibble::tibble( row = double(), col = double(), data_type = character(), value = character() ) expect_true(all.equal(melt_table2("empty-file"), empty_df)) }) 
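# Editorial note (hedged): the melt_*() functions are superseded in readr and
# are maintained in the meltr package, which is why every test above quiets
# lifecycle warnings via withr::local_options(lifecycle_verbosity = "quiet").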
readr/tests/testthat/test-read-chunked.R0000644000176200001440000001276314304131171020007 0ustar liggesuserstest_that("read_lines_chunked", { file <- readr_example("mtcars.csv") num_rows <- length(readLines(file)) get_sizes <- function(data, pos) sizes[[length(sizes) + 1]] <<- length(data) # Full file in one chunk sizes <- list() read_lines_chunked(file, get_sizes) expect_equal(num_rows, sizes[[1]]) # Each line separately sizes <- list() read_lines_chunked(file, get_sizes, chunk_size = 1) expect_true(all(sizes == 1)) expect_equal(num_rows, length(sizes)) # In chunks of 5 sizes <- list() read_lines_chunked(file, get_sizes, chunk_size = 5) expect_true(all(sizes[1:6] == 5)) expect_true(all(sizes[[7]] == 3)) # Halting early get_sizes_stop <- function(data, pos) { sizes[[length(sizes) + 1]] <<- length(data) if (pos >= 5) { return(FALSE) } } sizes <- list() read_lines_chunked(file, get_sizes_stop, chunk_size = 5) expect_true(length(sizes) == 2) expect_true(all(sizes[1:2] == 5)) }) test_that("read_lines_raw_chunked", { file <- readr_example("mtcars.csv") num_rows <- length(readLines(file)) get_sizes <- function(data, pos) sizes[[length(sizes) + 1]] <<- length(data) # Full file in one chunk sizes <- list() read_lines_raw_chunked(file, get_sizes) expect_equal(num_rows, sizes[[1]]) # Each line separately sizes <- list() read_lines_raw_chunked(file, get_sizes, chunk_size = 1) expect_true(all(sizes == 1)) expect_equal(num_rows, length(sizes)) # In chunks of 5 sizes <- list() read_lines_raw_chunked(file, get_sizes, chunk_size = 5) expect_true(all(sizes[1:6] == 5)) expect_true(all(sizes[[7]] == 3)) # Halting early get_sizes_stop <- function(data, pos) { sizes[[length(sizes) + 1]] <<- length(data) if (pos >= 5) { return(FALSE) } } sizes <- list() read_lines_raw_chunked(file, get_sizes_stop, chunk_size = 5) expect_true(length(sizes) == 2) expect_true(all(sizes[1:2] == 5)) }) test_that("read_delim_chunked", { file <- readr_example("mtcars.csv") unchunked <- read_csv(file) get_dims <- function(data, pos) dims[[length(dims) + 1]] <<- dim(data) # Full file in one chunk dims <- list() read_csv_chunked(file, get_dims) expect_equal(dim(unchunked), dims[[1]]) # Each line separately dims <- list() read_csv_chunked(file, get_dims, chunk_size = 1) expect_true(all(vapply(dims[1:6], identical, logical(1), c(1L, 11L)))) expect_equal(nrow(unchunked), length(dims)) # In chunks of 5 dims <- list() read_csv_chunked(file, get_dims, chunk_size = 5) expect_true(all(vapply(dims[1:6], identical, logical(1), c(5L, 11L)))) expect_true(identical(dims[[7]], c(2L, 11L))) # In chunks of 5 with read_delim dims <- list() read_delim_chunked(file, delim = ",", get_dims, chunk_size = 5) expect_true(all(vapply(dims[1:6], identical, logical(1), c(5L, 11L)))) expect_true(identical(dims[[7]], c(2L, 11L))) # Halting early get_dims_stop <- function(data, pos) { dims[[length(dims) + 1]] <<- dim(data) if (pos >= 5) { return(FALSE) } } dims <- list() read_csv_chunked(file, get_dims_stop, chunk_size = 5) expect_true(length(dims) == 2) expect_true(all(vapply(dims[1:2], identical, logical(1), c(5L, 11L)))) }) test_that("DataFrameCallback works as intended", { f <- readr_example("mtcars.csv") out0 <- subset(read_csv(f), gear == 3) attr(out0, "problems") <- NULL fun3 <- DataFrameCallback$new(function(x, pos) subset(x, gear == 3)) out1 <- read_csv_chunked(f, fun3) # Need to set guess_max higher than 1 to guess correct column types out2 <- read_csv_chunked(f, fun3, chunk_size = 1, guess_max = 10) out3 <- read_csv_chunked(f, fun3, chunk_size = 10) 
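  # Hedged note: DataFrameCallback row-binds whatever the callback returns for
  # each chunk, so every chunk size above must reproduce the unchunked result.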
expect_true(all.equal(out0, out1)) expect_true(all.equal(out0, out2)) expect_true(all.equal(out0, out3)) # No matching rows out0 <- subset(read_csv(f), gear == 5) attr(out0, "problems") <- NULL fun5 <- DataFrameCallback$new(function(x, pos) subset(x, gear == 5)) out1 <- read_csv_chunked(f, fun5) # Need to set guess_max higher than 1 to guess correct column types out2 <- read_csv_chunked(f, fun5, chunk_size = 1, guess_max = 10) out3 <- read_csv_chunked(f, fun5, chunk_size = 10) expect_true(all.equal(out0, out1)) expect_true(all.equal(out0, out2)) expect_true(all.equal(out0, out3)) }) test_that("ListCallback works as intended", { f <- readr_example("mtcars.csv") out0 <- read_csv(f) fun <- ListCallback$new(function(x, pos) x[["mpg"]]) out1 <- read_csv_chunked(f, fun, chunk_size = 10) expect_equal(out0[["mpg"]], unlist(out1)) }) test_that("AccumulateCallback works as intended", { f <- readr_example("mtcars.csv") out0 <- read_csv(f) min_chunks <- function(x, pos, acc) { f <- function(x) { x[order(x$wt), ][1, ] } if (is.null(acc)) { acc <- data.frame() } f(rbind(x, acc)) } fun1 <- AccumulateCallback$new(min_chunks) out1 <- read_csv_chunked(f, fun1, chunk_size = 10) expect_equal(min_chunks(out0, acc = NULL), out1) sum_chunks <- function(x, pos, acc) { sum(x$wt) + acc } fun2 <- AccumulateCallback$new(sum_chunks, acc = 0) out2 <- read_csv_chunked(f, fun2, chunk_size = 10) expect_equal(sum_chunks(out0, acc = 0), out2) expect_error( AccumulateCallback$new(function(x, i) x), "`callback` must have three or more arguments" ) }) test_that("Chunks include their spec (#1143)", { res <- read_csv_chunked(readr_example("mtcars.csv"), callback = ListCallback$new(function(x, pos) spec(x)), chunk_size = 20 ) expect_equal(res[[1]]$cols, spec_csv(readr_example("mtcars.csv"))$cols) }) readr/tests/testthat/test-utils.R0000644000176200001440000000036214304131171016605 0ustar liggesuserstest_that("check_string() works", { expect_null(check_string("string")) expect_null(check_string(NULL, optional = TRUE)) expect_snapshot(error = TRUE, { name <- 1 check_string(name) check_string(name, nm = "NAME!") }) }) readr/tests/testthat/test_list_col_name.csv0000644000176200001440000000654214174704674020762 0ustar liggesusersx,y A,1 B,foo C,2:9 D,"list(Sepal.Length = c(5.1, 4.9, 4.7, 4.6, 5, 5.4, 4.6, 5, 4.4, 4.9, 5.4, 4.8, 4.8, 4.3, 5.8, 5.7, 5.4, 5.1, 5.7, 5.1, 5.4, 5.1, 4.6, 5.1, 4.8, 5, 5, 5.2, 5.2, 4.7, 4.8, 5.4, 5.2, 5.5, 4.9, 5, 5.5, 4.9, 4.4, 5.1, 5, 4.5, 4.4, 5, 5.1, 4.8, 5.1, 4.6, 5.3, 5, 7, 6.4, 6.9, 5.5, 6.5, 5.7, 6.3, 4.9, 6.6, 5.2, 5, 5.9, 6, 6.1, 5.6, 6.7, 5.6, 5.8, 6.2, 5.6, 5.9, 6.1, 6.3, 6.1, 6.4, 6.6, 6.8, 6.7, 6, 5.7, 5.5, 5.5, 5.8, 6, 5.4, 6, 6.7, 6.3, 5.6, 5.5, 5.5, 6.1, 5.8, 5, 5.6, 5.7, 5.7, 6.2, 5.1, 5.7, 6.3, 5.8, 7.1, 6.3, 6.5, 7.6, 4.9, 7.3, 6.7, 7.2, 6.5, 6.4, 6.8, 5.7, 5.8, 6.4, 6.5, 7.7, 7.7, 6, 6.9, 5.6, 7.7, 6.3, 6.7, 7.2, 6.2, 6.1, 6.4, 7.2, 7.4, 7.9, 6.4, 6.3, 6.1, 7.7, 6.3, 6.4, 6, 6.9, 6.7, 6.9, 5.8, 6.8, 6.7, 6.7, 6.3, 6.5, 6.2, 5.9), Sepal.Width = c(3.5, 3, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7, 3.4, 3, 3, 4, 4.4, 3.9, 3.5, 3.8, 3.8, 3.4, 3.7, 3.6, 3.3, 3.4, 3, 3.4, 3.5, 3.4, 3.2, 3.1, 3.4, 4.1, 4.2, 3.1, 3.2, 3.5, 3.6, 3, 3.4, 3.5, 2.3, 3.2, 3.5, 3.8, 3, 3.8, 3.2, 3.7, 3.3, 3.2, 3.2, 3.1, 2.3, 2.8, 2.8, 3.3, 2.4, 2.9, 2.7, 2, 3, 2.2, 2.9, 2.9, 3.1, 3, 2.7, 2.2, 2.5, 3.2, 2.8, 2.5, 2.8, 2.9, 3, 2.8, 3, 2.9, 2.6, 2.4, 2.4, 2.7, 2.7, 3, 3.4, 3.1, 2.3, 3, 2.5, 2.6, 3, 2.6, 2.3, 2.7, 3, 2.9, 2.9, 2.5, 2.8, 3.3, 2.7, 3, 2.9, 3, 3, 2.5, 2.9, 2.5, 3.6, 3.2, 2.7, 3, 2.5, 2.8, 3.2, 3, 3.8, 2.6, 
2.2, 3.2, 2.8, 2.8, 2.7, 3.3, 3.2, 2.8, 3, 2.8, 3, 2.8, 3.8, 2.8, 2.8, 2.6, 3, 3.4, 3.1, 3, 3.1, 3.1, 3.1, 2.7, 3.2, 3.3, 3, 2.5, 3, 3.4, 3), Petal.Length = c(1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5, 1.6, 1.4, 1.1, 1.2, 1.5, 1.3, 1.4, 1.7, 1.5, 1.7, 1.5, 1, 1.7, 1.9, 1.6, 1.6, 1.5, 1.4, 1.6, 1.6, 1.5, 1.5, 1.4, 1.5, 1.2, 1.3, 1.4, 1.3, 1.5, 1.3, 1.3, 1.3, 1.6, 1.9, 1.4, 1.6, 1.4, 1.5, 1.4, 4.7, 4.5, 4.9, 4, 4.6, 4.5, 4.7, 3.3, 4.6, 3.9, 3.5, 4.2, 4, 4.7, 3.6, 4.4, 4.5, 4.1, 4.5, 3.9, 4.8, 4, 4.9, 4.7, 4.3, 4.4, 4.8, 5, 4.5, 3.5, 3.8, 3.7, 3.9, 5.1, 4.5, 4.5, 4.7, 4.4, 4.1, 4, 4.4, 4.6, 4, 3.3, 4.2, 4.2, 4.2, 4.3, 3, 4.1, 6, 5.1, 5.9, 5.6, 5.8, 6.6, 4.5, 6.3, 5.8, 6.1, 5.1, 5.3, 5.5, 5, 5.1, 5.3, 5.5, 6.7, 6.9, 5, 5.7, 4.9, 6.7, 4.9, 5.7, 6, 4.8, 4.9, 5.6, 5.8, 6.1, 6.4, 5.6, 5.1, 5.6, 6.1, 5.6, 5.5, 4.8, 5.4, 5.6, 5.1, 5.1, 5.9, 5.7, 5.2, 5, 5.2, 5.4, 5.1), Petal.Width = c(0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2, 0.2, 0.1, 0.1, 0.2, 0.4, 0.4, 0.3, 0.3, 0.3, 0.2, 0.4, 0.2, 0.5, 0.2, 0.2, 0.4, 0.2, 0.2, 0.2, 0.2, 0.4, 0.1, 0.2, 0.2, 0.2, 0.2, 0.1, 0.2, 0.2, 0.3, 0.3, 0.2, 0.6, 0.4, 0.3, 0.2, 0.2, 0.2, 0.2, 1.4, 1.5, 1.5, 1.3, 1.5, 1.3, 1.6, 1, 1.3, 1.4, 1, 1.5, 1, 1.4, 1.3, 1.4, 1.5, 1, 1.5, 1.1, 1.8, 1.3, 1.5, 1.2, 1.3, 1.4, 1.4, 1.7, 1.5, 1, 1.1, 1, 1.2, 1.6, 1.5, 1.6, 1.5, 1.3, 1.3, 1.3, 1.2, 1.4, 1.2, 1, 1.3, 1.2, 1.3, 1.3, 1.1, 1.3, 2.5, 1.9, 2.1, 1.8, 2.2, 2.1, 1.7, 1.8, 1.8, 2.5, 2, 1.9, 2.1, 2, 2.4, 2.3, 1.8, 2.2, 2.3, 1.5, 2.3, 2, 2, 1.8, 2.1, 1.8, 1.8, 1.8, 2.1, 1.6, 1.9, 2, 2.2, 1.5, 1.4, 2.3, 2.4, 1.8, 1.8, 2.1, 2.4, 2.3, 1.9, 2.3, 2.5, 2.3, 1.9, 2, 2.3, 1.8), Species = c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3))" readr/tests/testthat/test-parsing-logical.R0000644000176200001440000000220514174357220020530 0ustar liggesuserstest_that("TRUE and FALSE parsed", { expect_equal(parse_logical(c("TRUE", "FALSE")), c(TRUE, FALSE)) }) test_that("true and false parsed", { expect_equal(parse_logical(c("true", "false")), c(TRUE, FALSE)) }) test_that("True and False parsed", { expect_equal(parse_logical(c("True", "False")), c(TRUE, FALSE)) }) test_that("T and F parsed", { expect_equal(parse_logical(c("T", "F")), c(TRUE, FALSE)) }) test_that("t and f parsed", { expect_equal(parse_logical(c("t", "f")), c(TRUE, FALSE)) }) test_that("1 and 0 parsed", { expect_equal(parse_logical(c("1", "0")), c(TRUE, FALSE)) }) test_that("true and false guessed", { expect_equal(guess_parser(c("true", "false")), "logical") expect_equal(guess_parser(c("TRUE", "FALSE")), "logical") expect_equal(guess_parser(c("T", "F")), "logical") expect_equal(guess_parser(c("t", "f")), "logical") expect_equal(guess_parser(c("t", "f", "z")), "character") }) test_that("other values generate warnings", { expect_warning(out <- parse_logical(c("A", "AB", "ABCD", "ABCDE", "NA"))) probs <- attr(out, "problems") expect_equal(c(out), rep(NA, 5)) expect_equal(n_problems(out), 4) }) readr/tests/testthat/test-write-lines.R0000644000176200001440000000624214174704674017735 0ustar liggesuserstest_that("write_lines uses UTF-8 encoding", { skip_on_os("solaris") tmp <- tempfile() on.exit(unlink(tmp)) 
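  # The \uXXXX escapes below spell "francais"/"eleve" with accented characters;
  # using escapes keeps this test file itself pure ASCII in any editor encoding.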
write_lines(c("fran\u00e7ais", "\u00e9l\u00e8ve"), tmp) x <- read_lines(tmp, locale = locale(encoding = "UTF-8")) expect_equal(x, c("fran\u00e7ais", "\u00e9l\u00e8ve")) }) test_that("write_lines writes an empty file if given an empty character vector", { tmp <- tempfile() on.exit(unlink(tmp)) write_lines(character(), tmp) expect_true(empty_file(tmp)) }) test_that("write_lines respects the NA argument", { tmp <- tempfile() tmp2 <- tempfile() on.exit(unlink(c(tmp, tmp2))) write_lines(c("first", NA_character_, "last"), tmp) expect_equal(read_lines(tmp), c("first", "NA", "last")) write_lines(c("first", NA_character_, "last"), tmp2, na = "test") expect_equal(read_lines(tmp2), c("first", "test", "last")) }) test_that("write_lines can append to a file", { tmp <- tempfile() on.exit(unlink(tmp)) write_lines(c("first", "last"), tmp) write_lines(c("first", "last"), tmp, append = TRUE) expect_equal(read_lines(tmp), c("first", "last", "first", "last")) }) test_that("write_lines accepts a list of raws", { x <- lapply(seq_along(1:10), function(x) charToRaw(paste0(collapse = "", sample(letters, size = sample(0:22, 1))))) tmp <- tempfile() on.exit(unlink(tmp)) write_lines(x, tmp) expect_equal(read_lines(tmp), vapply(x, rawToChar, character(1))) }) # write_file ------------------------------------------------------------------ test_that("write_file round trips", { tmp <- tempfile() on.exit(unlink(tmp)) x <- "foo\nbar" write_file(x, tmp) expect_equal(read_file(tmp), x) }) test_that("write_file round trips with an empty vector", { tmp <- tempfile() on.exit(unlink(tmp)) x <- "" write_file(x, tmp) expect_equal(read_file(tmp), x) }) test_that("write_file errors if given a character vector of length != 1", { tmp <- tempfile() expect_error(write_file(character(), tmp)) expect_error(write_file(c("foo", "bar"), tmp)) }) test_that("write_file with raw round trips", { tmp <- tempfile() on.exit(unlink(tmp)) x <- charToRaw("foo\nbar") write_file(x, tmp) expect_equal(read_file_raw(tmp), x) }) test_that("write_file with raw round trips with an empty vector", { tmp <- tempfile() on.exit(unlink(tmp)) x <- raw() write_file(x, tmp) expect_equal(read_file_raw(tmp), x) }) test_that("write_lines can write to compressed files", { filename <- file.path(tempdir(), "foo.bz2") on.exit(unlink(filename)) write_lines(c("foo", "bar", "baz"), filename) expect_true(is_bz2_file(filename)) expect_equal(c("foo", "bar", "baz"), read_lines(filename)) }) test_that("write_lines can write CRLF files", { filename <- tempfile() on.exit(unlink(filename)) write_lines(c("a", "b", "c"), filename, sep = "\r\n") expect_identical(charToRaw("a\r\nb\r\nc\r\n"), readBin(filename, n = 9, what = "raw")) }) test_that("write_file can write to compressed files", { mt <- read_file(readr_example("mtcars.csv.bz2")) filename <- file.path(tempdir(), "mtcars.csv.bz2") on.exit(unlink(filename)) write_file(mt, filename) expect_true(is_bz2_file(filename)) expect_equal(mt, read_file(filename)) }) readr/tests/testthat/eol-cr.csv0000644000176200001440000000003214152512262016242 0ustar liggesusers"x","y" 1,"a" 2,"b" 3,"c" readr/tests/testthat/non-tabular.csv0000644000176200001440000000004514152512262017307 0ustar liggesusersa,"b",'c' ,,NA,"NA", a,1,1.0,1.1,1e3 readr/tests/testthat/eol-cr.txt.xz0000644000176200001440000000011014152512262016723 0ustar liggesusers7zXZִF!t/x y 1 a 2 b 3 c 4xT(( l`}YZreadr/tests/testthat/colour-test0000644000176200001440000000053714152512262016561 0ustar liggesuserscols( a = col_double(), b = col_integer(), c = col_logical(), d = col_character(), e
= col_date(format = ""), f = col_datetime(format = ""), g = col_time(format = ""), h = col_factor(levels = NULL, ordered = FALSE, include_na = FALSE), i = col_skip() ) readr/tests/testthat/test-parsing-time.R0000644000176200001440000000337614174704674020077 0ustar liggesuserstest_that("default format captures cases", { late_night <- hms::hms(seconds = 22 * 3600 + 20 * 60) expect_equal(parse_time("22:20"), late_night) expect_equal(parse_time("10:20 pm"), late_night) expect_equal(parse_time("22:20:05"), hms::as_hms(late_night + 5)) expect_equal(parse_time("10:20:05 pm"), hms::as_hms(late_night + 5)) }) test_that("twelve o'clock is parsed properly", { morning <- hms::hms(seconds = 0 * 3600 + 1 * 60) midday <- hms::hms(seconds = 12 * 3600 + 1 * 60) expect_equal(parse_time("12:01 AM"), morning) expect_equal(parse_time("12:01 PM"), midday) expect_equal(parse_time("12:01"), midday) }) test_that("accepts single digit hour", { early_morn <- hms::hms(seconds = 1 * 3600 + 20 * 60) expect_equal(parse_time("1:20 am"), early_morn) }) test_that("parses NA/empty correctly", { out <- parse_time(c("NA", "")) exp <- hms::hms(seconds = c(NA_real_, NA_real_)) expect_equal(out, exp) expect_equal( parse_time("TeSt", na = "TeSt"), hms::hms(seconds = NA_real_) ) }) test_that("times are guessed as expected", { expect_equal(guess_parser("12:01"), "time") expect_equal( guess_parser("12:01:01"), "time" ) expect_equal( guess_parser(c("04:00:00", "04:30:00", "14:00:22")), "time" ) expect_equal( guess_parser("25:01:01"), "time" ) }) test_that("durations", { expect_warning(parse_time("25:00:00", format = "%H:%M:%S")) expect_equal(parse_time("25:00:00", format = "%h:%M:%S"), hms::hms(hours = 25)) expect_equal(parse_time("1000000000:00:00", format = "%h:%M:%S"), hms::hms(hours = 1e9)) expect_equal(parse_time("-1:23:45", format = "%h:%M:%S"), hms::as_hms(-hms::hms(45, 23, 1))) expect_equal(parse_time("-1:23:45.67", format = "%h:%M:%OS"), hms::as_hms(-hms::hms(45.67, 23, 1))) }) readr/tests/testthat/test-read-builtin.R0000644000176200001440000000116414174704674020050 0ustar liggesuserstest_that("read_builtin works", { # fails with unquoted symbol expect_error(read_builtin(AirPassengers, "datasets")) # fails with an error if data set doesn't exist in package expect_error(read_builtin("nasa", "readr")) # fails with error if the dataset namespace is not attached if (!"dplyr" %in% loadedNamespaces()) { expect_error(read_builtin("starwars")) } # works if data set exists in package expect_true(is.data.frame(read_builtin("BOD", "datasets"))) # works if data set package is loaded if ("datasets" %in% loadedNamespaces()) { expect_true(is.data.frame(read_builtin("BOD"))) } }) readr/tests/first_edition.R0000644000176200001440000000010714174704674015512 0ustar liggesuserslibrary(testthat) library(readr) local_edition(1) test_check("readr") readr/tests/second_edition.R0000644000176200001440000000006614174704674015642 0ustar liggesuserslibrary(testthat) library(readr) test_check("readr") readr/src/0000755000176200001440000000000014547603065012150 5ustar liggesusersreadr/src/mio.h0000644000176200001440000016427314174704674013126 0ustar liggesusers/* Copyright 2017 https://github.com/mandreyel * * Permission is hereby granted, free of charge, to any person obtaining a copy of this * software and associated documentation files (the "Software"), to deal in the Software * without restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to * permit 
persons to whom the Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be included in all copies * or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MIO_MMAP_HEADER #define MIO_MMAP_HEADER // #include "mio/page.hpp" /* Copyright 2017 https://github.com/mandreyel * * Permission is hereby granted, free of charge, to any person obtaining a copy of this * software and associated documentation files (the "Software"), to deal in the Software * without restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be included in all copies * or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MIO_PAGE_HEADER #define MIO_PAGE_HEADER #ifdef _WIN32 # include #else # include #endif namespace mio { /** * This is used by `basic_mmap` to determine whether to create a read-only or * a read-write memory mapping. */ enum class access_mode { read, write }; /** * Determines the operating system's page allocation granularity. * * On the first call to this function, it invokes the operating system specific syscall * to determine the page size, caches the value, and returns it. Any subsequent call to * this function serves the cached value, so no further syscalls are made. */ inline size_t page_size() { static const size_t page_size = [] { #ifdef _WIN32 SYSTEM_INFO SystemInfo; GetSystemInfo(&SystemInfo); return SystemInfo.dwAllocationGranularity; #else return sysconf(_SC_PAGE_SIZE); #endif }(); return page_size; } /** * Alligns `offset` to the operating's system page size such that it subtracts the * difference until the nearest page boundary before `offset`, or does nothing if * `offset` is already page aligned. */ inline size_t make_offset_page_aligned(size_t offset) noexcept { const size_t page_size_ = page_size(); // Use integer division to round down to the nearest page alignment. 
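// Worked example of the rounding: with a 4096-byte granularity,
// make_offset_page_aligned(0) == 0, (4095) == 0, (4096) == 4096, and
// (10000) == 8192 (2 * 4096) -- always the largest page boundary that does
// not exceed `offset`.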
return offset / page_size_ * page_size_; } } // namespace mio #endif // MIO_PAGE_HEADER #include #include #include #include #ifdef _WIN32 # ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN # endif // WIN32_LEAN_AND_MEAN # include #else // ifdef _WIN32 # define INVALID_HANDLE_VALUE -1 #endif // ifdef _WIN32 namespace mio { // This value may be provided as the `length` parameter to the constructor or // `map`, in which case a memory mapping of the entire file is created. enum { map_entire_file = 0 }; #ifdef _WIN32 using file_handle_type = HANDLE; #else using file_handle_type = int; #endif // This value represents an invalid file handle type. This can be used to // determine whether `basic_mmap::file_handle` is valid, for example. const static file_handle_type invalid_handle = INVALID_HANDLE_VALUE; template struct basic_mmap { using value_type = ByteT; using size_type = size_t; using reference = value_type&; using const_reference = const value_type&; using pointer = value_type*; using const_pointer = const value_type*; using difference_type = std::ptrdiff_t; using iterator = pointer; using const_iterator = const_pointer; using reverse_iterator = std::reverse_iterator; using const_reverse_iterator = std::reverse_iterator; using iterator_category = std::random_access_iterator_tag; using handle_type = file_handle_type; static_assert(sizeof(ByteT) == sizeof(char), "ByteT must be the same size as char."); private: // Points to the first requested byte, and not to the actual start of the mapping. pointer data_ = nullptr; // Length, in bytes, requested by user, which may not be the length of the full // mapping, and the entire length of the full mapping. size_type length_ = 0; size_type mapped_length_ = 0; // Letting user map a file using both an existing file handle and a path introcudes // On POSIX, we only need a file handle to create a mapping, while on Windows // systems the file handle is necessary to retrieve a file mapping handle, but any // subsequent operations on the mapped region must be done through the latter. handle_type file_handle_ = INVALID_HANDLE_VALUE; #ifdef _WIN32 handle_type file_mapping_handle_ = INVALID_HANDLE_VALUE; #endif // Letting user map a file using both an existing file handle and a path // introcudes some complexity in that we must not close the file handle if // user provided it, but we must close it if we obtained it using the // provided path. For this reason, this flag is used to determine when to // close file_handle_. bool is_handle_internal_ = false; public: /** * The default constructed mmap object is in a non-mapped state, that is, * any operation that attempts to access nonexistent underlying data will * result in undefined behaviour/segmentation faults. */ basic_mmap() = default; #ifdef __cpp_exceptions /** * The same as invoking the `map` function, except any error that may occur * while establishing the mapping is wrapped in a `std::system_error` and is * thrown. */ template basic_mmap(const String& path, const size_type offset = 0, const size_type length = map_entire_file) { std::error_code error; map(path, offset, length, error); if(error) { throw std::system_error(error); } } /** * The same as invoking the `map` function, except any error that may occur * while establishing the mapping is wrapped in a `std::system_error` and is * thrown. 
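 *
 * A minimal sketch of this throwing construction path (illustrative only;
 * "example.csv" is a placeholder path, and `mio::mmap_source` is an alias
 * defined later in this header; an equivalent overload exists for raw file
 * handles):
 *
 *     try {
 *         mio::mmap_source mmap("example.csv");
 *         const char* first_byte = mmap.data();
 *     } catch (const std::system_error& e) {
 *         // e.code() carries the reason the mapping failed.
 *     }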
*/ basic_mmap(const handle_type handle, const size_type offset = 0, const size_type length = map_entire_file) { std::error_code error; map(handle, offset, length, error); if(error) { throw std::system_error(error); } } #endif // __cpp_exceptions /** * `basic_mmap` has single-ownership semantics, so transferring ownership * may only be accomplished by moving the object. */ basic_mmap(const basic_mmap&) = delete; basic_mmap(basic_mmap&&); basic_mmap& operator=(const basic_mmap&) = delete; basic_mmap& operator=(basic_mmap&&); /** * If this is a read-write mapping, the destructor invokes sync. Regardless * of the access mode, unmap is invoked as a final step. */ ~basic_mmap(); /** * On UNIX systems 'file_handle' and 'mapping_handle' are the same. On Windows, * however, a mapped region of a file gets its own handle, which is returned by * 'mapping_handle'. */ handle_type file_handle() const noexcept { return file_handle_; } handle_type mapping_handle() const noexcept; /** Returns whether a valid memory mapping has been created. */ bool is_open() const noexcept { return file_handle_ != invalid_handle; } /** * Returns true if no mapping was established, that is, conceptually the * same as though the length that was mapped was 0. This function is * provided so that this class has Container semantics. */ bool empty() const noexcept { return length() == 0; } /** Returns true if a mapping was established. */ bool is_mapped() const noexcept; /** * `size` and `length` both return the logical length, i.e. the number of bytes * user requested to be mapped, while `mapped_length` returns the actual number of * bytes that were mapped which is a multiple of the underlying operating system's * page allocation granularity. */ size_type size() const noexcept { return length(); } size_type length() const noexcept { return length_; } size_type mapped_length() const noexcept { return mapped_length_; } /** * Returns the offset, relative to the file's start, at which the mapping was * requested to be created. */ size_type offset() const noexcept { return mapped_length_ - length_; } /** * Returns a pointer to the first requested byte, or `nullptr` if no memory mapping * exists. */ template< access_mode A = AccessMode, typename = typename std::enable_if::type > pointer data() noexcept { return data_; } const_pointer data() const noexcept { return data_; } /** * Returns an iterator to the first requested byte, if a valid memory mapping * exists, otherwise this function call is undefined behaviour. */ template< access_mode A = AccessMode, typename = typename std::enable_if::type > iterator begin() noexcept { return data(); } const_iterator begin() const noexcept { return data(); } const_iterator cbegin() const noexcept { return data(); } /** * Returns an iterator one past the last requested byte, if a valid memory mapping * exists, otherwise this function call is undefined behaviour. */ template< access_mode A = AccessMode, typename = typename std::enable_if::type > iterator end() noexcept { return data() + length(); } const_iterator end() const noexcept { return data() + length(); } const_iterator cend() const noexcept { return data() + length(); } /** * Returns a reverse iterator to the last memory mapped byte, if a valid * memory mapping exists, otherwise this function call is undefined * behaviour. 
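 * (Because the mapping models a contiguous container, standard constructors
 * and algorithms apply directly; as a sketch, for some mapped object `m`,
 * std::string rev(m.rbegin(), m.rend()); builds a reversed copy of the
 * mapped bytes.)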
*/ template< access_mode A = AccessMode, typename = typename std::enable_if::type > reverse_iterator rbegin() noexcept { return reverse_iterator(end()); } const_reverse_iterator rbegin() const noexcept { return const_reverse_iterator(end()); } const_reverse_iterator crbegin() const noexcept { return const_reverse_iterator(end()); } /** * Returns a reverse iterator past the first mapped byte, if a valid memory * mapping exists, otherwise this function call is undefined behaviour. */ template< access_mode A = AccessMode, typename = typename std::enable_if::type > reverse_iterator rend() noexcept { return reverse_iterator(begin()); } const_reverse_iterator rend() const noexcept { return const_reverse_iterator(begin()); } const_reverse_iterator crend() const noexcept { return const_reverse_iterator(begin()); } /** * Returns a reference to the `i`th byte from the first requested byte (as returned * by `data`). If this is invoked when no valid memory mapping has been created * prior to this call, undefined behaviour ensues. */ reference operator[](const size_type i) noexcept { return data_[i]; } const_reference operator[](const size_type i) const noexcept { return data_[i]; } /** * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the * reason is reported via `error` and the object remains in a state as if this * function hadn't been called. * * `path`, which must be a path to an existing file, is used to retrieve a file * handle (which is closed when the object destructs or `unmap` is called), which is * then used to memory map the requested region. Upon failure, `error` is set to * indicate the reason and the object remains in an unmapped state. * * `offset` is the number of bytes, relative to the start of the file, where the * mapping should begin. When specifying it, there is no need to worry about * providing a value that is aligned with the operating system's page allocation * granularity. This is adjusted by the implementation such that the first requested * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at * `offset` from the start of the file. * * `length` is the number of bytes to map. It may be `map_entire_file`, in which * case a mapping of the entire file is created. */ template void map(const String& path, const size_type offset, const size_type length, std::error_code& error); /** * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the * reason is reported via `error` and the object remains in a state as if this * function hadn't been called. * * `path`, which must be a path to an existing file, is used to retrieve a file * handle (which is closed when the object destructs or `unmap` is called), which is * then used to memory map the requested region. Upon failure, `error` is set to * indicate the reason and the object remains in an unmapped state. * * The entire file is mapped. */ template void map(const String& path, std::error_code& error) { map(path, 0, map_entire_file, error); } /** * Establishes a memory mapping with AccessMode. If the mapping is * unsuccesful, the reason is reported via `error` and the object remains in * a state as if this function hadn't been called. * * `handle`, which must be a valid file handle, which is used to memory map the * requested region. Upon failure, `error` is set to indicate the reason and the * object remains in an unmapped state. * * `offset` is the number of bytes, relative to the start of the file, where the * mapping should begin. 
When specifying it, there is no need to worry about * providing a value that is aligned with the operating system's page allocation * granularity. This is adjusted by the implementation such that the first requested * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at * `offset` from the start of the file. * * `length` is the number of bytes to map. It may be `map_entire_file`, in which * case a mapping of the entire file is created. */ void map(const handle_type handle, const size_type offset, const size_type length, std::error_code& error); /** * Establishes a memory mapping with AccessMode. If the mapping is * unsuccesful, the reason is reported via `error` and the object remains in * a state as if this function hadn't been called. * * `handle`, which must be a valid file handle, which is used to memory map the * requested region. Upon failure, `error` is set to indicate the reason and the * object remains in an unmapped state. * * The entire file is mapped. */ void map(const handle_type handle, std::error_code& error) { map(handle, 0, map_entire_file, error); } /** * If a valid memory mapping has been created prior to this call, this call * instructs the kernel to unmap the memory region and disassociate this object * from the file. * * The file handle associated with the file that is mapped is only closed if the * mapping was created using a file path. If, on the other hand, an existing * file handle was used to create the mapping, the file handle is not closed. */ void unmap(); void swap(basic_mmap& other); /** Flushes the memory mapped page to disk. Errors are reported via `error`. */ template typename std::enable_if::type sync(std::error_code& error); /** * All operators compare the address of the first byte and size of the two mapped * regions. */ private: template< access_mode A = AccessMode, typename = typename std::enable_if::type > pointer get_mapping_start() noexcept { return !data() ? nullptr : data() - offset(); } const_pointer get_mapping_start() const noexcept { return !data() ? nullptr : data() - offset(); } /** * The destructor syncs changes to disk if `AccessMode` is `write`, but not * if it's `read`, but since the destructor cannot be templated, we need to * do SFINAE in a dedicated function, where one syncs and the other is a noop. */ template typename std::enable_if::type conditional_sync(); template typename std::enable_if::type conditional_sync(); }; template bool operator==(const basic_mmap& a, const basic_mmap& b); template bool operator!=(const basic_mmap& a, const basic_mmap& b); template bool operator<(const basic_mmap& a, const basic_mmap& b); template bool operator<=(const basic_mmap& a, const basic_mmap& b); template bool operator>(const basic_mmap& a, const basic_mmap& b); template bool operator>=(const basic_mmap& a, const basic_mmap& b); /** * This is the basis for all read-only mmap objects and should be preferred over * directly using `basic_mmap`. */ template using basic_mmap_source = basic_mmap; /** * This is the basis for all read-write mmap objects and should be preferred over * directly using `basic_mmap`. */ template using basic_mmap_sink = basic_mmap; /** * These aliases cover the most common use cases, both representing a raw byte stream * (either with a char or an unsigned char/uint8_t). 
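 *
 * For instance, read-only access could look like this sketch (assuming
 * "input.bin" names an existing file; `make_mmap_source` is the factory
 * declared below):
 *
 *     std::error_code error;
 *     mio::mmap_source ro = mio::make_mmap_source("input.bin", error);
 *     if (!error) {
 *         // ro.data() and ro.size() expose the mapped bytes.
 *     }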
*/ using mmap_source = basic_mmap_source; using ummap_source = basic_mmap_source; using mmap_sink = basic_mmap_sink; using ummap_sink = basic_mmap_sink; /** * Convenience factory method that constructs a mapping for any `basic_mmap` or * `basic_mmap` type. */ template< typename MMap, typename MappingToken > MMap make_mmap(const MappingToken& token, int64_t offset, int64_t length, std::error_code& error) { MMap mmap; mmap.map(token, offset, length, error); return mmap; } /** * Convenience factory method. * * MappingToken may be a String (`std::string`, `std::string_view`, `const char*`, * `std::filesystem::path`, `std::vector`, or similar), or a * `mmap_source::handle_type`. */ template mmap_source make_mmap_source(const MappingToken& token, mmap_source::size_type offset, mmap_source::size_type length, std::error_code& error) { return make_mmap(token, offset, length, error); } template mmap_source make_mmap_source(const MappingToken& token, std::error_code& error) { return make_mmap_source(token, 0, map_entire_file, error); } /** * Convenience factory method. * * MappingToken may be a String (`std::string`, `std::string_view`, `const char*`, * `std::filesystem::path`, `std::vector`, or similar), or a * `mmap_sink::handle_type`. */ template mmap_sink make_mmap_sink(const MappingToken& token, mmap_sink::size_type offset, mmap_sink::size_type length, std::error_code& error) { return make_mmap(token, offset, length, error); } template mmap_sink make_mmap_sink(const MappingToken& token, std::error_code& error) { return make_mmap_sink(token, 0, map_entire_file, error); } } // namespace mio // #include "detail/mmap.ipp" /* Copyright 2017 https://github.com/mandreyel * * Permission is hereby granted, free of charge, to any person obtaining a copy of this * software and associated documentation files (the "Software"), to deal in the Software * without restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be included in all copies * or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MIO_BASIC_MMAP_IMPL #define MIO_BASIC_MMAP_IMPL // #include "mio/mmap.hpp" // #include "mio/page.hpp" // #include "mio/detail/string_util.hpp" /* Copyright 2017 https://github.com/mandreyel * * Permission is hereby granted, free of charge, to any person obtaining a copy of this * software and associated documentation files (the "Software"), to deal in the Software * without restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be included in all copies * or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MIO_STRING_UTIL_HEADER #define MIO_STRING_UTIL_HEADER #include namespace mio { namespace detail { template< typename S, typename C = typename std::decay::type, typename = decltype(std::declval().data()), typename = typename std::enable_if< std::is_same::value #ifdef _WIN32 || std::is_same::value #endif >::type > struct char_type_helper { using type = typename C::value_type; }; template struct char_type { using type = typename char_type_helper::type; }; // TODO: can we avoid this brute force approach? template<> struct char_type { using type = char; }; template<> struct char_type { using type = char; }; template struct char_type { using type = char; }; template struct char_type { using type = char; }; #ifdef _WIN32 template<> struct char_type { using type = wchar_t; }; template<> struct char_type { using type = wchar_t; }; template struct char_type { using type = wchar_t; }; template struct char_type { using type = wchar_t; }; #endif // _WIN32 template struct is_c_str_helper { static constexpr bool value = std::is_same< CharT*, // TODO: I'm so sorry for this... Can this be made cleaner? typename std::add_pointer< typename std::remove_cv< typename std::remove_pointer< typename std::decay< S >::type >::type >::type >::type >::value; }; template struct is_c_str { static constexpr bool value = is_c_str_helper::value; }; #ifdef _WIN32 template struct is_c_wstr { static constexpr bool value = is_c_str_helper::value; }; #endif // _WIN32 template struct is_c_str_or_c_wstr { static constexpr bool value = is_c_str::value #ifdef _WIN32 || is_c_wstr::value #endif ; }; template< typename String, typename = decltype(std::declval().data()), typename = typename std::enable_if::value>::type > const typename char_type::type* c_str(const String& path) { return path.data(); } template< typename String, typename = decltype(std::declval().empty()), typename = typename std::enable_if::value>::type > bool empty(const String& path) { return path.empty(); } template< typename String, typename = typename std::enable_if::value>::type > const typename char_type::type* c_str(String path) { return path; } template< typename String, typename = typename std::enable_if::value>::type > bool empty(String path) { return !path || (*path == 0); } } // namespace detail } // namespace mio #endif // MIO_STRING_UTIL_HEADER #include #ifndef _WIN32 # include # include # include # include #endif namespace mio { namespace detail { #ifdef _WIN32 namespace win { /** Returns the 4 upper bytes of an 8-byte integer. */ inline DWORD int64_high(int64_t n) noexcept { return n >> 32; } /** Returns the 4 lower bytes of an 8-byte integer. */ inline DWORD int64_low(int64_t n) noexcept { return n & 0xffffffff; } template< typename String, typename = typename std::enable_if< std::is_same::type, char>::value >::type > file_handle_type open_file_helper(const String& path, const access_mode mode) { return ::CreateFileA(c_str(path), mode == access_mode::read ? 
GENERIC_READ : GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); } template typename std::enable_if< std::is_same::type, wchar_t>::value, file_handle_type >::type open_file_helper(const String& path, const access_mode mode) { return ::CreateFileW(c_str(path), mode == access_mode::read ? GENERIC_READ : GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, 0, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, 0); } } // win #endif // _WIN32 /** * Returns the last platform specific system error (errno on POSIX and * GetLastError on Win) as a `std::error_code`. */ inline std::error_code last_error() noexcept { std::error_code error; #ifdef _WIN32 error.assign(GetLastError(), std::system_category()); #else error.assign(errno, std::system_category()); #endif return error; } template file_handle_type open_file(const String& path, const access_mode mode, std::error_code& error) { error.clear(); if(detail::empty(path)) { error = std::make_error_code(std::errc::invalid_argument); return invalid_handle; } #ifdef _WIN32 const auto handle = win::open_file_helper(path, mode); #else // POSIX const auto handle = ::open(c_str(path), mode == access_mode::read ? O_RDONLY : O_RDWR); #endif if(handle == invalid_handle) { error = detail::last_error(); } return handle; } inline size_t query_file_size(file_handle_type handle, std::error_code& error) { error.clear(); #ifdef _WIN32 LARGE_INTEGER file_size; if(::GetFileSizeEx(handle, &file_size) == 0) { error = detail::last_error(); return 0; } return static_cast(file_size.QuadPart); #else // POSIX struct stat sbuf; if(::fstat(handle, &sbuf) == -1) { error = detail::last_error(); return 0; } return sbuf.st_size; #endif } struct mmap_context { char* data; int64_t length; int64_t mapped_length; #ifdef _WIN32 file_handle_type file_mapping_handle; #endif }; inline mmap_context memory_map(const file_handle_type file_handle, const int64_t offset, const int64_t length, const access_mode mode, std::error_code& error) { const int64_t aligned_offset = make_offset_page_aligned(offset); const int64_t length_to_map = offset - aligned_offset + length; #ifdef _WIN32 const int64_t max_file_size = offset + length; const auto file_mapping_handle = ::CreateFileMapping( file_handle, 0, mode == access_mode::read ? PAGE_READONLY : PAGE_READWRITE, win::int64_high(max_file_size), win::int64_low(max_file_size), 0); if(file_mapping_handle == invalid_handle) { error = detail::last_error(); return {}; } char* mapping_start = static_cast(::MapViewOfFile( file_mapping_handle, mode == access_mode::read ? FILE_MAP_READ : FILE_MAP_WRITE, win::int64_high(aligned_offset), win::int64_low(aligned_offset), length_to_map)); if(mapping_start == nullptr) { error = detail::last_error(); return {}; } #else // POSIX char* mapping_start = static_cast(::mmap( 0, // Don't give hint as to where to map. length_to_map, mode == access_mode::read ? 
PROT_READ : PROT_WRITE, MAP_SHARED, file_handle, aligned_offset)); if(mapping_start == MAP_FAILED) { error = detail::last_error(); return {}; } #endif mmap_context ctx; ctx.data = mapping_start + offset - aligned_offset; ctx.length = length; ctx.mapped_length = length_to_map; #ifdef _WIN32 ctx.file_mapping_handle = file_mapping_handle; #endif return ctx; } } // namespace detail // -- basic_mmap -- template basic_mmap::~basic_mmap() { conditional_sync(); unmap(); } template basic_mmap::basic_mmap(basic_mmap&& other) : data_(std::move(other.data_)) , length_(std::move(other.length_)) , mapped_length_(std::move(other.mapped_length_)) , file_handle_(std::move(other.file_handle_)) #ifdef _WIN32 , file_mapping_handle_(std::move(other.file_mapping_handle_)) #endif , is_handle_internal_(std::move(other.is_handle_internal_)) { other.data_ = nullptr; other.length_ = other.mapped_length_ = 0; other.file_handle_ = invalid_handle; #ifdef _WIN32 other.file_mapping_handle_ = invalid_handle; #endif } template basic_mmap& basic_mmap::operator=(basic_mmap&& other) { if(this != &other) { // First the existing mapping needs to be removed. unmap(); data_ = std::move(other.data_); length_ = std::move(other.length_); mapped_length_ = std::move(other.mapped_length_); file_handle_ = std::move(other.file_handle_); #ifdef _WIN32 file_mapping_handle_ = std::move(other.file_mapping_handle_); #endif is_handle_internal_ = std::move(other.is_handle_internal_); // The moved from basic_mmap's fields need to be reset, because // otherwise other's destructor will unmap the same mapping that was // just moved into this. other.data_ = nullptr; other.length_ = other.mapped_length_ = 0; other.file_handle_ = invalid_handle; #ifdef _WIN32 other.file_mapping_handle_ = invalid_handle; #endif other.is_handle_internal_ = false; } return *this; } template typename basic_mmap::handle_type basic_mmap::mapping_handle() const noexcept { #ifdef _WIN32 return file_mapping_handle_; #else return file_handle_; #endif } template template void basic_mmap::map(const String& path, const size_type offset, const size_type length, std::error_code& error) { error.clear(); if(detail::empty(path)) { error = std::make_error_code(std::errc::invalid_argument); return; } const auto handle = detail::open_file(path, AccessMode, error); if(error) { return; } map(handle, offset, length, error); // This MUST be after the call to map, as that sets this to true. if(!error) { is_handle_internal_ = true; } } template void basic_mmap::map(const handle_type handle, const size_type offset, const size_type length, std::error_code& error) { error.clear(); if(handle == invalid_handle) { error = std::make_error_code(std::errc::bad_file_descriptor); return; } const auto file_size = detail::query_file_size(handle, error); if(error) { return; } if(offset + length > file_size) { error = std::make_error_code(std::errc::invalid_argument); return; } const auto ctx = detail::memory_map(handle, offset, length == map_entire_file ? (file_size - offset) : length, AccessMode, error); if(!error) { // We must unmap the previous mapping that may have existed prior to this call. // Note that this must only be invoked after a new mapping has been created in // order to provide the strong guarantee that, should the new mapping fail, the // `map` function leaves this instance in a state as though the function had // never been invoked. 
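// Commit point: the old state is torn down only now that the new mapping
// exists, which is what provides the strong guarantee described above.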
unmap(); file_handle_ = handle; is_handle_internal_ = false; data_ = reinterpret_cast(ctx.data); length_ = ctx.length; mapped_length_ = ctx.mapped_length; #ifdef _WIN32 file_mapping_handle_ = ctx.file_mapping_handle; #endif } } template template typename std::enable_if::type basic_mmap::sync(std::error_code& error) { error.clear(); if(!is_open()) { error = std::make_error_code(std::errc::bad_file_descriptor); return; } if(data()) { #ifdef _WIN32 if(::FlushViewOfFile(get_mapping_start(), mapped_length_) == 0 || ::FlushFileBuffers(file_handle_) == 0) #else // POSIX if(::msync(get_mapping_start(), mapped_length_, MS_SYNC) != 0) #endif { error = detail::last_error(); return; } } #ifdef _WIN32 if(::FlushFileBuffers(file_handle_) == 0) { error = detail::last_error(); } #endif } template void basic_mmap::unmap() { if(!is_open()) { return; } // TODO do we care about errors here? #ifdef _WIN32 if(is_mapped()) { ::UnmapViewOfFile(get_mapping_start()); ::CloseHandle(file_mapping_handle_); } #else // POSIX if(data_) { ::munmap(const_cast(get_mapping_start()), mapped_length_); } #endif // If file_handle_ was obtained by our opening it (when map is called with a path, // rather than an existing file handle), we need to close it, otherwise it must not // be closed as it may still be used outside this instance. if(is_handle_internal_) { #ifdef _WIN32 ::CloseHandle(file_handle_); #else // POSIX ::close(file_handle_); #endif } // Reset fields to their default values. data_ = nullptr; length_ = mapped_length_ = 0; file_handle_ = invalid_handle; #ifdef _WIN32 file_mapping_handle_ = invalid_handle; #endif } template bool basic_mmap::is_mapped() const noexcept { #ifdef _WIN32 return file_mapping_handle_ != invalid_handle; #else // POSIX return is_open(); #endif } template void basic_mmap::swap(basic_mmap& other) { if(this != &other) { using std::swap; swap(data_, other.data_); swap(file_handle_, other.file_handle_); #ifdef _WIN32 swap(file_mapping_handle_, other.file_mapping_handle_); #endif swap(length_, other.length_); swap(mapped_length_, other.mapped_length_); swap(is_handle_internal_, other.is_handle_internal_); } } template template typename std::enable_if::type basic_mmap::conditional_sync() { // This is invoked from the destructor, so not much we can do about // failures here. 
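// The error code is deliberately discarded here: letting an exception escape
// a destructor would risk std::terminate.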
std::error_code ec; sync(ec); } template template typename std::enable_if::type basic_mmap::conditional_sync() { // noop } template bool operator==(const basic_mmap& a, const basic_mmap& b) { return a.data() == b.data() && a.size() == b.size(); } template bool operator!=(const basic_mmap& a, const basic_mmap& b) { return !(a == b); } template bool operator<(const basic_mmap& a, const basic_mmap& b) { if(a.data() == b.data()) { return a.size() < b.size(); } return a.data() < b.data(); } template bool operator<=(const basic_mmap& a, const basic_mmap& b) { return !(a > b); } template bool operator>(const basic_mmap& a, const basic_mmap& b) { if(a.data() == b.data()) { return a.size() > b.size(); } return a.data() > b.data(); } template bool operator>=(const basic_mmap& a, const basic_mmap& b) { return !(a < b); } } // namespace mio #endif // MIO_BASIC_MMAP_IMPL #endif // MIO_MMAP_HEADER /* Copyright 2017 https://github.com/mandreyel * * Permission is hereby granted, free of charge, to any person obtaining a copy of this * software and associated documentation files (the "Software"), to deal in the Software * without restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be included in all copies * or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MIO_PAGE_HEADER #define MIO_PAGE_HEADER #ifdef _WIN32 # include #else # include #endif namespace mio { /** * This is used by `basic_mmap` to determine whether to create a read-only or * a read-write memory mapping. */ enum class access_mode { read, write }; /** * Determines the operating system's page allocation granularity. * * On the first call to this function, it invokes the operating system specific syscall * to determine the page size, caches the value, and returns it. Any subsequent call to * this function serves the cached value, so no further syscalls are made. */ inline size_t page_size() { static const size_t page_size = [] { #ifdef _WIN32 SYSTEM_INFO SystemInfo; GetSystemInfo(&SystemInfo); return SystemInfo.dwAllocationGranularity; #else return sysconf(_SC_PAGE_SIZE); #endif }(); return page_size; } /** * Alligns `offset` to the operating's system page size such that it subtracts the * difference until the nearest page boundary before `offset`, or does nothing if * `offset` is already page aligned. */ inline size_t make_offset_page_aligned(size_t offset) noexcept { const size_t page_size_ = page_size(); // Use integer division to round down to the nearest page alignment. 
return offset / page_size_ * page_size_; } } // namespace mio #endif // MIO_PAGE_HEADER /* Copyright 2017 https://github.com/mandreyel * * Permission is hereby granted, free of charge, to any person obtaining a copy of this * software and associated documentation files (the "Software"), to deal in the Software * without restriction, including without limitation the rights to use, copy, modify, * merge, publish, distribute, sublicense, and/or sell copies of the Software, and to * permit persons to whom the Software is furnished to do so, subject to the following * conditions: * * The above copyright notice and this permission notice shall be included in all copies * or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, * INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A * PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF * CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE * OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef MIO_SHARED_MMAP_HEADER #define MIO_SHARED_MMAP_HEADER // #include "mio/mmap.hpp" #include // std::error_code #include // std::shared_ptr namespace mio { /** * Exposes (nearly) the same interface as `basic_mmap`, but endowes it with * `std::shared_ptr` semantics. * * This is not the default behaviour of `basic_mmap` to avoid allocating on the heap if * shared semantics are not required. */ template< access_mode AccessMode, typename ByteT > class basic_shared_mmap { using impl_type = basic_mmap; std::shared_ptr pimpl_; public: using value_type = typename impl_type::value_type; using size_type = typename impl_type::size_type; using reference = typename impl_type::reference; using const_reference = typename impl_type::const_reference; using pointer = typename impl_type::pointer; using const_pointer = typename impl_type::const_pointer; using difference_type = typename impl_type::difference_type; using iterator = typename impl_type::iterator; using const_iterator = typename impl_type::const_iterator; using reverse_iterator = typename impl_type::reverse_iterator; using const_reverse_iterator = typename impl_type::const_reverse_iterator; using iterator_category = typename impl_type::iterator_category; using handle_type = typename impl_type::handle_type; using mmap_type = impl_type; basic_shared_mmap() = default; basic_shared_mmap(const basic_shared_mmap&) = default; basic_shared_mmap& operator=(const basic_shared_mmap&) = default; basic_shared_mmap(basic_shared_mmap&&) = default; basic_shared_mmap& operator=(basic_shared_mmap&&) = default; /** Takes ownership of an existing mmap object. */ basic_shared_mmap(mmap_type&& mmap) : pimpl_(std::make_shared(std::move(mmap))) {} /** Takes ownership of an existing mmap object. */ basic_shared_mmap& operator=(mmap_type&& mmap) { pimpl_ = std::make_shared(std::move(mmap)); return *this; } /** Initializes this object with an already established shared mmap. */ basic_shared_mmap(std::shared_ptr mmap) : pimpl_(std::move(mmap)) {} /** Initializes this object with an already established shared mmap. */ basic_shared_mmap& operator=(std::shared_ptr mmap) { pimpl_ = std::move(mmap); return *this; } #ifdef __cpp_exceptions /** * The same as invoking the `map` function, except any error that may occur * while establishing the mapping is wrapped in a `std::system_error` and is * thrown. 
*/ template basic_shared_mmap(const String& path, const size_type offset = 0, const size_type length = map_entire_file) { std::error_code error; map(path, offset, length, error); if(error) { throw std::system_error(error); } } /** * The same as invoking the `map` function, except any error that may occur * while establishing the mapping is wrapped in a `std::system_error` and is * thrown. */ basic_shared_mmap(const handle_type handle, const size_type offset = 0, const size_type length = map_entire_file) { std::error_code error; map(handle, offset, length, error); if(error) { throw std::system_error(error); } } #endif // __cpp_exceptions /** * If this is a read-write mapping and the last reference to the mapping, * the destructor invokes sync. Regardless of the access mode, unmap is * invoked as a final step. */ ~basic_shared_mmap() = default; /** Returns the underlying `std::shared_ptr` instance that holds the mmap. */ std::shared_ptr get_shared_ptr() { return pimpl_; } /** * On UNIX systems 'file_handle' and 'mapping_handle' are the same. On Windows, * however, a mapped region of a file gets its own handle, which is returned by * 'mapping_handle'. */ handle_type file_handle() const noexcept { return pimpl_ ? pimpl_->file_handle() : invalid_handle; } handle_type mapping_handle() const noexcept { return pimpl_ ? pimpl_->mapping_handle() : invalid_handle; } /** Returns whether a valid memory mapping has been created. */ bool is_open() const noexcept { return pimpl_ && pimpl_->is_open(); } /** * Returns true if no mapping was established, that is, conceptually the * same as though the length that was mapped was 0. This function is * provided so that this class has Container semantics. */ bool empty() const noexcept { return !pimpl_ || pimpl_->empty(); } /** * `size` and `length` both return the logical length, i.e. the number of bytes * user requested to be mapped, while `mapped_length` returns the actual number of * bytes that were mapped which is a multiple of the underlying operating system's * page allocation granularity. */ size_type size() const noexcept { return pimpl_ ? pimpl_->length() : 0; } size_type length() const noexcept { return pimpl_ ? pimpl_->length() : 0; } size_type mapped_length() const noexcept { return pimpl_ ? pimpl_->mapped_length() : 0; } /** * Returns the offset, relative to the file's start, at which the mapping was * requested to be created. */ size_type offset() const noexcept { return pimpl_ ? pimpl_->offset() : 0; } /** * Returns a pointer to the first requested byte, or `nullptr` if no memory mapping * exists. */ template< access_mode A = AccessMode, typename = typename std::enable_if::type > pointer data() noexcept { return pimpl_->data(); } const_pointer data() const noexcept { return pimpl_ ? pimpl_->data() : nullptr; } /** * Returns an iterator to the first requested byte, if a valid memory mapping * exists, otherwise this function call is undefined behaviour. */ iterator begin() noexcept { return pimpl_->begin(); } const_iterator begin() const noexcept { return pimpl_->begin(); } const_iterator cbegin() const noexcept { return pimpl_->cbegin(); } /** * Returns an iterator one past the last requested byte, if a valid memory mapping * exists, otherwise this function call is undefined behaviour. 
*/ template< access_mode A = AccessMode, typename = typename std::enable_if::type > iterator end() noexcept { return pimpl_->end(); } const_iterator end() const noexcept { return pimpl_->end(); } const_iterator cend() const noexcept { return pimpl_->cend(); } /** * Returns a reverse iterator to the last memory mapped byte, if a valid * memory mapping exists, otherwise this function call is undefined * behaviour. */ template< access_mode A = AccessMode, typename = typename std::enable_if::type > reverse_iterator rbegin() noexcept { return pimpl_->rbegin(); } const_reverse_iterator rbegin() const noexcept { return pimpl_->rbegin(); } const_reverse_iterator crbegin() const noexcept { return pimpl_->crbegin(); } /** * Returns a reverse iterator past the first mapped byte, if a valid memory * mapping exists, otherwise this function call is undefined behaviour. */ template< access_mode A = AccessMode, typename = typename std::enable_if::type > reverse_iterator rend() noexcept { return pimpl_->rend(); } const_reverse_iterator rend() const noexcept { return pimpl_->rend(); } const_reverse_iterator crend() const noexcept { return pimpl_->crend(); } /** * Returns a reference to the `i`th byte from the first requested byte (as returned * by `data`). If this is invoked when no valid memory mapping has been created * prior to this call, undefined behaviour ensues. */ reference operator[](const size_type i) noexcept { return (*pimpl_)[i]; } const_reference operator[](const size_type i) const noexcept { return (*pimpl_)[i]; } /** * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the * reason is reported via `error` and the object remains in a state as if this * function hadn't been called. * * `path`, which must be a path to an existing file, is used to retrieve a file * handle (which is closed when the object destructs or `unmap` is called), which is * then used to memory map the requested region. Upon failure, `error` is set to * indicate the reason and the object remains in an unmapped state. * * `offset` is the number of bytes, relative to the start of the file, where the * mapping should begin. When specifying it, there is no need to worry about * providing a value that is aligned with the operating system's page allocation * granularity. This is adjusted by the implementation such that the first requested * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at * `offset` from the start of the file. * * `length` is the number of bytes to map. It may be `map_entire_file`, in which * case a mapping of the entire file is created. */ template void map(const String& path, const size_type offset, const size_type length, std::error_code& error) { map_impl(path, offset, length, error); } /** * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the * reason is reported via `error` and the object remains in a state as if this * function hadn't been called. * * `path`, which must be a path to an existing file, is used to retrieve a file * handle (which is closed when the object destructs or `unmap` is called), which is * then used to memory map the requested region. Upon failure, `error` is set to * indicate the reason and the object remains in an unmapped state. * * The entire file is mapped. */ template void map(const String& path, std::error_code& error) { map_impl(path, 0, map_entire_file, error); } /** * Establishes a memory mapping with AccessMode. 
If the mapping is unsuccesful, the * reason is reported via `error` and the object remains in a state as if this * function hadn't been called. * * `handle`, which must be a valid file handle, which is used to memory map the * requested region. Upon failure, `error` is set to indicate the reason and the * object remains in an unmapped state. * * `offset` is the number of bytes, relative to the start of the file, where the * mapping should begin. When specifying it, there is no need to worry about * providing a value that is aligned with the operating system's page allocation * granularity. This is adjusted by the implementation such that the first requested * byte (as returned by `data` or `begin`), so long as `offset` is valid, will be at * `offset` from the start of the file. * * `length` is the number of bytes to map. It may be `map_entire_file`, in which * case a mapping of the entire file is created. */ void map(const handle_type handle, const size_type offset, const size_type length, std::error_code& error) { map_impl(handle, offset, length, error); } /** * Establishes a memory mapping with AccessMode. If the mapping is unsuccesful, the * reason is reported via `error` and the object remains in a state as if this * function hadn't been called. * * `handle`, which must be a valid file handle, which is used to memory map the * requested region. Upon failure, `error` is set to indicate the reason and the * object remains in an unmapped state. * * The entire file is mapped. */ void map(const handle_type handle, std::error_code& error) { map_impl(handle, 0, map_entire_file, error); } /** * If a valid memory mapping has been created prior to this call, this call * instructs the kernel to unmap the memory region and disassociate this object * from the file. * * The file handle associated with the file that is mapped is only closed if the * mapping was created using a file path. If, on the other hand, an existing * file handle was used to create the mapping, the file handle is not closed. */ void unmap() { if(pimpl_) pimpl_->unmap(); } void swap(basic_shared_mmap& other) { pimpl_.swap(other.pimpl_); } /** Flushes the memory mapped page to disk. Errors are reported via `error`. */ template< access_mode A = AccessMode, typename = typename std::enable_if::type > void sync(std::error_code& error) { if(pimpl_) pimpl_->sync(error); } /** All operators compare the underlying `basic_mmap`'s addresses. */ friend bool operator==(const basic_shared_mmap& a, const basic_shared_mmap& b) { return a.pimpl_ == b.pimpl_; } friend bool operator!=(const basic_shared_mmap& a, const basic_shared_mmap& b) { return !(a == b); } friend bool operator<(const basic_shared_mmap& a, const basic_shared_mmap& b) { return a.pimpl_ < b.pimpl_; } friend bool operator<=(const basic_shared_mmap& a, const basic_shared_mmap& b) { return a.pimpl_ <= b.pimpl_; } friend bool operator>(const basic_shared_mmap& a, const basic_shared_mmap& b) { return a.pimpl_ > b.pimpl_; } friend bool operator>=(const basic_shared_mmap& a, const basic_shared_mmap& b) { return a.pimpl_ >= b.pimpl_; } private: template void map_impl(const MappingToken& token, const size_type offset, const size_type length, std::error_code& error) { if(!pimpl_) { mmap_type mmap = make_mmap(token, offset, length, error); if(error) { return; } pimpl_ = std::make_shared(std::move(mmap)); } else { pimpl_->map(token, offset, length, error); } } }; /** * This is the basis for all read-only mmap objects and should be preferred over * directly using basic_shared_mmap. 
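 * (Copies of a basic_shared_mmap are cheap: they share one underlying mapping
 * through the internal std::shared_ptr, and the mapping is released when the
 * last copy is destroyed.)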
*/ template using basic_shared_mmap_source = basic_shared_mmap; /** * This is the basis for all read-write mmap objects and should be preferred over * directly using basic_shared_mmap. */ template using basic_shared_mmap_sink = basic_shared_mmap; /** * These aliases cover the most common use cases, both representing a raw byte stream * (either with a char or an unsigned char/uint8_t). */ using shared_mmap_source = basic_shared_mmap_source; using shared_ummap_source = basic_shared_mmap_source; using shared_mmap_sink = basic_shared_mmap_sink; using shared_ummap_sink = basic_shared_mmap_sink; } // namespace mio #endif // MIO_SHARED_MMAP_HEADER readr/src/type_convert.cpp0000644000176200001440000000164214174704674015404 0ustar liggesusers#include "cpp11/list.hpp" #include "cpp11/sexp.hpp" #include "cpp11/strings.hpp" #include "Collector.h" #include "LocaleInfo.h" #include "Token.h" [[cpp11::register]] cpp11::sexp type_convert_col( const cpp11::strings& x, const cpp11::list& spec, const cpp11::list& locale_, int col, const std::vector& na, bool trim_ws) { LocaleInfo locale(locale_); CollectorPtr collector = Collector::create(spec, &locale); collector->resize(x.size()); for (int i = 0; i < x.size(); ++i) { SEXP string = x[i]; Token t; if (string == NA_STRING) { t = Token(TOKEN_MISSING, i - 1, col - 1); } else { const char* begin = CHAR(string); t = Token(begin, begin + Rf_length(string), i - 1, col - 1, false); if (trim_ws) { t.trim(); } t.flagNA(na); } collector->setValue(i, t); } return static_cast(collector->vector()); } readr/src/TokenizerLog.h0000644000176200001440000000761014174704674014745 0ustar liggesusers#ifndef FASTREAD_TOKENIZER_LOG_H_ #define FASTREAD_TOKENIZER_LOG_H_ #include "cpp11/protect.hpp" #include "Token.h" #include "Tokenizer.h" #include "utils.h" enum LogState { LOG_DELIM, LOG_FIELD, LOG_STRING, LOG_ESCAPE, LOG_QUOTE, LOG_DATE }; class TokenizerLog : public Tokenizer { SourceIterator begin_, cur_, end_; LogState state_; int row_, col_; bool moreTokens_; bool trimWS_; public: TokenizerLog(bool trimWS) : trimWS_(trimWS) {} void tokenize(SourceIterator begin, SourceIterator end) { cur_ = begin; begin_ = begin; end_ = end; row_ = 0; col_ = 0; state_ = LOG_DELIM; moreTokens_ = true; } std::pair progress() { size_t bytes = cur_ - begin_; return std::make_pair(bytes / (double)(end_ - begin_), bytes); } Token nextToken() { // Capture current position int row = row_, col = col_; if (!moreTokens_) return Token(TOKEN_EOF, row, col); SourceIterator token_begin = cur_; while (cur_ != end_) { Advance advance(&cur_); if ((row_ + 1) % 100000 == 0 || (col_ + 1) % 100000 == 0) cpp11::check_user_interrupt(); switch (state_) { case LOG_DELIM: if (*cur_ == '\r' || *cur_ == '\n') { newRecord(); advanceForLF(&cur_, end_); return Token(TOKEN_EMPTY, row, col); } else if (*cur_ == ' ') { break; } else if (*cur_ == '"') { state_ = LOG_STRING; } else if (*cur_ == '[') { state_ = LOG_DATE; } else { state_ = LOG_FIELD; } break; case LOG_FIELD: if (*cur_ == '\r' || *cur_ == '\n') { newRecord(); return fieldToken(token_begin, advanceForLF(&cur_, end_), row, col); } else if (*cur_ == ' ') { newField(); return fieldToken(token_begin, cur_, row, col); } break; case LOG_QUOTE: if (*cur_ == ' ') { newField(); return fieldToken(token_begin + 1, cur_ - 1, row, col); } else if (*cur_ == '\r' || *cur_ == '\n') { newRecord(); return fieldToken( token_begin + 1, advanceForLF(&cur_, end_) - 1, row, col); } else { state_ = LOG_STRING; } break; case LOG_STRING: if (*cur_ == '"') { state_ = LOG_QUOTE; } else if (*cur_ 
== '\\') { state_ = LOG_ESCAPE; } break; case LOG_ESCAPE: state_ = LOG_STRING; break; case LOG_DATE: if (*cur_ == ']') { newField(); if (cur_ + 1 != end_) cur_++; return fieldToken(token_begin + 1, cur_ - 1, row, col); } break; } } // Reached end of Source: cur_ == end_ moreTokens_ = false; switch (state_) { case LOG_DELIM: if (col_ == 0) { return Token(TOKEN_EOF, row, col); } else { return Token(TOKEN_EMPTY, row, col); } case LOG_QUOTE: return fieldToken(token_begin + 1, end_ - 1, row, col); case LOG_STRING: return fieldToken(token_begin + 1, end_, row, col); case LOG_ESCAPE: warn(row, col, "closing escape at end of file"); return fieldToken(token_begin + 1, end_, row, col); case LOG_DATE: warn(row, col, "closing ] at end of file"); return fieldToken(token_begin + 1, end_, row, col); case LOG_FIELD: return fieldToken(token_begin, end_, row, col); } return Token(TOKEN_EOF, row, col); } private: void newField() { col_++; state_ = LOG_DELIM; } void newRecord() { row_++; col_ = 0; state_ = LOG_DELIM; } Token fieldToken(SourceIterator begin, SourceIterator end, int row, int col) { Token t(begin, end, row, col, false); if (trimWS_) { t.trim(); } t.flagNA(std::vector(1, "-")); return t; } }; #endif readr/src/write_delim.cpp0000644000176200001440000001253514174704674015172 0ustar liggesusers#include "cpp11/list.hpp" #include "cpp11/sexp.hpp" #include "cpp11/strings.hpp" #include "connection.h" #include "grisu3.h" #include #include #include enum quote_escape_t { DOUBLE = 1, BACKSLASH = 2, NONE = 3 }; void stream_delim( const cpp11::sexp& connection, const cpp11::sexp& x, int i, char delim, const std::string& na, quote_escape_t escape); void stream_delim_row( const cpp11::sexp& connection, const cpp11::list& x, int i, char delim, const std::string& na, quote_escape_t escape, const char* eol) { int p = Rf_length(x); for (int j = 0; j < p - 1; ++j) { stream_delim(connection, x.at(j), i, delim, na, escape); write_bytes(connection, &delim, 1); } stream_delim(connection, x.at(p - 1), i, delim, na, escape); write_bytes(connection, eol, strlen(eol)); } bool needs_quote(const char* string, char delim, const std::string& na) { if (string == na) { return true; } for (const char* cur = string; *cur != '\0'; ++cur) { if (*cur == '\n' || *cur == '\r' || *cur == '"' || *cur == delim) { return true; } } return false; } void stream_delim( const cpp11::sexp& connection, const char* string, char delim, const std::string& na, quote_escape_t escape) { bool quotes = needs_quote(string, delim, na); if (quotes) { write_bytes(connection, "\"", 1); } for (const char* cur = string; *cur != '\0'; ++cur) { switch (*cur) { case '"': switch (escape) { case DOUBLE: write_bytes(connection, "\"\"", 2); break; case BACKSLASH: write_bytes(connection, "\\\"", 2); break; case NONE: write_bytes(connection, "\"", 1); break; } break; default: write_bytes(connection, cur, 1); } } if (quotes) { write_bytes(connection, "\"", 1); } } void validate_col_type(SEXP x, const std::string& name) { switch (TYPEOF(x)) { case LGLSXP: case INTSXP: case REALSXP: case STRSXP: break; default: cpp11::stop( "Don't know how to handle vector of type %s in column '%s'.", Rf_type2char(TYPEOF(x)), name.c_str()); } } void stream_delim( const cpp11::sexp& connection, const cpp11::list& df, char delim, const std::string& na, bool col_names, bool bom, quote_escape_t escape, const char* eol) { int p = Rf_length(df); if (p == 0) { return; } if (bom) { write_bytes(connection, "\xEF\xBB\xBF", 3); } cpp11::strings names(df.attr("names")); // Validate column types for 
(int j = 0; j < p; ++j) { validate_col_type(df.at(j), names[j]); } if (col_names) { cpp11::strings names(df.attr("names")); for (int j = 0; j < p; ++j) { stream_delim(connection, names, j, delim, na, escape); if (j != p - 1) { write_bytes(connection, &delim, 1); } } write_bytes(connection, eol, strlen(eol)); } cpp11::sexp first_col = df[0]; int n = Rf_length(first_col); for (int i = 0; i < n; ++i) { stream_delim_row(connection, df, i, delim, na, escape, eol); } } [[cpp11::register]] void stream_delim_( const cpp11::list& df, const cpp11::sexp& connection, char delim, const std::string& na, bool col_names, bool bom, int quote_escape, const char* eol) { stream_delim( connection, df, delim, na, col_names, bom, static_cast(quote_escape), eol); } // ============================================================================= // Derived from EncodeElementS in RPostgreSQL // Written by: tomoakin@kenroku.kanazawa-u.ac.jp // License: GPL-2 void stream_delim( const cpp11::sexp& connection, const cpp11::sexp& x, int i, char delim, const std::string& na, quote_escape_t escape) { switch (TYPEOF(x)) { case LGLSXP: { int value = LOGICAL(x)[i]; if (value == TRUE) { write_bytes(connection, "TRUE", 4); } else if (value == FALSE) { write_bytes(connection, "FALSE", 5); } else { write_bytes(connection, na.c_str(), na.size()); } break; } case INTSXP: { int value = INTEGER(x)[i]; if (value == NA_INTEGER) { write_bytes(connection, na.c_str(), na.size()); } else { std::array str; int len = snprintf(str.data(), 32, "%i", value); if (len > 32) { cpp11::stop("integer too big"); } write_bytes(connection, str.data(), len); } break; } case REALSXP: { double value = REAL(x)[i]; if (!R_FINITE(value)) { if (ISNA(value) || ISNAN(value)) { write_bytes(connection, na.c_str(), na.size()); } else if (value > 0) { write_bytes(connection, "Inf", 3); } else { write_bytes(connection, "-Inf", 4); } } else { std::array str; int len = dtoa_grisu3(value, str.data()); write_bytes(connection, str.data(), len); } break; } case STRSXP: { if (STRING_ELT(x, i) == NA_STRING) { write_bytes(connection, na.c_str(), na.size()); } else { stream_delim( connection, Rf_translateCharUTF8(STRING_ELT(x, i)), delim, na, escape); } break; } default: cpp11::stop( "Don't know how to handle vector of type %s.", Rf_type2char(TYPEOF(x))); } } readr/src/read.cpp0000644000176200001440000001616414174704674013603 0ustar liggesusers#include #include "cpp11/environment.hpp" #include "cpp11/function.hpp" #include "cpp11/list.hpp" #include "cpp11/strings.hpp" #include "Collector.h" #include "LocaleInfo.h" #include "Progress.h" #include "Reader.h" #include "Source.h" #include "Tokenizer.h" #include "TokenizerLine.h" #include "Warnings.h" [[cpp11::register]] cpp11::strings read_file_(const cpp11::list& sourceSpec, const cpp11::list& locale_) { SourcePtr source = Source::create(sourceSpec); LocaleInfo locale(locale_); return cpp11::writable::strings( locale.encoder_.makeSEXP(source->begin(), source->end())); } [[cpp11::register]] cpp11::raws read_file_raw_(const cpp11::list& sourceSpec) { SourcePtr source = Source::create(sourceSpec); cpp11::writable::raws res( static_cast(source->end() - source->begin())); std::copy(source->begin(), source->end(), RAW(res)); return SEXP(res); } [[cpp11::register]] cpp11::writable::strings read_lines_( const cpp11::list& sourceSpec, const cpp11::list& locale_, std::vector na, int n_max, bool skip_empty_rows, bool progress) { LocaleInfo locale(locale_); Reader r( Source::create(sourceSpec), TokenizerPtr(new TokenizerLine(std::move(na), 
skip_empty_rows)), CollectorPtr(new CollectorCharacter(&locale.encoder_)), progress); return SEXP(r.readToVector(n_max)); } cpp11::function R6method(const cpp11::environment& env, const std::string& method) { return static_cast(env[method.c_str()]); } bool isTrue(SEXP x) { if (!(TYPEOF(x) == LGLSXP && Rf_length(x) == 1)) { cpp11::stop("`continue()` must return a length 1 logical vector"); } return LOGICAL(x)[0] == TRUE; } [[cpp11::register]] void read_lines_chunked_( const cpp11::list& sourceSpec, const cpp11::list& locale_, std::vector na, int chunkSize, const cpp11::environment& callback, bool skip_empty_rows, bool progress) { LocaleInfo locale(locale_); Reader r( Source::create(sourceSpec), TokenizerPtr(new TokenizerLine(std::move(na), skip_empty_rows)), CollectorPtr(new CollectorCharacter(&locale.encoder_)), progress); cpp11::strings out; int pos = 1; while (isTrue(R6method(callback, "continue")())) { cpp11::strings out = r.readToVector(chunkSize); if (out.size() == 0) { return; } R6method(callback, "receive")(out, pos); pos += out.size(); } } [[cpp11::register]] cpp11::list read_lines_raw_( const cpp11::list& sourceSpec, int n_max = -1, bool progress = false) { Reader r( Source::create(sourceSpec), TokenizerPtr(new TokenizerLine()), CollectorPtr(new CollectorRaw()), progress); return r.readToVector(n_max); } [[cpp11::register]] void read_lines_raw_chunked_( const cpp11::list& sourceSpec, int chunkSize, const cpp11::environment& callback, bool progress) { Reader r( Source::create(sourceSpec), TokenizerPtr(new TokenizerLine()), CollectorPtr(new CollectorRaw()), progress); cpp11::list out; int pos = 1; while (isTrue(R6method(callback, "continue")())) { cpp11::list out = r.readToVector(chunkSize); if (out.size() == 0) { return; } R6method(callback, "receive")(out, pos); pos += out.size(); } } typedef std::vector::iterator CollectorItr; [[cpp11::register]] cpp11::sexp read_tokens_( const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::strings& colNames, const cpp11::list& locale_, int n_max, bool progress) { LocaleInfo l(locale_); Reader r( Source::create(sourceSpec), Tokenizer::create(tokenizerSpec), collectorsCreate(colSpecs, &l), progress, colNames); return r.readToDataFrame(n_max); } [[cpp11::register]] void read_tokens_chunked_( const cpp11::list& sourceSpec, const cpp11::environment& callback, int chunkSize, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::strings& colNames, const cpp11::list& locale_, const cpp11::sexp& spec, bool progress) { LocaleInfo l(locale_); Reader r( Source::create(sourceSpec), Tokenizer::create(tokenizerSpec), collectorsCreate(colSpecs, &l), progress, colNames); int pos = 1; while (isTrue(R6method(callback, "continue")())) { cpp11::data_frame out(r.readToDataFrame(chunkSize)); if (out.nrow() == 0) { return; } // We use the C API directly, as we are modifying the read-only data_frame // here. 
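// Attaching the spec to each chunk ------------------------------------------
// cpp11::data_frame is a read-only view, which is why the loop reaches for
// Rf_setAttrib() to attach the column spec to each chunk without copying it.
// On a writable object the idiomatic cpp11 spelling would be the sketch
// below (illustrative only; `out` is deliberately not writable here):
//
//   cpp11::writable::data_frame chunk(r.readToDataFrame(chunkSize));
//   chunk.attr("spec") = spec; // same effect as the Rf_setAttrib() call
//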
Rf_setAttrib(out, Rf_install("spec"), spec); R6method(callback, "receive")(out, pos); pos += out.nrow(); } } [[cpp11::register]] cpp11::sexp melt_tokens_( const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::list& locale_, int n_max, bool progress) { LocaleInfo l(locale_); Reader r( Source::create(sourceSpec), Tokenizer::create(tokenizerSpec), collectorsCreate(colSpecs, &l), progress); return r.meltToDataFrame(cpp11::list(locale_), n_max); } [[cpp11::register]] void melt_tokens_chunked_( const cpp11::list& sourceSpec, const cpp11::environment& callback, int chunkSize, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::list& locale_, bool progress) { LocaleInfo l(locale_); Reader r( Source::create(sourceSpec), Tokenizer::create(tokenizerSpec), collectorsCreate(colSpecs, &l), progress); int pos = 1; while (isTrue(R6method(callback, "continue")())) { cpp11::data_frame out( r.meltToDataFrame(static_cast<cpp11::list>(locale_), chunkSize)); if (out.nrow() == 0) { return; } R6method(callback, "receive")(out, pos); pos += out.nrow(); } } [[cpp11::register]] std::vector<std::string> guess_types_( const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& locale_, int n) { Warnings warnings; SourcePtr source = Source::create(sourceSpec); TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec); tokenizer->tokenize(source->begin(), source->end()); tokenizer->setWarnings(&warnings); // silence warnings LocaleInfo locale(locale_); std::vector<CollectorPtr> collectors; for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) { if (t.row() >= (size_t)n) { break; } // Add new collectors, if needed if (t.col() >= collectors.size()) { int p = t.col() - collectors.size() + 1; for (int j = 0; j < p; ++j) { CollectorPtr col = CollectorPtr(new CollectorCharacter(&locale.encoder_)); col->setWarnings(&warnings); col->resize(n); collectors.push_back(col); } } collectors[t.col()]->setValue(t.row(), t); } std::vector<std::string> out; for (auto& collector : collectors) { cpp11::strings col(collector->vector()); out.push_back(collectorGuess(SEXP(col), cpp11::list(locale_))); } return out; } readr/src/Tokenizer.cpp0000644000176200001440000000511514174704674014634 0ustar liggesusers#include "cpp11/as.hpp" #include "cpp11/integers.hpp" #include "cpp11/list.hpp" #include "Tokenizer.h" #include "TokenizerDelim.h" #include "TokenizerFwf.h" #include "TokenizerLine.h" #include "TokenizerLog.h" #include "TokenizerWs.h" TokenizerPtr Tokenizer::create(const cpp11::list& spec) { std::string subclass(cpp11::strings(spec.attr("class"))[0]); if (subclass == "tokenizer_delim") { char delim = cpp11::as_cpp<char>(spec["delim"]); char quote = cpp11::as_cpp<char>(spec["quote"]); std::vector<std::string> na = cpp11::as_cpp<std::vector<std::string>>(spec["na"]); std::string comment = cpp11::as_cpp<std::string>(spec["comment"]); bool trimWs = cpp11::as_cpp<bool>(spec["trim_ws"]); bool escapeDouble = cpp11::as_cpp<bool>(spec["escape_double"]); bool escapeBackslash = cpp11::as_cpp<bool>(spec["escape_backslash"]); bool quotedNA = cpp11::as_cpp<bool>(spec["quoted_na"]); bool skipEmptyRows = cpp11::as_cpp<bool>(spec["skip_empty_rows"]); return TokenizerPtr(new TokenizerDelim( delim, quote, na, comment, trimWs, escapeBackslash, escapeDouble, quotedNA, skipEmptyRows)); } if (subclass == "tokenizer_fwf") { std::vector<int> begin = cpp11::as_cpp<std::vector<int>>(spec["begin"]); std::vector<int> end = cpp11::as_cpp<std::vector<int>>(spec["end"]); std::vector<std::string> na = cpp11::as_cpp<std::vector<std::string>>(spec["na"]); std::string comment = cpp11::as_cpp<std::string>(spec["comment"]); bool trimWs = cpp11::as_cpp<bool>(spec["trim_ws"]); bool
skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); return TokenizerPtr( new TokenizerFwf(begin, end, na, comment, trimWs, skipEmptyRows)); } if (subclass == "tokenizer_line") { std::vector na = cpp11::as_cpp>(spec["na"]); bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); return TokenizerPtr(new TokenizerLine(na, skipEmptyRows)); } if (subclass == "tokenizer_log") { bool trimWs = cpp11::as_cpp(spec["trim_ws"]); return TokenizerPtr(new TokenizerLog(trimWs)); } if (subclass == "tokenizer_ws") { std::vector na = cpp11::as_cpp>(spec["na"]); std::string comment = cpp11::as_cpp(spec["comment"]); bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); return TokenizerPtr(new TokenizerWs(na, comment, skipEmptyRows)); } cpp11::stop("Unknown tokenizer type"); return TokenizerPtr(); } readr/src/TokenizerDelim.h0000644000176200001440000000346314174704674015260 0ustar liggesusers#ifndef FASTREAD_TOKENIZEDELIM_H_ #define FASTREAD_TOKENIZEDELIM_H_ #include "cpp11/R.hpp" #include "Token.h" #include "Tokenizer.h" #include "utils.h" enum DelimState { STATE_DELIM, STATE_FIELD, STATE_STRING, STATE_QUOTE, STATE_ESCAPE_S, STATE_ESCAPE_F, STATE_STRING_END, STATE_COMMENT }; class TokenizerDelim : public Tokenizer { char delim_, quote_; std::vector NA_; std::string comment_; bool hasComment_, trimWS_, escapeBackslash_, escapeDouble_, quotedNA_, hasEmptyNA_; SourceIterator begin_, cur_, end_; DelimState state_; int row_, col_; bool moreTokens_; bool skipEmptyRows_; public: TokenizerDelim( char delim = ',', char quote = '"', std::vector NA = std::vector(1, "NA"), const std::string& comment = "", bool trimWS = true, bool escapeBackslash = false, bool escapeDouble = true, bool quotedNA = true, bool skipEmptyRows = true); void tokenize(SourceIterator begin, SourceIterator end); std::pair progress(); Token nextToken(); void unescape(SourceIterator begin, SourceIterator end, std::string* pOut); private: bool isComment(const char* cur) const; void newField(); void newRecord(); Token emptyToken(int row, int col) const; Token fieldToken( SourceIterator begin, SourceIterator end, bool hasEscapeB, bool hasNull, int row, int col); Token stringToken( SourceIterator begin, SourceIterator end, bool hasEscapeB, bool hasEscapeD, bool hasNull, int row, int col); void unescapeBackslash( SourceIterator begin, SourceIterator end, std::string* pOut); void unescapeDouble(SourceIterator begin, SourceIterator end, std::string* pOut) const; }; #endif readr/src/Reader.cpp0000644000176200001440000001532314174704674014066 0ustar liggesusers#include "Reader.h" #include "cpp11/function.hpp" #include "cpp11/list.hpp" #include #include Reader::Reader( SourcePtr source, TokenizerPtr tokenizer, std::vector collectors, bool progress, const cpp11::strings& colNames) : source_(std::move(source)), tokenizer_(std::move(tokenizer)), collectors_(std::move(collectors)), progress_(progress), begun_(false) { init(colNames); } Reader::Reader( SourcePtr source, TokenizerPtr tokenizer, const CollectorPtr& collector, bool progress, const cpp11::strings& colNames) : source_(std::move(source)), tokenizer_(std::move(tokenizer)), progress_(progress), begun_(false) { collectors_.push_back(collector); init(colNames); } void Reader::init(const cpp11::strings& colNames) { tokenizer_->tokenize(source_->begin(), source_->end()); tokenizer_->setWarnings(&warnings_); // Work out which output columns we are keeping and set warnings for each // collector size_t p = collectors_.size(); for (size_t j = 0; j < p; ++j) { if (!collectors_[j]->skip()) { 
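// Collectors built from col_skip() report skip() == true: their tokens are
// still consumed, so row/column bookkeeping stays intact, but no R vector is
// ever allocated for them. keptColumns_ records the indices of the surviving
// collectors so output columns and supplied names stay aligned, roughly:
//
//   collectors_:  [skip, double, character]   (indices 0, 1, 2)
//   keptColumns_: [1, 2]
//   outNames_:    [colNames[1], colNames[2]]
//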
keptColumns_.push_back(j); collectors_[j]->setWarnings(&warnings_); } } if (colNames.size() > 0) { outNames_ = cpp11::writable::strings(keptColumns_.size()); int i = 0; for (int keptColumn : keptColumns_) { outNames_[i++] = colNames[keptColumn]; } } } cpp11::sexp Reader::readToDataFrame(R_xlen_t lines) { R_xlen_t rows = read(lines); // Save individual columns into a data frame cpp11::writable::list out(outNames_.size()); R_xlen_t j = 0; for (int keptColumn : keptColumns_) { out[j++] = collectors_[keptColumn]->vector(); } cpp11::sexp out2(warnings_.addAsAttribute(static_cast(out))); out2.attr("names") = outNames_; out2.attr("class") = {"spec_tbl_df", "tbl_df", "tbl", "data.frame"}; out2.attr("row.names") = {NA_REAL, -(static_cast(rows + 1))}; collectorsClear(); warnings_.clear(); // TODO: call tibble name repair function when tibble 1.5.0 is released. return out2; } R_xlen_t Reader::read(R_xlen_t lines) { if (t_.type() == TOKEN_EOF) { return (-1); } R_xlen_t n = (lines < 0) ? 1000 : lines; collectorsResize(n); R_xlen_t last_row = -1; R_xlen_t last_col = -1; R_xlen_t cells = 0; R_xlen_t first_row; if (!begun_) { t_ = tokenizer_->nextToken(); begun_ = true; first_row = 0; } else { first_row = t_.row(); } while (t_.type() != TOKEN_EOF) { if (progress_ && (++cells) % progressStep_ == 0) { progressBar_.show(tokenizer_->progress()); } if (t_.col() == 0 && static_cast(t_.row()) != first_row) { checkColumns(last_row, last_col, collectors_.size()); } if (lines >= 0 && static_cast(t_.row()) - first_row >= lines) { break; } if (static_cast(t_.row()) - first_row >= n) { // Estimate rows in full dataset and resize collectors n = ((t_.row() - first_row) / tokenizer_->progress().first) * 1.1; collectorsResize(n); } // only set value if within the expected number of columns if (t_.col() < collectors_.size()) { collectors_[t_.col()]->setValue(t_.row() - first_row, t_); } last_row = t_.row(); last_col = t_.col(); t_ = tokenizer_->nextToken(); } if (last_row != -1) { checkColumns(last_row, last_col, collectors_.size()); } if (progress_) { progressBar_.show(tokenizer_->progress()); } progressBar_.stop(); // Resize the collectors to the final size (if it is not already at that // size) if (last_row == -1) { collectorsResize(0); } else if ((last_row - first_row) < (n - 1)) { collectorsResize((last_row - first_row) + 1); } return last_row - first_row; } void Reader::checkColumns(int i, int j, int n) { if (j + 1 == n) { return; } std::stringstream ss1; ss1 << n << " columns"; std::stringstream ss2; ss2 << j + 1 << " columns"; warnings_.addWarning(i, -1, ss1.str(), ss2.str()); } void Reader::collectorsResize(R_xlen_t n) { for (auto & collector : collectors_) { collector->resize(n); } } void Reader::collectorsClear() { for (auto & collector : collectors_) { collector->clear(); } } cpp11::sexp Reader::meltToDataFrame(const cpp11::list& locale_, R_xlen_t lines) { melt(locale_, lines); // Save individual columns into a data frame cpp11::writable::list out(4); out[0] = collectors_[0]->vector(); out[1] = collectors_[1]->vector(); out[2] = collectors_[2]->vector(); out[3] = collectors_[3]->vector(); out.attr("names") = {"row", "col", "data_type", "value"}; cpp11::sexp out2(warnings_.addAsAttribute(static_cast(out))); collectorsClear(); warnings_.clear(); out.attr("names") = {"row", "col", "data_type", "value"}; static cpp11::function as_tibble = cpp11::package("tibble")["as_tibble"]; return as_tibble(out); } R_xlen_t Reader::melt(const cpp11::list& locale_, R_xlen_t lines) { if (t_.type() == TOKEN_EOF) { return (-1); } 
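// melt() emits one output row per *token* rather than per record: four
// parallel collectors hold the 1-based row, the 1-based column, the guessed
// type of the individual cell, and its raw value. For the input "a,1\nb,2"
// the melted frame looks like:
//
//   row col data_type value
//   1   1   character a
//   1   2   integer   1
//   2   1   character b
//   2   2   integer   2
//
// (collectorGuess() is called with its guess-integer flag set below, so
// whole numbers report "integer" rather than "double".)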
R_xlen_t n = (lines < 0) ? 10000 : lines * 10; // Start with 10 cells per line collectorsResize(n); R_xlen_t last_row = -1; R_xlen_t cells = 0; R_xlen_t first_row; if (!begun_) { t_ = tokenizer_->nextToken(); begun_ = true; first_row = 0; } else { first_row = t_.row(); } while (t_.type() != TOKEN_EOF) { ++cells; if (progress_ && cells % progressStep_ == 0) { progressBar_.show(tokenizer_->progress()); } if (lines >= 0 && static_cast(t_.row()) - first_row >= lines) { --cells; break; } if (cells >= n) { // Estimate rows in full dataset and resize collectors n = (cells / tokenizer_->progress().first) * 1.1; collectorsResize(n); } collectors_[0]->setValue(cells - 1, t_.row() + 1); collectors_[1]->setValue(cells - 1, t_.col() + 1); collectors_[3]->setValue(cells - 1, t_); switch (t_.type()) { case TOKEN_STRING: { cpp11::sexp str(cpp11::as_sexp(t_.asString())); collectors_[2]->setValue( cells - 1, collectorGuess(SEXP(str), locale_, true)); break; }; case TOKEN_MISSING: collectors_[2]->setValue(cells - 1, "missing"); break; case TOKEN_EMPTY: collectors_[2]->setValue(cells - 1, "empty"); break; case TOKEN_EOF: cpp11::stop("Invalid token"); } last_row = t_.row(); t_ = tokenizer_->nextToken(); } if (progress_) { progressBar_.show(tokenizer_->progress()); } progressBar_.stop(); // Resize the collectors to the final size (if it is not already at that // size) if (last_row == -1) { collectorsResize(0); } else if (cells < (n - 1)) { collectorsResize(cells); } return cells - 1; } readr/src/Warnings.h0000644000176200001440000000242314174704674014116 0ustar liggesusers#ifndef READ_WARNINGS_H_ #define READ_WARNINGS_H_ #include "cpp11/data_frame.hpp" #include "cpp11/sexp.hpp" #include "cpp11/strings.hpp" #include #include class Warnings { std::vector row_, col_; std::vector expected_, actual_; public: Warnings() {} // row and col should be zero-indexed. addWarning converts into one-indexed void addWarning( int row, int col, const std::string& expected, const std::string& actual) { row_.push_back(row == -1 ? NA_INTEGER : row + 1); col_.push_back(col == -1 ? 
NA_INTEGER : col + 1); expected_.push_back(expected); actual_.push_back(actual); } cpp11::sexp addAsAttribute(cpp11::sexp x) { if (size() == 0) return x; x.attr("problems") = asDataFrame(); return x; } size_t size() { return row_.size(); } void clear() { row_.clear(); col_.clear(); expected_.clear(); actual_.clear(); } cpp11::data_frame asDataFrame() { using namespace cpp11::literals; cpp11::writable::data_frame out( {"row"_nm = row_, "col"_nm = col_, "expected"_nm = expected_, "actual"_nm = actual_}); out.attr("class") = {"tbl_df", "tbl", "data.frame"}; return static_cast<SEXP>(out); } }; #endif readr/src/TokenizerWs.cpp0000644000176200001440000000501714174704674015147 0ustar liggesusers#include "cpp11/R.hpp" #include "Tokenizer.h" #include "TokenizerFwf.h" #include "TokenizerWs.h" #include "utils.h" #include "Source.h" // TokenizerWs // -------------------------------------------------------------------- #include <cctype> #include <utility> TokenizerWs::TokenizerWs( std::vector<std::string> NA, const std::string& comment, bool skipEmptyRows) : NA_(std::move(NA)), comment_(comment), moreTokens_(false), hasComment_(!comment.empty()), skipEmptyRows_(skipEmptyRows) {} void TokenizerWs::tokenize(SourceIterator begin, SourceIterator end) { cur_ = begin; curLine_ = begin; begin_ = begin; end_ = end; row_ = 0; col_ = 0; moreTokens_ = true; } std::pair<double, size_t> TokenizerWs::progress() { size_t bytes = cur_ - begin_; return std::make_pair(bytes / (double)(end_ - begin_), bytes); } Token TokenizerWs::nextToken() { // Check for comments and empty lines at the start of a line while (cur_ != end_ && col_ == 0 && (isComment(cur_) || (skipEmptyRows_ && isEmpty()))) { ignoreLine(); } if (cur_ == end_) { return {TOKEN_EOF, 0, 0}; } // Find start of field SourceIterator fieldBegin = cur_; while (fieldBegin != end_ && (isblank(*fieldBegin) != 0)) { ++fieldBegin; } // Make sure we are not at the start of a comment if (isComment(fieldBegin)) { ignoreLine(); row_++; col_ = 0; return nextToken(); } SourceIterator fieldEnd = fieldBegin; while (fieldEnd != end_ && (isspace(*fieldEnd) == 0)) { ++fieldEnd; } bool hasNull = fieldEnd != end_ && *fieldEnd == '\0'; Token t = fieldToken(fieldBegin, fieldEnd, hasNull); cur_ = fieldEnd; ++col_; if (cur_ != end_ && (*cur_ == '\r' || *cur_ == '\n')) { advanceForLF(&cur_, end_); ++cur_; row_++; col_ = 0; } return t; } Token TokenizerWs::fieldToken( SourceIterator begin, SourceIterator end, bool hasNull) { if (begin == end) { return {TOKEN_MISSING, row_, col_}; } Token t = Token(begin, end, row_, col_, hasNull); t.trim(); t.flagNA(NA_); return t; } bool TokenizerWs::isComment(const char* cur) const { if (!hasComment_) { return false; } return starts_with_comment(cur, end_, comment_); } bool TokenizerWs::isEmpty() const { return cur_ == end_ || *cur_ == '\r' || *cur_ == '\n'; } void TokenizerWs::ignoreLine() { // Skip rest of line while (cur_ != end_ && *cur_ != '\n' && *cur_ != '\r') { ++cur_; } advanceForLF(&cur_, end_); if (cur_ != end_) { ++cur_; } curLine_ = cur_; } readr/src/unicode_fopen.h0000644000176200001440000000330214547370673015142 0ustar liggesusers#pragma once #include <cstdio> // clang-format off #ifdef __clang__ # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wsign-compare" #include "mio.h" # pragma clang diagnostic pop #else #include "mio.h" #endif // clang-format on #ifdef _WIN32 #include <windows.h> #include <R.h> #endif // This is needed to support wide character paths on windows inline FILE* unicode_fopen(const char* path, const char* mode) { FILE* out; #ifdef _WIN32 // First convert the mode to the wide
equivalent // Only usage is 2 characters so max 8 bytes + 2 byte null. wchar_t mode_w[10]; MultiByteToWideChar(CP_UTF8, 0, mode, -1, mode_w, 9); // Then convert the path wchar_t* buf; size_t len = MultiByteToWideChar(CP_UTF8, 0, path, -1, NULL, 0); if (len <= 0) { Rf_error("Cannot convert file to Unicode: %s", path); } buf = (wchar_t*)R_alloc(len, sizeof(wchar_t)); if (buf == NULL) { Rf_error("Could not allocate buffer of size: %zu", len); } MultiByteToWideChar(CP_UTF8, 0, path, -1, buf, len); out = _wfopen(buf, mode_w); #else out = fopen(path, mode); #endif return out; } inline mio::mmap_source make_mmap_source(const char* file, std::error_code& error) { #ifdef __WIN32 wchar_t* buf; size_t len = MultiByteToWideChar(CP_UTF8, 0, file, -1, NULL, 0); if (len <= 0) { Rf_error("Cannot convert file to Unicode: %s", file); } buf = (wchar_t*)malloc(len * sizeof(wchar_t)); if (buf == NULL) { Rf_error("Could not allocate buffer of size: %zu", len); } MultiByteToWideChar(CP_UTF8, 0, file, -1, buf, len); mio::mmap_source out = mio::make_mmap_source(buf, error); free(buf); return out; #else return mio::make_mmap_source(file, error); #endif } readr/src/write.cpp0000644000176200001440000000254214174704674014015 0ustar liggesusers#include "cpp11/list.hpp" #include "cpp11/sexp.hpp" #include "cpp11/strings.hpp" #include "connection.h" #include #include [[cpp11::register]] void write_lines_( const cpp11::strings& lines, const cpp11::sexp& connection, const std::string& na, const std::string& sep) { for (cpp11::strings::const_iterator i = lines.begin(); i != lines.end(); ++i) { if (*i == NA_STRING) { write_bytes(connection, na.c_str(), na.size()); } else { const char* str = Rf_translateCharUTF8(*i); write_bytes(connection, str, strlen(str)); } write_bytes(connection, sep.c_str(), sep.size()); } } [[cpp11::register]] void write_lines_raw_( const cpp11::list& x, const cpp11::sexp& connection, const std::string& sep) { for (auto i : x) { cpp11::raws y(i); write_bytes( connection, reinterpret_cast(RAW(y)), y.size() * sizeof(RAW(y)[0])); write_bytes(connection, sep.c_str(), sep.size()); } } [[cpp11::register]] void write_file_(const std::string& x, const cpp11::sexp& connection) { write_bytes(connection, x.c_str(), x.size()); } [[cpp11::register]] void write_file_raw_(const cpp11::raws& x, const cpp11::sexp& connection) { write_bytes( connection, reinterpret_cast(RAW(x)), x.size() * sizeof(RAW(x)[0])); } readr/src/LocaleInfo.cpp0000644000176200001440000000215214174704674014673 0ustar liggesusers#include "cpp11/as.hpp" #include "cpp11/list.hpp" #include "cpp11/strings.hpp" #include #include #include "LocaleInfo.h" LocaleInfo::LocaleInfo(const cpp11::list& x) : encoding_(cpp11::as_cpp(x["encoding"])), encoder_(Iconv(encoding_)) { std::string klass = cpp11::as_cpp(x.attr("class")); if (klass != "locale") { cpp11::stop("Invalid input: must be of class locale"); } cpp11::list date_names(x["date_names"]); mon_ = cpp11::as_cpp>(date_names["mon"]); monAb_ = cpp11::as_cpp>(date_names["mon_ab"]); day_ = cpp11::as_cpp>(date_names["day"]); dayAb_ = cpp11::as_cpp>(date_names["day_ab"]); amPm_ = cpp11::as_cpp>(date_names["am_pm"]); decimalMark_ = cpp11::as_cpp(x["decimal_mark"]); groupingMark_ = cpp11::as_cpp(x["grouping_mark"]); dateFormat_ = cpp11::as_cpp(x["date_format"]); timeFormat_ = cpp11::as_cpp(x["time_format"]); tz_ = cpp11::as_cpp(x["tz"]); } readr/src/TokenizerFwf.h0000644000176200001440000000176114174704674014747 0ustar liggesusers#ifndef FASTREAD_TOKENIZERFWF_H_ #define FASTREAD_TOKENIZERFWF_H_ #include "Token.h" 
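// TokenizerFwf is driven by two parallel 0-based offset vectors declared
// below: field j of every line spans [beginOffset_[j], endOffset_[j]). The R
// side converts the 1-based inclusive positions of fwf_positions() and
// fwf_widths() into this form, roughly:
//
//   fwf_positions(c(1, 6), c(5, 10))  ->  beginOffset_ = {0, 5}
//                                         endOffset_   = {5, 10}
//
// A negative end offset (an NA end position in the R spec) marks a ragged
// final field that runs to the end of the line, tracked here as isRagged_.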
#include "Tokenizer.h" #include "utils.h" class TokenizerFwf : public Tokenizer { std::vector beginOffset_; std::vector endOffset_; std::vector NA_; SourceIterator begin_, cur_, curLine_, end_; int row_, col_, cols_, max_; std::string comment_; bool moreTokens_, isRagged_, hasComment_, trimWS_; bool skipEmptyRows_; public: TokenizerFwf( const std::vector& beginOffset, const std::vector& endOffset, std::vector NA = std::vector(1, "NA"), const std::string& comment = "", bool trimWS = true, bool skipEmptyRows = true); void tokenize(SourceIterator begin, SourceIterator end); std::pair progress(); Token nextToken(); private: Token fieldToken(SourceIterator begin, SourceIterator end, bool hasNull); bool isComment(const char* cur) const; bool isEmpty() const; }; #endif readr/src/SourceRaw.h0000644000176200001440000000135214174357220014227 0ustar liggesusers#ifndef FASTREAD_SOURCERAW_H_ #define FASTREAD_SOURCERAW_H_ #include "Source.h" #include "cpp11/raws.hpp" class SourceRaw : public Source { cpp11::raws x_; const char* begin_; const char* end_; public: SourceRaw( cpp11::raws x, int skip = 0, bool skipEmptyRows = true, const std::string& comment = "", bool skipQuotes = true) : x_(x) { begin_ = (const char*)RAW(x); end_ = (const char*)RAW(x) + Rf_xlength(x); // Skip byte order mark, if needed begin_ = skipBom(begin_, end_); // Skip lines, if needed begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); } const char* begin() { return begin_; } const char* end() { return end_; } }; #endif readr/src/Reader.h0000644000176200001440000000255314174704674013534 0ustar liggesusers#include "Collector.h" #include "Progress.h" #include "Source.h" #include "cpp11/list.hpp" #include "cpp11/strings.hpp" class Reader { public: Reader( SourcePtr source, TokenizerPtr tokenizer, std::vector collectors, bool progress, const cpp11::strings& colNames = cpp11::strings()); Reader( SourcePtr source, TokenizerPtr tokenizer, const CollectorPtr& collector, bool progress, const cpp11::strings& colNames = cpp11::strings()); cpp11::sexp readToDataFrame(R_xlen_t lines = -1); cpp11::sexp meltToDataFrame(const cpp11::list& locale_, R_xlen_t lines = -1); template T readToVector(R_xlen_t lines) { read(lines); SEXP x = collectors_[0]->vector(); T out(x); collectorsClear(); return out; } private: Warnings warnings_; SourcePtr source_; TokenizerPtr tokenizer_; std::vector collectors_; bool progress_; Progress progressBar_; std::vector keptColumns_; cpp11::writable::strings outNames_; bool begun_; Token t_; const static R_xlen_t progressStep_ = 10000; void init(const cpp11::strings& colNames); R_xlen_t read(R_xlen_t lines = -1); R_xlen_t melt(const cpp11::list& locale_, R_xlen_t lines = -1); void checkColumns(int i, int j, int n); void collectorsResize(R_xlen_t n); void collectorsClear(); }; readr/src/init.c0000644000176200001440000000033714152512262013250 0ustar liggesusers#include #include #include #include // for NULL void R_init_odbc(DllInfo* info) { R_registerRoutines(info, NULL, NULL, NULL, NULL); R_useDynamicSymbols(info, TRUE); } readr/src/Source.cpp0000644000176200001440000000717114174704674014126 0ustar liggesusers#include "cpp11/list.hpp" #include "cpp11/strings.hpp" #include "Source.h" #include "SourceFile.h" #include "SourceRaw.h" #include "SourceString.h" SourcePtr Source::create(const cpp11::list& spec) { std::string subclass(cpp11::as_cpp(spec.attr("class"))[0]); int skip = cpp11::as_cpp(spec["skip"]); bool skipEmptyRows = cpp11::as_cpp(spec["skip_empty_rows"]); std::string comment = 
cpp11::as_cpp(spec["comment"]); bool skipQuote = cpp11::as_cpp(spec["skip_quote"]); if (subclass == "source_raw") { return SourcePtr( new SourceRaw(spec[0], skip, skipEmptyRows, comment, skipQuote)); } if (subclass == "source_string") { return SourcePtr( new SourceString(spec[0], skip, skipEmptyRows, comment, skipQuote)); } if (subclass == "source_file") { cpp11::strings path(spec[0]); return SourcePtr(new SourceFile(Rf_translateCharUTF8(path[0]), skip, skipEmptyRows, comment, skipQuote)); } cpp11::stop("Unknown source type"); return SourcePtr(); } const char* Source::skipLines( const char* begin, const char* end, int n, bool skipEmptyRows, const std::string& comment, bool skipQuote) { bool hasComment = !comment.empty(); bool isComment = false; const char* cur = begin; while (cur < end && n > 0) { cur = skipLine( cur, end, hasComment && inComment(cur, end, comment), skipQuote); --n; ++skippedRows_; } // Skip any more trailing empty rows or comments while (cur < end && ((skipEmptyRows && (*cur == '\n' || *cur == '\r')) || (isComment = hasComment && inComment(cur, end, comment)))) { cur = skipLine(cur, end, isComment, skipQuote); ++skippedRows_; } return cur; } const char* Source::skipLine( const char* begin, const char* end, bool isComment, bool skipQuote) { const char* cur = begin; // skip the rest of the line until the newline while (cur < end && !(*cur == '\n' || *cur == '\r')) { if (!isComment && skipQuote && *cur == '"') { cur = skipDoubleQuoted(cur, end); } else { advanceForLF(&cur, end); ++cur; } } advanceForLF(&cur, end); // skip the actual newline char if (cur < end) { ++cur; } return cur; } const char* Source::skipDoubleQuoted(const char* begin, const char* end) { const char* cur = begin; // This doesn't handle escaped quotes or more sophisticated things, but // will work for simple cases. 
// Opening quote ++cur; while (cur < end && *cur != '"') { ++cur; } // Closing quote if (cur < end) { ++cur; } return cur; } const char* Source::skipBom(const char* begin, const char* end) { /* Unicode Byte Order Marks https://en.wikipedia.org/wiki/Byte_order_mark#Representations_of_byte_order_marks_by_encoding 00 00 FE FF: UTF-32BE FF FE 00 00: UTF-32LE FE FF: UTF-16BE FF FE: UTF-16LE EF BB BF: UTF-8 */ switch (begin[0]) { // UTF-32BE case '\x00': if (end - begin >= 4 && begin[1] == '\x00' && begin[2] == '\xFE' && begin[3] == '\xFF') { return begin + 4; } break; // UTF-8 case '\xEF': if (end - begin >= 3 && begin[1] == '\xBB' && begin[2] == '\xBF') { return begin + 3; } break; // UTF-16BE case '\xfe': if (end - begin >= 2 && begin[1] == '\xff') { return begin + 2; } break; case '\xff': if (end - begin >= 2 && begin[1] == '\xfe') { // UTF-32 LE if (end - begin >= 4 && begin[2] == '\x00' && begin[3] == '\x00') { return begin + 4; } // UTF-16 LE return begin + 2; } break; } return begin; } readr/src/TokenizerDelim.cpp0000644000176200001440000002362414174704674015614 0ustar liggesusers#include "TokenizerDelim.h" #include #include "cpp11/protect.hpp" TokenizerDelim::TokenizerDelim( char delim, char quote, std::vector NA, const std::string& comment, bool trimWS, bool escapeBackslash, bool escapeDouble, bool quotedNA, bool skipEmptyRows) : delim_(delim), quote_(quote), NA_(std::move(NA)), comment_(comment), hasComment_(!comment.empty()), trimWS_(trimWS), escapeBackslash_(escapeBackslash), escapeDouble_(escapeDouble), quotedNA_(quotedNA), hasEmptyNA_(false), moreTokens_(false), skipEmptyRows_(skipEmptyRows) { for (auto& i : NA_) { if (i.empty()) { hasEmptyNA_ = true; break; } } } void TokenizerDelim::tokenize(SourceIterator begin, SourceIterator end) { cur_ = begin; end_ = end; begin_ = begin; row_ = 0; col_ = 0; state_ = STATE_DELIM; moreTokens_ = true; } std::pair TokenizerDelim::progress() { size_t bytes = cur_ - begin_; return std::make_pair(bytes / (double)(end_ - begin_), bytes); } Token TokenizerDelim::nextToken() { // Capture current position int row = row_; int col = col_; if (!moreTokens_) { return {TOKEN_EOF, row, col}; } SourceIterator token_begin = cur_; bool hasEscapeD = false; bool hasEscapeB = false; bool hasNull = false; while (cur_ < end_) { // Increments cur on destruct, ensuring that we always move on to the // next character Advance advance(&cur_); if (*cur_ == '\0') { hasNull = true; } if ((end_ - cur_) % 131072 == 0) { cpp11::check_user_interrupt(); } switch (state_) { case STATE_DELIM: { while (cur_ != end_ && *cur_ == ' ') { ++cur_; } if (*cur_ == '\r' || *cur_ == '\n') { if (col_ == 0 && skipEmptyRows_) { advanceForLF(&cur_, end_); token_begin = cur_ + 1; break; } newRecord(); return emptyToken(row, col); } if (isComment(cur_)) { state_ = STATE_COMMENT; } else if (*cur_ == delim_) { newField(); return emptyToken(row, col); } else if (*cur_ == quote_) { token_begin = cur_; state_ = STATE_STRING; } else if (escapeBackslash_ && *cur_ == '\\') { state_ = STATE_ESCAPE_F; } else { state_ = STATE_FIELD; } break; } case STATE_FIELD: if (*cur_ == '\r' || *cur_ == '\n') { newRecord(); return fieldToken( token_begin, advanceForLF(&cur_, end_), hasEscapeB, hasNull, row, col); } else if (isComment(cur_)) { newField(); state_ = STATE_COMMENT; return fieldToken(token_begin, cur_, hasEscapeB, hasNull, row, col); } else if (escapeBackslash_ && *cur_ == '\\') { state_ = STATE_ESCAPE_F; } else if (*cur_ == delim_) { newField(); return fieldToken(token_begin, cur_, hasEscapeB, hasNull, row, 
col); } break; case STATE_ESCAPE_F: hasEscapeB = true; state_ = STATE_FIELD; break; case STATE_QUOTE: if (*cur_ == quote_) { hasEscapeD = true; state_ = STATE_STRING; } else if (*cur_ == '\r' || *cur_ == '\n') { newRecord(); return stringToken( token_begin + 1, advanceForLF(&cur_, end_) - 1, hasEscapeB, hasEscapeD, hasNull, row, col); } else if (isComment(cur_)) { state_ = STATE_COMMENT; return stringToken( token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col); } else if (*cur_ == delim_) { newField(); return stringToken( token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col); } else { warn(row, col, "delimiter or quote", std::string(cur_, cur_ + 1)); state_ = STATE_STRING; } break; case STATE_STRING: if (*cur_ == quote_) { if (escapeDouble_) { state_ = STATE_QUOTE; } else { state_ = STATE_STRING_END; } } else if (escapeBackslash_ && *cur_ == '\\') { state_ = STATE_ESCAPE_S; } break; case STATE_STRING_END: if (*cur_ == '\r' || *cur_ == '\n') { newRecord(); return stringToken( token_begin + 1, advanceForLF(&cur_, end_) - 1, hasEscapeB, hasEscapeD, hasNull, row, col); } else if (isComment(cur_)) { state_ = STATE_COMMENT; return stringToken( token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col); } else if (*cur_ == delim_) { newField(); return stringToken( token_begin + 1, cur_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col); } else { state_ = STATE_FIELD; } break; case STATE_ESCAPE_S: hasEscapeB = true; state_ = STATE_STRING; break; case STATE_COMMENT: if (*cur_ == '\r' || *cur_ == '\n') { // If we have read at least one record on the current row go to the // next row, line, otherwise just ignore the line. if (col_ > 0) { row_++; row++; col_ = 0; } col = 0; advanceForLF(&cur_, end_); token_begin = cur_ + 1; state_ = STATE_DELIM; } break; } } // Reached end of Source: cur_ == end_ moreTokens_ = false; switch (state_) { case STATE_DELIM: if (col_ == 0) { return {TOKEN_EOF, row, col}; } else { return emptyToken(row, col); } case STATE_STRING_END: case STATE_QUOTE: return stringToken( token_begin + 1, end_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col); case STATE_STRING: warn(row, col, "closing quote at end of file"); return stringToken( token_begin + 1, end_, hasEscapeB, hasEscapeD, hasNull, row, col); case STATE_ESCAPE_S: case STATE_ESCAPE_F: warn(row, col, "closing escape at end of file"); return stringToken( token_begin, end_ - 1, hasEscapeB, hasEscapeD, hasNull, row, col); case STATE_FIELD: return fieldToken(token_begin, end_, hasEscapeB, hasNull, row, col); case STATE_COMMENT: return {TOKEN_EOF, row, col}; } return {TOKEN_EOF, row, col}; } bool TokenizerDelim::isComment(const char* cur) const { if (!hasComment_) { return false; } return starts_with_comment(cur, end_, comment_); } void TokenizerDelim::newField() { col_++; state_ = STATE_DELIM; } void TokenizerDelim::newRecord() { row_++; col_ = 0; state_ = STATE_DELIM; } Token TokenizerDelim::emptyToken(int row, int col) const { return {hasEmptyNA_ ? TOKEN_MISSING : TOKEN_EMPTY, row, col}; } Token TokenizerDelim::fieldToken( SourceIterator begin, SourceIterator end, bool hasEscapeB, bool hasNull, int row, int col) { Token t(begin, end, row, col, hasNull, (hasEscapeB) ? this : nullptr); if (trimWS_) { t.trim(); } t.flagNA(NA_); return t; } Token TokenizerDelim::stringToken( SourceIterator begin, SourceIterator end, bool hasEscapeB, bool hasEscapeD, bool hasNull, int row, int col) { Token t( begin, end, row, col, hasNull, (hasEscapeD || hasEscapeB) ? 
this : nullptr); if (trimWS_) { t.trim(); } if (quotedNA_) { t.flagNA(NA_); } return t; } void TokenizerDelim::unescape( SourceIterator begin, SourceIterator end, std::string* pOut) { if (escapeDouble_ && !escapeBackslash_) { unescapeDouble(begin, end, pOut); } else if (escapeBackslash_ && !escapeDouble_) { unescapeBackslash(begin, end, pOut); } else if (escapeBackslash_ && escapeDouble_) { cpp11::stop("Backslash & double escapes not supported at this time"); } } void TokenizerDelim::unescapeDouble( SourceIterator begin, SourceIterator end, std::string* pOut) const { pOut->reserve(end - begin); bool inEscape = false; for (SourceIterator cur = begin; cur != end; ++cur) { if (*cur == quote_) { if (inEscape) { pOut->push_back(*cur); inEscape = false; } else { inEscape = true; } } else { pOut->push_back(*cur); } } } void TokenizerDelim::unescapeBackslash( SourceIterator begin, SourceIterator end, std::string* pOut) { pOut->reserve(end - begin); bool inEscape = false; for (SourceIterator cur = begin; cur != end; ++cur) { if (inEscape) { switch (*cur) { case '\'': pOut->push_back('\''); break; case '"': pOut->push_back('"'); break; case '\\': pOut->push_back('\\'); break; case 'a': pOut->push_back('\a'); break; case 'b': pOut->push_back('\b'); break; case 'f': pOut->push_back('\f'); break; case 'n': pOut->push_back('\n'); break; case 'r': pOut->push_back('\r'); break; case 't': pOut->push_back('\t'); break; case 'v': pOut->push_back('\v'); break; default: if (*cur == delim_ || *cur == quote_ || isComment(cur)) { pOut->push_back(*cur); } else { pOut->push_back('\\'); pOut->push_back(*cur); warn(row_, col_, "standard escape", "\\" + std::string(cur, 1)); } break; } inEscape = false; } else { if (*cur == '\\') { inEscape = true; } else { pOut->push_back(*cur); } } } } readr/src/connection.h0000644000176200001440000000156414174704674014472 0ustar liggesusers#pragma once #include "cpp11/function.hpp" #include "cpp11/raws.hpp" inline SEXP R_GetConnection(SEXP con) { return con; } inline size_t R_ReadConnection(SEXP con, void* buf, size_t n) { static auto readBin = cpp11::package("base")["readBin"]; cpp11::raws res( readBin(con, cpp11::writable::raws(static_cast(0)), n)); memcpy(buf, RAW(res), res.size()); return res.size(); } inline size_t R_WriteConnection(SEXP con, void* buf, size_t n) { static auto writeBin = cpp11::package("base")["writeBin"]; cpp11::writable::raws payload(n); memcpy(RAW(payload), buf, n); writeBin(payload, con); return n; } inline void write_bytes(SEXP con, const char* bytes, size_t size) { size_t write_size; if ((write_size = R_WriteConnection(con, (void*)bytes, size)) != size) { cpp11::stop("write failed, expected %l, got %l", size, write_size); } } readr/src/TokenizerWs.h0000644000176200001440000000145514174704674014616 0ustar liggesusers#ifndef READR_TOKENIZERWS_H_ #define READR_TOKENIZERWS_H_ #include "Token.h" #include "Tokenizer.h" #include "utils.h" class TokenizerWs : public Tokenizer { std::vector NA_; SourceIterator begin_, cur_, curLine_, end_; int row_, col_; std::string comment_; bool moreTokens_, hasComment_; bool skipEmptyRows_; public: TokenizerWs( std::vector NA = std::vector(1, "NA"), const std::string& comment = "", bool skipEmptyRows = true); void tokenize(SourceIterator begin, SourceIterator end); std::pair progress(); Token nextToken(); private: Token fieldToken(SourceIterator begin, SourceIterator end, bool hasNull); bool isComment(const char* cur) const; bool isEmpty() const; void ignoreLine(); }; #endif 
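// Writing through R connections ----------------------------------------------
// connection.h above funnels all output through base::writeBin(), since R's
// C-level connection API is not public. The same pattern is usable from any
// cpp11 code: copy the bytes into an R raw vector and hand it to a base
// function looked up once. A minimal self-contained sketch (the function name
// is illustrative, not part of readr):

#include <cstring>

#include "cpp11/function.hpp"
#include "cpp11/raws.hpp"

inline void demo_write_to_connection(SEXP con, const char* bytes, size_t n) {
  // static: resolve base::writeBin once, not on every call
  static auto writeBin = cpp11::package("base")["writeBin"];
  cpp11::writable::raws payload(n); // R-managed buffer of n raw bytes
  std::memcpy(RAW(payload), bytes, n);
  writeBin(payload, con); // base R performs the actual I/O (file, gz, ...)
}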
readr/src/Tokenizer.h0000644000176200001440000000342114174704674014277 0ustar liggesusers#ifndef FASTREAD_TOKENIZER_H_ #define FASTREAD_TOKENIZER_H_ #include "cpp11/R.hpp" #include "cpp11/list.hpp" #include "cpp11/protect.hpp" #include "Warnings.h" #include class Token; typedef const char* SourceIterator; typedef std::pair SourceIterators; typedef void (*UnescapeFun)(SourceIterator, SourceIterator, std::string*); class Tokenizer; typedef std::shared_ptr TokenizerPtr; class Tokenizer { Warnings* pWarnings_; public: Tokenizer() : pWarnings_(NULL) {} virtual ~Tokenizer() {} virtual void tokenize(SourceIterator begin, SourceIterator end) = 0; virtual Token nextToken() = 0; // Percentage & bytes virtual std::pair progress() = 0; virtual void unescape(SourceIterator begin, SourceIterator end, std::string* pOut) { pOut->reserve(end - begin); for (SourceIterator cur = begin; cur != end; ++cur) pOut->push_back(*cur); } void setWarnings(Warnings* pWarnings) { pWarnings_ = pWarnings; } inline void warn( int row, int col, const std::string& expected, const std::string& actual = "") { if (pWarnings_ == NULL) { cpp11::warning( "[%i, %i]: expected %s", row + 1, col + 1, expected.c_str()); return; } pWarnings_->addWarning(row, col, expected, actual); } static TokenizerPtr create(const cpp11::list& spec); }; // ----------------------------------------------------------------------------- // Helper class for parsers - ensures iterator always advanced no matter // how loop is exited class Advance { SourceIterator* pIter_; public: Advance(SourceIterator* pIter) : pIter_(pIter) {} Advance(const Advance&) = delete; Advance& operator=(const Advance&) = delete; ~Advance() { (*pIter_)++; } }; #endif readr/src/Progress.h0000644000176200001440000000347614304131171014120 0ustar liggesusers#ifndef FASTREAD_PROGRESS_H_ #define FASTREAD_PROGRESS_H_ #include "cpp11/R.hpp" #include #include #include inline int now() { return clock() / CLOCKS_PER_SEC; } inline std::string showTime(int x) { std::stringstream ss; if (x < 60) { ss << x << " s"; return ss.str(); } else if (x < 60 * 60) { ss << x / 60 << " m"; return ss.str(); } else { ss << x / (60 * 60) << " h"; return ss.str(); } } class Progress { int timeMin_, timeInit_, timeStop_, width_; bool show_, stopped_; public: Progress(int min = 5, int width = Rf_GetOptionWidth()) : timeMin_(min), timeInit_(now()), timeStop_(now()), width_(width), show_(false), stopped_(false) {} void stop() { timeStop_ = now(); stopped_ = true; } void show(std::pair progress) { double prop = progress.first, size = progress.second / (1024 * 1024); double est = (now() - timeInit_) / prop; if (!show_) { if (est > timeMin_) { show_ = true; } else { return; } } std::stringstream labelStream; labelStream << std::setprecision(2) << std::fixed << " " << (int)(prop * 100) << "%"; if (size > 0) { labelStream << " " << std::setprecision(0) << size << " MB"; } std::string label = labelStream.str(); int barSize = width_ - label.size() - 2; if (barSize < 0) { return; } int nbars = prop * barSize; int nspaces = (1 - prop) * barSize; std::string bars(nbars, '='), spaces(nspaces, ' '); Rprintf("\r|%s%s|%s", bars.c_str(), spaces.c_str(), label.c_str()); } ~Progress() { try { if (!show_) return; if (!stopped_) timeStop_ = now(); Rprintf("\n"); } catch (...) 
{ } } }; #endif readr/src/Source.h0000644000176200001440000000203314174704674013563 0ustar liggesusers#ifndef FASTREAD_SOURCE_H_ #define FASTREAD_SOURCE_H_ #include "cpp11/list.hpp" #include "utils.h" #include class Source; typedef std::shared_ptr SourcePtr; class Source { public: Source() : skippedRows_(0) {} virtual ~Source() {} virtual const char* begin() = 0; virtual const char* end() = 0; const char* skipLines( const char* begin, const char* end, int n, bool skipEmptyRows = true, const std::string& comment = "", bool skipQuote = true); static const char* skipLine(const char* begin, const char* end, bool isComment, bool skipQuote); static const char* skipDoubleQuoted(const char* begin, const char* end); size_t skippedRows() { return skippedRows_; } static const char* skipBom(const char* begin, const char* end); static SourcePtr create(const cpp11::list& spec); private: static bool inComment(const char* cur, const char* end, const std::string& comment) { return starts_with_comment(cur, end, comment); } size_t skippedRows_; }; #endif readr/src/Collector.cpp0000644000176200001440000002370714174704674014617 0ustar liggesusers#include "cpp11/list.hpp" #include "Collector.h" #include "LocaleInfo.h" #include "QiParsers.h" #include "utils.h" CollectorPtr Collector::create(const cpp11::list& spec, LocaleInfo* pLocale) { std::string subclass(cpp11::as_cpp(spec.attr("class"))[0]); if (subclass == "collector_skip") { return CollectorPtr(new CollectorSkip()); } if (subclass == "collector_logical") { return CollectorPtr(new CollectorLogical()); } if (subclass == "collector_integer") { return CollectorPtr(new CollectorInteger()); } if (subclass == "collector_double") { return CollectorPtr(new CollectorDouble(pLocale->decimalMark_)); } if (subclass == "collector_number") { return CollectorPtr( new CollectorNumeric(pLocale->decimalMark_, pLocale->groupingMark_)); } if (subclass == "collector_character") { return CollectorPtr(new CollectorCharacter(&pLocale->encoder_)); } if (subclass == "collector_date") { SEXP format_ = spec["format"]; std::string format = (Rf_isNull(format_)) != 0U ? 
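// When no format is supplied (NULL), the collector falls back to the locale's
// date_format (the locale() default is "%AD", readr's flexible YMD parser);
// an explicit string such as col_date("%d/%m/%Y") overrides it per column.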
pLocale->dateFormat_ : cpp11::as_cpp(format_); return CollectorPtr(new CollectorDate(pLocale, format)); } if (subclass == "collector_datetime") { std::string format = cpp11::as_cpp(spec["format"]); return CollectorPtr(new CollectorDateTime(pLocale, format)); } if (subclass == "collector_time") { std::string format = cpp11::as_cpp(spec["format"]); return CollectorPtr(new CollectorTime(pLocale, format)); } if (subclass == "collector_factor") { cpp11::sexp levels(spec["levels"]); bool ordered = cpp11::as_cpp(spec["ordered"]); bool includeNa = cpp11::as_cpp(spec["include_na"]); return CollectorPtr( new CollectorFactor(&pLocale->encoder_, levels, ordered, includeNa)); } cpp11::stop("Unsupported column type"); return CollectorPtr(new CollectorSkip()); } std::vector collectorsCreate(const cpp11::list& specs, LocaleInfo* pLocale) { std::vector collectors; for (auto spec : specs) { CollectorPtr col(Collector::create(SEXP(spec), pLocale)); collectors.push_back(col); } return collectors; } // Implementations ------------------------------------------------------------ void CollectorCharacter::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_STRING: { std::string buffer; SourceIterators string = t.getString(&buffer); if (t.hasNull()) { warn(t.row(), t.col(), "", "embedded null"); } SET_STRING_ELT( column_, i, pEncoder_->makeSEXP(string.first, string.second, t.hasNull())); break; }; case TOKEN_MISSING: SET_STRING_ELT(column_, i, NA_STRING); break; case TOKEN_EMPTY: SET_STRING_ELT(column_, i, Rf_mkCharCE("", CE_UTF8)); break; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorCharacter::setValue(int i, const std::string& s) { SET_STRING_ELT(column_, i, Rf_mkCharCE(s.c_str(), CE_UTF8)); } void CollectorDate::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_STRING: { std::string buffer; SourceIterators string = t.getString(&buffer); std::string std_string(string.first, string.second); parser_.setDate(std_string.c_str()); bool res = (format_.empty()) ? parser_.parseLocaleDate() : parser_.parse(format_); if (!res) { warn(t.row(), t.col(), "date like " + format_, std_string); REAL(column_)[i] = NA_REAL; return; } DateTime dt = parser_.makeDate(); if (!dt.validDate()) { warn(t.row(), t.col(), "valid date", std_string); REAL(column_)[i] = NA_REAL; return; } REAL(column_)[i] = dt.date(); return; } case TOKEN_MISSING: case TOKEN_EMPTY: REAL(column_)[i] = NA_REAL; return; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorDateTime::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_STRING: { std::string buffer; SourceIterators string = t.getString(&buffer); std::string std_string(string.first, string.second); parser_.setDate(std_string.c_str()); bool res = (format_.empty()) ? 
parser_.parseISO8601() : parser_.parse(format_); if (!res) { warn(t.row(), t.col(), "date like " + format_, std_string); REAL(column_)[i] = NA_REAL; return; } DateTime dt = parser_.makeDateTime(); if (!dt.validDateTime()) { warn(t.row(), t.col(), "valid date", std_string); REAL(column_)[i] = NA_REAL; return; } REAL(column_)[i] = dt.datetime(); return; } case TOKEN_MISSING: case TOKEN_EMPTY: REAL(column_)[i] = NA_REAL; return; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorDouble::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_STRING: { std::string buffer; SourceIterators str = t.getString(&buffer); const char* end = str.second; bool ok = parseDouble(decimalMark_, str.first, str.second, REAL(column_)[i]); if (!ok) { REAL(column_)[i] = NA_REAL; SourceIterators org_str = t.getString(&buffer); warn(t.row(), t.col(), "a double", org_str); return; } if (str.second != end) { REAL(column_)[i] = NA_REAL; SourceIterators org_str = t.getString(&buffer); warn(t.row(), t.col(), "no trailing characters", org_str); return; } return; } case TOKEN_MISSING: case TOKEN_EMPTY: REAL(column_)[i] = NA_REAL; break; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorDouble::setValue(int i, size_t st) { REAL(column_)[i] = st; } void CollectorFactor::insert( int i, const cpp11::r_string& str, const Token& t) { auto it = levelset_.find(str); if (it == levelset_.end()) { if (implicitLevels_ || (includeNa_ && str == NA_STRING)) { int n = levelset_.size(); levelset_.insert(std::make_pair(str, n)); levels_.push_back(str); INTEGER(column_)[i] = n + 1; } else { warn(t.row(), t.col(), "value in level set", str); INTEGER(column_)[i] = NA_INTEGER; } } else { INTEGER(column_)[i] = it->second + 1; } } void CollectorFactor::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_EMPTY: case TOKEN_STRING: { std::string buffer; SourceIterators string = t.getString(&buffer); cpp11::r_string std_string( pEncoder_->makeSEXP(string.first, string.second, t.hasNull())); insert(i, std_string, t); return; }; case TOKEN_MISSING: if (includeNa_) { insert(i, NA_STRING, t); } else { INTEGER(column_)[i] = NA_INTEGER; } return; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorInteger::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_STRING: { std::string buffer; SourceIterators str = t.getString(&buffer); bool ok = parseInt(str.first, str.second, INTEGER(column_)[i]); if (!ok) { INTEGER(column_)[i] = NA_INTEGER; SourceIterators org_str = t.getString(&buffer); warn(t.row(), t.col(), "an integer", org_str); return; } if (str.first != str.second) { SourceIterators org_str = t.getString(&buffer); warn(t.row(), t.col(), "no trailing characters", org_str); INTEGER(column_)[i] = NA_INTEGER; return; } return; }; case TOKEN_MISSING: case TOKEN_EMPTY: INTEGER(column_)[i] = NA_INTEGER; break; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorLogical::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_STRING: { std::string buffer; SourceIterators string = t.getString(&buffer); std::string str(string.first, string.second); size_t len = string.second - string.first; if (isTrue(string.first, string.second) || (len == 1 && *string.first == '1')) { LOGICAL(column_)[i] = 1; return; } if (isFalse(string.first, string.second) || (len == 1 && *string.first == '0')) { LOGICAL(column_)[i] = 0; return; } warn(t.row(), t.col(), "1/0/T/F/TRUE/FALSE", string); LOGICAL(column_)[i] = NA_LOGICAL; return; }; case TOKEN_MISSING: case TOKEN_EMPTY: 
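// Every collector maps TOKEN_MISSING (a field matched by one of the na=
// strings) and TOKEN_EMPTY (a zero-width field) to the target type's NA;
// only the TOKEN_STRING arm differs between collectors. For logicals the
// accepted spellings are the true_values/false_values tables in utils.h plus
// bare "1"/"0", so parse_logical(c("T", "false", "1", "yes")) yields
// TRUE FALSE TRUE NA, recording a parsing problem for "yes".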
LOGICAL(column_)[i] = NA_LOGICAL; return; break; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorNumeric::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_STRING: { std::string buffer; SourceIterators str = t.getString(&buffer); bool ok = parseNumber( decimalMark_, groupingMark_, str.first, str.second, REAL(column_)[i]); if (!ok) { SourceIterators org_str = t.getString(&buffer); REAL(column_)[i] = NA_REAL; warn(t.row(), t.col(), "a number", org_str); return; } break; } case TOKEN_MISSING: case TOKEN_EMPTY: REAL(column_)[i] = NA_REAL; break; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorTime::setValue(int i, const Token& t) { switch (t.type()) { case TOKEN_STRING: { std::string buffer; SourceIterators string = t.getString(&buffer); std::string std_string(string.first, string.second); parser_.setDate(std_string.c_str()); bool res = (format_.empty()) ? parser_.parseLocaleTime() : parser_.parse(format_); if (!res) { warn(t.row(), t.col(), "time like " + format_, std_string); REAL(column_)[i] = NA_REAL; return; } DateTime dt = parser_.makeTime(); if (!dt.validDuration()) { warn(t.row(), t.col(), "valid duration", std_string); REAL(column_)[i] = NA_REAL; return; } REAL(column_)[i] = dt.time(); return; } case TOKEN_MISSING: case TOKEN_EMPTY: REAL(column_)[i] = NA_REAL; return; case TOKEN_EOF: cpp11::stop("Invalid token"); } } void CollectorRaw::setValue(int i, const Token& t) { if (t.type() == TOKEN_EOF) { cpp11::stop("Invalid token"); } SET_VECTOR_ELT(column_, i, t.asRaw()); } readr/src/utils.h0000644000176200001440000000404114174704674013464 0ustar liggesusers#ifndef FASTREAD_UTILS_H_ #define FASTREAD_UTILS_H_ #include #include #include // Advances iterator if the next character is a LF. // Returns iterator to end of line. template inline Iter advanceForLF(Iter* pBegin, Iter end) { Iter cur = *pBegin; if (cur == end) { return cur; } if (*cur == '\r' && (cur + 1 != end) && *(cur + 1) == '\n') (*pBegin)++; return cur; } const static char* const true_values[] = { "T", "t", "True", "TRUE", "true", (char*)NULL}; const static char* const false_values[] = { "F", "f", "False", "FALSE", "false", (char*)NULL}; inline bool isTrue(const char* start, const char* end) { size_t len = end - start; for (int i = 0; true_values[i]; i++) { size_t true_len = strlen(true_values[i]); if (true_len == len && strncmp(start, true_values[i], len) == 0) { return true; } } return false; } inline bool isFalse(const char* start, const char* end) { size_t len = end - start; for (int i = 0; false_values[i]; i++) { if (strlen(false_values[i]) == len && strncmp(start, false_values[i], len) == 0) { return true; } } return false; } inline bool isLogical(const char* start, const char* end) { return isTrue(start, end) || isFalse(start, end); } inline bool istarts_with(const std::string& input, const std::string& test) { if (test.size() > input.size()) { return false; } auto test_it = test.cbegin(); auto input_it = input.cbegin(); auto test_end = test.cend(); auto locale = std::locale(); while (test_it != test_end) { if (std::toupper(*test_it++, locale) != std::toupper(*input_it++, locale)) { return false; } } return true; } inline bool starts_with_comment( const char* cur, const char* end, const std::string& comment) { // If the comment is bigger than what we are testing, it cannot start with it. 
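// e.g. comment = "#!" against a one-byte remainder "#" fails the length test
// below without reading past `end`, while comment = "#" against "# note\n"
// matches on its first character and the caller discards the rest of the
// line.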
if ((long)comment.size() > (end - cur)) { return false; } for (auto c : comment) { if (*cur++ != c) { return false; } } return true; } #endif readr/src/parse.cpp0000644000176200001440000001052514174704674013775 0ustar liggesusers#include "cpp11/R.hpp" #include "cpp11/integers.hpp" #include "cpp11/list.hpp" #include "cpp11/sexp.hpp" #include #include "Collector.h" #include "LocaleInfo.h" #include "Source.h" #include "Tokenizer.h" #include "TokenizerLine.h" #include "Warnings.h" [[cpp11::register]] cpp11::integers dim_tokens_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec) { SourcePtr source = Source::create(sourceSpec); TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec); tokenizer->tokenize(source->begin(), source->end()); int rows = -1; int cols = -1; for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) { rows = t.row(); if ((int)t.col() > cols) { cols = t.col(); } } cpp11::writable::integers out(rows + 1); for (auto&& x : out) { x = cols + 1; } return out; } [[cpp11::register]] std::vector count_fields_( const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, int n_max) { SourcePtr source = Source::create(sourceSpec); TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec); tokenizer->tokenize(source->begin(), source->end()); std::vector fields; for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) { if (n_max > 0 && t.row() >= (size_t)n_max) { break; } if (t.row() >= fields.size()) { fields.resize(t.row() + 1); } fields[t.row()] = t.col() + 1; } return fields; } [[cpp11::register]] cpp11::list guess_header_( const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& locale_) { Warnings warnings; LocaleInfo locale(locale_); SourcePtr source = Source::create(sourceSpec); TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec); tokenizer->tokenize(source->begin(), source->end()); tokenizer->setWarnings(&warnings); CollectorCharacter out(&locale.encoder_); out.setWarnings(&warnings); Token t = tokenizer->nextToken(); size_t row_num = t.row(); size_t max_size = 0; size_t capacity = 0; for (; t.type() != TOKEN_EOF && t.row() == row_num; t = tokenizer->nextToken()) { if (t.col() >= max_size) { max_size = t.col(); } if (max_size >= capacity) { capacity = (max_size + 1) * 2; out.resize(capacity); } if (t.type() == TOKEN_STRING) { out.setValue(t.col(), t); } } out.resize(max_size + 1); using namespace cpp11::literals; return cpp11::writable::list( {"header"_nm = out.vector(), "skip"_nm = source->skippedRows() + 1}); } [[cpp11::register]] SEXP tokenize_( const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, int n_max) { Warnings warnings; SourcePtr source = Source::create(sourceSpec); TokenizerPtr tokenizer = Tokenizer::create(tokenizerSpec); tokenizer->tokenize(source->begin(), source->end()); tokenizer->setWarnings(&warnings); std::vector> rows; for (Token t = tokenizer->nextToken(); t.type() != TOKEN_EOF; t = tokenizer->nextToken()) { if (n_max > 0 && t.row() >= (size_t)n_max) { break; } if (t.row() >= rows.size()) { rows.resize(t.row() + 1); } std::vector& row = rows[t.row()]; if (t.col() >= row.size()) { row.resize(t.col() + 1); } row[t.col()] = t.asString(); } cpp11::writable::list out; out.reserve(rows.size()); for (auto&& row : rows) { cpp11::sexp row_data(cpp11::as_sexp(row)); out.push_back(row_data); } return warnings.addAsAttribute(out); } [[cpp11::register]] SEXP parse_vector_( const cpp11::strings& x, const cpp11::list& collectorSpec, const 
cpp11::list& locale_, const std::vector<std::string>& na, bool trim_ws) { Warnings warnings; int n = x.size(); LocaleInfo locale(locale_); std::shared_ptr<Collector> col(Collector::create(collectorSpec, &locale)); col->setWarnings(&warnings); col->resize(n); for (int i = 0; i < n; ++i) { Token t; if (x[i] == NA_STRING) { t = Token(TOKEN_MISSING, i, -1); } else { SEXP string = x[i]; t = Token(CHAR(string), CHAR(string) + Rf_length(string), i, -1, false); if (trim_ws) { t.trim(); } t.flagNA(na); } col->setValue(i, t); } return warnings.addAsAttribute(static_cast<SEXP>(col->vector())); } readr/src/Collector.h0000644000176200001440000001640214174704674014256 0ustar liggesusers#ifndef FASTREAD_COLLECTOR_H_ #define FASTREAD_COLLECTOR_H_ #include "cpp11/doubles.hpp" #include "cpp11/integers.hpp" #include "cpp11/list.hpp" #include "cpp11/logicals.hpp" #include "cpp11/strings.hpp" #include "DateTime.h" #include "DateTimeParser.h" #include "Iconv.h" #include "LocaleInfo.h" #include "Token.h" #include "Warnings.h" #include <map> #include <memory> #include <string> #include <vector> class Collector; typedef std::shared_ptr<Collector> CollectorPtr; class Collector { protected: cpp11::sexp column_; Warnings* pWarnings_; int n_; public: Collector(SEXP column, Warnings* pWarnings = NULL) : column_(column), pWarnings_(pWarnings), n_(0) {} virtual ~Collector(){}; virtual void setValue(int i, const Token& t) = 0; virtual void setValue(int i, const std::string& s){}; // nocov virtual void setValue(int i, size_t st){}; // nocov virtual cpp11::sexp vector() { return column_; }; virtual bool skip() { return false; } int size() { return n_; } void resize(int n) { if (n == n_) return; if (column_ == R_NilValue) return; #if R_VERSION >= R_Version(3, 4, 0) if (n > 0 && n < n_) { SET_TRUELENGTH(column_, n_); SETLENGTH(column_, n); SET_GROWABLE_BIT(column_); } else { column_ = Rf_lengthgets(column_, n); } #else column_ = Rf_lengthgets(column_, n); #endif n_ = n; } void clear() { resize(0); } void setWarnings(Warnings* pWarnings) { pWarnings_ = pWarnings; } inline void warn(int row, int col, std::string expected, std::string actual) { if (pWarnings_ == NULL) { cpp11::warning( "[%i, %i]: expected %s, but got '%s'", row + 1, col + 1, expected.c_str(), actual.c_str()); return; } pWarnings_->addWarning(row, col, expected, actual); } inline void warn(int row, int col, std::string expected, SourceIterators actual) { warn(row, col, expected, std::string(actual.first, actual.second)); } static CollectorPtr create(const cpp11::list& spec, LocaleInfo* pLocale); }; // Character ------------------------------------------------------------------- class CollectorCharacter : public Collector { Iconv* pEncoder_; public: CollectorCharacter(Iconv* pEncoder) : Collector(cpp11::writable::strings(R_xlen_t(0))), pEncoder_(pEncoder) {} void setValue(int i, const Token& t); void setValue(int i, const std::string& s); }; // Date ------------------------------------------------------------------------ class CollectorDate : public Collector { std::string format_; DateTimeParser parser_; public: CollectorDate(LocaleInfo* pLocale, const std::string& format) : Collector(cpp11::writable::doubles(R_xlen_t(0))), format_(format), parser_(pLocale) {} void setValue(int i, const Token& t); cpp11::sexp vector() { column_.attr("class") = "Date"; return column_; }; }; // Date time ------------------------------------------------------------------- class CollectorDateTime : public Collector { std::string format_; DateTimeParser parser_; std::string tz_; public: CollectorDateTime(LocaleInfo* pLocale, const std::string& format) :
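/* Note on Collector::resize() above: on R >= 3.4.0 a shrink happens in
   place; SET_TRUELENGTH() records the allocated length, SETLENGTH()
   truncates, and SET_GROWABLE_BIT() marks the vector so R's memory manager
   still sees the full buffer. Growing, or any resize on older R, falls back
   to Rf_lengthgets(), which allocates and copies. */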
Collector(cpp11::writable::doubles(R_xlen_t(0))), format_(format), parser_(pLocale), tz_(pLocale->tz_) {} void setValue(int i, const Token& t); cpp11::sexp vector() { column_.attr("class") = {"POSIXct", "POSIXt"}; column_.attr("tzone") = tz_; return column_; }; }; class CollectorDouble : public Collector { char decimalMark_; public: CollectorDouble(char decimalMark) : Collector(cpp11::writable::doubles(R_xlen_t(0))), decimalMark_(decimalMark) {} void setValue(int i, const Token& t); void setValue(int i, size_t st); }; class CollectorFactor : public Collector { Iconv* pEncoder_; std::vector<cpp11::r_string> levels_; std::map<cpp11::r_string, int> levelset_; bool ordered_, implicitLevels_, includeNa_; std::string buffer_; void insert(int i, const cpp11::r_string& str, const Token& t); public: CollectorFactor( Iconv* pEncoder, cpp11::sexp levels, bool ordered, bool includeNa) : Collector(cpp11::writable::integers(R_xlen_t(0))), pEncoder_(pEncoder), ordered_(ordered), includeNa_(includeNa) { implicitLevels_ = levels == R_NilValue; if (!implicitLevels_) { cpp11::strings lvls(levels); int n = lvls.size(); for (int i = 0; i < n; ++i) { cpp11::r_string std_level; if (STRING_ELT(lvls, i) != NA_STRING) { const char* level = Rf_translateCharUTF8(STRING_ELT(lvls, i)); std_level = level; } else { std_level = NA_STRING; } levels_.push_back(std_level); levelset_.insert(std::make_pair(std_level, i)); } } } void setValue(int i, const Token& t); cpp11::sexp vector() { if (ordered_) { column_.attr("class") = {"ordered", "factor"}; } else { column_.attr("class") = "factor"; } int n = levels_.size(); cpp11::writable::strings levels(n); for (int i = 0; i < n; ++i) { levels[i] = levels_[i]; } column_.attr("levels") = levels; return column_; }; }; class CollectorInteger : public Collector { public: CollectorInteger() : Collector(cpp11::writable::integers(R_xlen_t(0))) {} void setValue(int i, const Token& t); }; class CollectorLogical : public Collector { public: CollectorLogical() : Collector(cpp11::writable::logicals(R_xlen_t(0))) {} void setValue(int i, const Token& t); }; class CollectorNumeric : public Collector { char decimalMark_, groupingMark_; public: CollectorNumeric(char decimalMark, char groupingMark) : Collector(cpp11::writable::doubles(R_xlen_t(0))), decimalMark_(decimalMark), groupingMark_(groupingMark) {} void setValue(int i, const Token& t); bool isNum(char c); }; // Time --------------------------------------------------------------------- class CollectorTime : public Collector { std::string format_; DateTimeParser parser_; public: CollectorTime(LocaleInfo* pLocale, const std::string& format) : Collector(cpp11::writable::doubles(R_xlen_t(0))), format_(format), parser_(pLocale) {} void setValue(int i, const Token& t); cpp11::sexp vector() { column_.attr("class") = {"hms", "difftime"}; column_.attr("units") = "secs"; return column_; }; }; // Skip --------------------------------------------------------------------- class CollectorSkip : public Collector { public: CollectorSkip() : Collector(R_NilValue) {} void setValue(int i, const Token& t) {} bool skip() { return true; } }; // Raw ------------------------------------------------------------------------- class CollectorRaw : public Collector { public: CollectorRaw() : Collector(cpp11::writable::list(static_cast<R_xlen_t>(0))) {} void setValue(int i, const Token& t); }; // Helpers --------------------------------------------------------------------- std::vector<CollectorPtr> collectorsCreate(const cpp11::list& specs, LocaleInfo* pLocale); void collectorsResize(std::vector<CollectorPtr>& collectors, int n); void
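/* Note on CollectorFactor above: explicit levels are translated to UTF-8
   once in the constructor and indexed through levelset_, so matching a
   token against the levels is a map lookup rather than a linear scan over
   levels_; the stored integer codes follow R's 1-based factor convention,
   and vector() attaches the "levels" attribute plus class "factor" (or
   c("ordered", "factor") when ordered_ is set). */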
collectorsClear(std::vector<CollectorPtr>& collectors); std::string collectorGuess( const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger = false); #endif readr/src/DateTime.h0000644000176200001440000001313214174704674014021 0ustar liggesusers#ifndef READR_DATE_TIME_H_ #define READR_DATE_TIME_H_ #include "cpp11/R.hpp" #include <chrono> #include <string> #include <tzdb/tzdb.h> // tzdb supplies the date:: and tzdb:: facilities used below // Much of this code is adapted from R's src/main/datetime.c. // Author: The R Core Team. // License: GPL >= 2 static const int month_length[12] = { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}; static const int month_start[12] = { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334}; // Leap days occur in a 400 year cycle: this records the cumulative number // of leap days per cycle. Generated with: // is_leap <- function(y) (y %% 4) == 0 & ((y %% 100) != 0 | (y %% 400) == 0) // cumsum(is_leap(0:399)) static const int leap_days[400] = { 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 15, 16, 16, 16, 16, 17, 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21, 22, 22, 22, 22, 23, 23, 23, 23, 24, 24, 24, 24, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 27, 27, 27, 27, 28, 28, 28, 28, 29, 29, 29, 29, 30, 30, 30, 30, 31, 31, 31, 31, 32, 32, 32, 32, 33, 33, 33, 33, 34, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, 38, 38, 38, 38, 39, 39, 39, 39, 40, 40, 40, 40, 41, 41, 41, 41, 42, 42, 42, 42, 43, 43, 43, 43, 44, 44, 44, 44, 45, 45, 45, 45, 46, 46, 46, 46, 47, 47, 47, 47, 48, 48, 48, 48, 49, 49, 49, 49, 49, 49, 49, 49, 50, 50, 50, 50, 51, 51, 51, 51, 52, 52, 52, 52, 53, 53, 53, 53, 54, 54, 54, 54, 55, 55, 55, 55, 56, 56, 56, 56, 57, 57, 57, 57, 58, 58, 58, 58, 59, 59, 59, 59, 60, 60, 60, 60, 61, 61, 61, 61, 62, 62, 62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 65, 65, 65, 65, 66, 66, 66, 66, 67, 67, 67, 67, 68, 68, 68, 68, 69, 69, 69, 69, 70, 70, 70, 70, 71, 71, 71, 71, 72, 72, 72, 72, 73, 73, 73, 73, 73, 73, 73, 73, 74, 74, 74, 74, 75, 75, 75, 75, 76, 76, 76, 76, 77, 77, 77, 77, 78, 78, 78, 78, 79, 79, 79, 79, 80, 80, 80, 80, 81, 81, 81, 81, 82, 82, 82, 82, 83, 83, 83, 83, 84, 84, 84, 84, 85, 85, 85, 85, 86, 86, 86, 86, 87, 87, 87, 87, 88, 88, 88, 88, 89, 89, 89, 89, 90, 90, 90, 90, 91, 91, 91, 91, 92, 92, 92, 92, 93, 93, 93, 93, 94, 94, 94, 94, 95, 95, 95, 95, 96, 96, 96, 96, 97, 97, 97}; static const int cycle_days = 400 * 365 + 97; inline int is_leap(unsigned y) { return (y % 4) == 0 && ((y % 100) != 0 || (y % 400) == 0); } class DateTime { int year_, mon_, day_, hour_, min_, sec_, offset_; double psec_; std::string tz_; public: DateTime( int year, int mon, int day, int hour = 0, int min = 0, int sec = 0, double psec = 0, const std::string& tz = "UTC") : year_(year), mon_(mon), day_(day), hour_(hour), min_(min), sec_(sec), offset_(0), psec_(psec), tz_(tz) {} // Used to add time zone offsets which can only be easily applied once // we've converted into seconds since epoch. void setOffset(int offset) { offset_ = offset; } // Is this a valid date time?
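/* Worked example of the civil-calendar arithmetic that utcdate() below
   relies on (an illustrative sketch; the class itself delegates to
   date::sys_days). Note 146097 is exactly the cycle_days constant above. */
#if 0
// Howard Hinnant's days_from_civil algorithm, restated for illustration.
inline int days_from_civil(int y, unsigned m, unsigned d) {
  y -= m <= 2;                              // count years from March 1st
  const int era = (y >= 0 ? y : y - 399) / 400;
  const unsigned yoe = (unsigned)(y - era * 400);                       // [0, 399]
  const unsigned doy = (153 * (m + (m > 2 ? -3 : 9)) + 2) / 5 + d - 1;  // [0, 365]
  const unsigned doe = yoe * 365 + yoe / 4 - yoe / 100 + doy;           // [0, 146096]
  return era * 146097 + (int)doe - 719468;  // re-base day 0 to 1970-01-01
}
// days_from_civil(1970, 1, 1) == 0; days_from_civil(2024, 1, 1) == 19723.
#endif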
bool validDateTime() const { return validDate() && validTime(); } bool validDate() const { if (year_ < 0) return false; return (date::year{year_} / mon_ / day_).ok(); } bool validTime() const { if (sec_ < 0 || sec_ > 60) return false; if (min_ < 0 || min_ > 59) return false; if (hour_ < 0 || hour_ > 23) return false; return true; } bool validDuration() const { if (sec_ < -59 || sec_ > 59) return false; if (min_ < -59 || min_ > 59) return false; return true; } double datetime() const { return (tz_ == "UTC") ? utctime() : localtime(); } int date() const { return utcdate(); } double time() const { return psec_ + sec_ + (min_ * 60.0) + (hour_ * 3600.0); } private: // Number of seconds since 1970-01-01T00:00:00Z. // Compared to usual implementations this returns a double, and supports // a wider range of dates. Invalid dates have undefined behaviour. double utctime() const { return utcdate() * 86400.0 + time() + offset_; } // Find number of days since 1970-01-01. // Invalid dates have undefined behaviour. int utcdate() const { if (!validDate()) return NA_REAL; const date::year_month_day ymd{date::year(year_) / mon_ / day_}; const date::sys_days st{ymd}; return st.time_since_epoch().count(); } double localtime() const { if (!validDateTime()) return NA_REAL; const date::time_zone* p_time_zone; if (!tzdb::locate_zone(tz_, p_time_zone)) { throw std::runtime_error( "'" + tz_ + "' not found in the time zone database."); } const date::local_seconds lt = std::chrono::seconds{sec_} + std::chrono::minutes{min_} + std::chrono::hours{hour_} + date::local_days{date::year{year_} / mon_ / day_}; date::local_info info; if (!tzdb::get_local_info(lt, p_time_zone, info)) { throw std::runtime_error( "Can't lookup local time info for the supplied time zone."); } switch (info.result) { case date::local_info::unique: return (lt.time_since_epoch() - info.first.offset).count() + psec_ + offset_; case date::local_info::ambiguous: // Choose `earliest` of the two ambiguous times return (lt.time_since_epoch() - info.first.offset).count() + psec_ + offset_; case date::local_info::nonexistent: return NA_REAL; } throw std::runtime_error("should never happen"); } }; #endif readr/src/grisu3.h0000644000176200001440000000334314152512262013526 0ustar liggesusers#ifndef FASTREAD_GRISU3_H_ #define FASTREAD_GRISU3_H_ /* Copyright Jukka Jylänki Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* This file is part of an implementation of the "grisu3" double to string conversion algorithm described in the research paper "Printing Floating-Point Numbers Quickly And Accurately with Integers" by Florian Loitsch, available at http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf */ extern "C" { /// Converts the given double-precision floating point number to a string representation. /** For most inputs, this string representation is the shortest such, which deserialized again, returns the same bit representation of the double. @param v The number to convert.
@param dst [out] The double-precision floating point number will be written here as a null-terminated string. The conversion algorithm will write at most 25 bytes to this buffer. (null terminator is included in this count). The dst pointer may not be null. @return the number of characters written to dst, excluding the null terminator (which is always written) is returned here. */ int dtoa_grisu3(double v, char *dst); } #ifdef __cplusplus #include <string> std::string dtoa_grisu3_string(double v); #endif #endif readr/src/grisu3.c0000644000176200001440000003400214371264576013534 0ustar liggesusers/* Copyright Jukka Jylänki Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ /* Modifications to dtoa_grisu3() referenced mikkelfj: are under the following * Copyright (c) 2016 Mikkel F. Jørgensen, dvide.com * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. http://www.apache.org/licenses/LICENSE-2.0 */ /* This file is part of an implementation of the "grisu3" double to string conversion algorithm described in the research paper "Printing Floating-Point Numbers Quickly And Accurately with Integers" by Florian Loitsch, available at http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf */ #include <stdint.h> // uint64_t etc. #include <assert.h> // assert #include <math.h> // ceil #include <stdio.h> // snprintf #include <string.h> // memmove #ifdef _MSC_VER #pragma warning(disable : 4204) // nonstandard extension used : non-constant aggregate initializer #endif #define D64_SIGN 0x8000000000000000ULL #define D64_EXP_MASK 0x7FF0000000000000ULL #define D64_FRACT_MASK 0x000FFFFFFFFFFFFFULL #define D64_IMPLICIT_ONE 0x0010000000000000ULL #define D64_EXP_POS 52 #define D64_EXP_BIAS 1075 #define DIYFP_FRACT_SIZE 64 #define D_1_LOG2_10 0.30102999566398114 // 1 / lg(10) #define MIN_TARGET_EXP -60 #define MASK32 0xFFFFFFFFULL #define CAST_U64(d) (*(uint64_t*)&d) #define MIN(x,y) ((x) <= (y) ? (x) : (y)) #define MAX(x,y) ((x) >= (y) ?
(x) : (y)) #define MIN_CACHED_EXP -348 #define CACHED_EXP_STEP 8 typedef struct diy_fp { uint64_t f; int e; } diy_fp; typedef struct power { uint64_t fract; int16_t b_exp, d_exp; } power; static const power pow_cache[] = { { 0xfa8fd5a0081c0288ULL, -1220, -348 }, { 0xbaaee17fa23ebf76ULL, -1193, -340 }, { 0x8b16fb203055ac76ULL, -1166, -332 }, { 0xcf42894a5dce35eaULL, -1140, -324 }, { 0x9a6bb0aa55653b2dULL, -1113, -316 }, { 0xe61acf033d1a45dfULL, -1087, -308 }, { 0xab70fe17c79ac6caULL, -1060, -300 }, { 0xff77b1fcbebcdc4fULL, -1034, -292 }, { 0xbe5691ef416bd60cULL, -1007, -284 }, { 0x8dd01fad907ffc3cULL, -980, -276 }, { 0xd3515c2831559a83ULL, -954, -268 }, { 0x9d71ac8fada6c9b5ULL, -927, -260 }, { 0xea9c227723ee8bcbULL, -901, -252 }, { 0xaecc49914078536dULL, -874, -244 }, { 0x823c12795db6ce57ULL, -847, -236 }, { 0xc21094364dfb5637ULL, -821, -228 }, { 0x9096ea6f3848984fULL, -794, -220 }, { 0xd77485cb25823ac7ULL, -768, -212 }, { 0xa086cfcd97bf97f4ULL, -741, -204 }, { 0xef340a98172aace5ULL, -715, -196 }, { 0xb23867fb2a35b28eULL, -688, -188 }, { 0x84c8d4dfd2c63f3bULL, -661, -180 }, { 0xc5dd44271ad3cdbaULL, -635, -172 }, { 0x936b9fcebb25c996ULL, -608, -164 }, { 0xdbac6c247d62a584ULL, -582, -156 }, { 0xa3ab66580d5fdaf6ULL, -555, -148 }, { 0xf3e2f893dec3f126ULL, -529, -140 }, { 0xb5b5ada8aaff80b8ULL, -502, -132 }, { 0x87625f056c7c4a8bULL, -475, -124 }, { 0xc9bcff6034c13053ULL, -449, -116 }, { 0x964e858c91ba2655ULL, -422, -108 }, { 0xdff9772470297ebdULL, -396, -100 }, { 0xa6dfbd9fb8e5b88fULL, -369, -92 }, { 0xf8a95fcf88747d94ULL, -343, -84 }, { 0xb94470938fa89bcfULL, -316, -76 }, { 0x8a08f0f8bf0f156bULL, -289, -68 }, { 0xcdb02555653131b6ULL, -263, -60 }, { 0x993fe2c6d07b7facULL, -236, -52 }, { 0xe45c10c42a2b3b06ULL, -210, -44 }, { 0xaa242499697392d3ULL, -183, -36 }, { 0xfd87b5f28300ca0eULL, -157, -28 }, { 0xbce5086492111aebULL, -130, -20 }, { 0x8cbccc096f5088ccULL, -103, -12 }, { 0xd1b71758e219652cULL, -77, -4 }, { 0x9c40000000000000ULL, -50, 4 }, { 0xe8d4a51000000000ULL, -24, 12 }, { 0xad78ebc5ac620000ULL, 3, 20 }, { 0x813f3978f8940984ULL, 30, 28 }, { 0xc097ce7bc90715b3ULL, 56, 36 }, { 0x8f7e32ce7bea5c70ULL, 83, 44 }, { 0xd5d238a4abe98068ULL, 109, 52 }, { 0x9f4f2726179a2245ULL, 136, 60 }, { 0xed63a231d4c4fb27ULL, 162, 68 }, { 0xb0de65388cc8ada8ULL, 189, 76 }, { 0x83c7088e1aab65dbULL, 216, 84 }, { 0xc45d1df942711d9aULL, 242, 92 }, { 0x924d692ca61be758ULL, 269, 100 }, { 0xda01ee641a708deaULL, 295, 108 }, { 0xa26da3999aef774aULL, 322, 116 }, { 0xf209787bb47d6b85ULL, 348, 124 }, { 0xb454e4a179dd1877ULL, 375, 132 }, { 0x865b86925b9bc5c2ULL, 402, 140 }, { 0xc83553c5c8965d3dULL, 428, 148 }, { 0x952ab45cfa97a0b3ULL, 455, 156 }, { 0xde469fbd99a05fe3ULL, 481, 164 }, { 0xa59bc234db398c25ULL, 508, 172 }, { 0xf6c69a72a3989f5cULL, 534, 180 }, { 0xb7dcbf5354e9beceULL, 561, 188 }, { 0x88fcf317f22241e2ULL, 588, 196 }, { 0xcc20ce9bd35c78a5ULL, 614, 204 }, { 0x98165af37b2153dfULL, 641, 212 }, { 0xe2a0b5dc971f303aULL, 667, 220 }, { 0xa8d9d1535ce3b396ULL, 694, 228 }, { 0xfb9b7cd9a4a7443cULL, 720, 236 }, { 0xbb764c4ca7a44410ULL, 747, 244 }, { 0x8bab8eefb6409c1aULL, 774, 252 }, { 0xd01fef10a657842cULL, 800, 260 }, { 0x9b10a4e5e9913129ULL, 827, 268 }, { 0xe7109bfba19c0c9dULL, 853, 276 }, { 0xac2820d9623bf429ULL, 880, 284 }, { 0x80444b5e7aa7cf85ULL, 907, 292 }, { 0xbf21e44003acdd2dULL, 933, 300 }, { 0x8e679c2f5e44ff8fULL, 960, 308 }, { 0xd433179d9c8cb841ULL, 986, 316 }, { 0x9e19db92b4e31ba9ULL, 1013, 324 }, { 0xeb96bf6ebadf77d9ULL, 1039, 332 }, { 0xaf87023b9bf0ee6bULL, 1066, 340 } }; static int cached_pow(int exp, diy_fp *p) 
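/* Worked example for cached_pow() below (illustrative): for exp == -61,
   k = ceil((-61 + 63) * 0.30103) = 1, so i = (1 - (-348) - 1) / 8 + 1 = 44,
   which selects { 0x9c40000000000000ULL, -50, 4 }; indeed
   0x9c40000000000000 * 2^-50 == 40000 * 2^48 * 2^-50 == 10^4, matching the
   returned decimal exponent d_exp == 4. */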
{ int k = (int)ceil((exp+DIYFP_FRACT_SIZE-1) * D_1_LOG2_10); int i = (k-MIN_CACHED_EXP-1) / CACHED_EXP_STEP + 1; p->f = pow_cache[i].fract; p->e = pow_cache[i].b_exp; return pow_cache[i].d_exp; } static diy_fp minus(diy_fp x, diy_fp y) { diy_fp d; d.f = x.f - y.f; d.e = x.e; assert(x.e == y.e && x.f >= y.f); return d; } static diy_fp multiply(diy_fp x, diy_fp y) { uint64_t a, b, c, d, ac, bc, ad, bd, tmp; diy_fp r; a = x.f >> 32; b = x.f & MASK32; c = y.f >> 32; d = y.f & MASK32; ac = a*c; bc = b*c; ad = a*d; bd = b*d; tmp = (bd >> 32) + (ad & MASK32) + (bc & MASK32); tmp += 1U << 31; // round r.f = ac + (ad >> 32) + (bc >> 32) + (tmp >> 32); r.e = x.e + y.e + 64; return r; } static diy_fp normalize_diy_fp(diy_fp n) { assert(n.f != 0); while(!(n.f & 0xFFC0000000000000ULL)) { n.f <<= 10; n.e -= 10; } while(!(n.f & D64_SIGN)) { n.f <<= 1; --n.e; } return n; } static diy_fp double2diy_fp(double d) { diy_fp fp; uint64_t u64 = CAST_U64(d); if (!(u64 & D64_EXP_MASK)) { fp.f = u64 & D64_FRACT_MASK; fp.e = 1 - D64_EXP_BIAS; } else { fp.f = (u64 & D64_FRACT_MASK) + D64_IMPLICIT_ONE; fp.e = (int)((u64 & D64_EXP_MASK) >> D64_EXP_POS) - D64_EXP_BIAS; } return fp; } // pow10_cache[i] = 10^(i-1) static const unsigned int pow10_cache[] = { 0, 1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000 }; static int largest_pow10(uint32_t n, int n_bits, uint32_t *power) { int guess = ((n_bits + 1) * 1233 >> 12) + 1/*skip first entry*/; if (n < pow10_cache[guess]) { --guess; // We don't have any guarantees that 2^n_bits <= n. } *power = pow10_cache[guess]; return guess; } static int round_weed(char *buffer, int len, uint64_t wp_W, uint64_t delta, uint64_t rest, uint64_t ten_kappa, uint64_t ulp) { uint64_t wp_Wup = wp_W - ulp; uint64_t wp_Wdown = wp_W + ulp; while(rest < wp_Wup && delta - rest >= ten_kappa && (rest + ten_kappa < wp_Wup || wp_Wup - rest >= rest + ten_kappa - wp_Wup)) { --buffer[len-1]; rest += ten_kappa; } if (rest < wp_Wdown && delta - rest >= ten_kappa && (rest + ten_kappa < wp_Wdown || wp_Wdown - rest > rest + ten_kappa - wp_Wdown)) { return 0; } return 2*ulp <= rest && rest <= delta - 4*ulp; } static int digit_gen(diy_fp low, diy_fp w, diy_fp high, char *buffer, int *length, int *kappa) { uint64_t unit = 1; diy_fp too_low = { low.f - unit, low.e }; diy_fp too_high = { high.f + unit, high.e }; diy_fp unsafe_interval = minus(too_high, too_low); diy_fp one = { 1ULL << -w.e, w.e }; uint32_t p1 = (uint32_t)(too_high.f >> -one.e); uint64_t p2 = too_high.f & (one.f - 1); uint32_t div; *kappa = largest_pow10(p1, DIYFP_FRACT_SIZE + one.e, &div); *length = 0; while(*kappa > 0) { uint64_t rest; int digit = p1 / div; buffer[*length] = (char)('0' + digit); ++*length; p1 %= div; --*kappa; rest = ((uint64_t)p1 << -one.e) + p2; if (rest < unsafe_interval.f) { return round_weed(buffer, *length, minus(too_high, w).f, unsafe_interval.f, rest, (uint64_t)div << -one.e, unit); } div /= 10; } for(;;) { int digit; p2 *= 10; unit *= 10; unsafe_interval.f *= 10; // Integer division by one. digit = (int)(p2 >> -one.e); buffer[*length] = (char)('0' + digit); ++*length; p2 &= one.f - 1; // Modulo by one. 
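/* Note on multiply() above: it builds the exact 128-bit product of two
   64-bit significands from four 32x32-bit partial products and keeps only
   the top 64 bits; adding 1U << 31 to the discarded low half first turns
   plain truncation into round-to-nearest, and the exponent bookkeeping
   x.e + y.e + 64 accounts for the 64 dropped bits. */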
--*kappa; if (p2 < unsafe_interval.f) { return round_weed(buffer, *length, minus(too_high, w).f * unit, unsafe_interval.f, p2, one.f, unit); } } } static int grisu3(double v, char *buffer, int *length, int *d_exp) { int mk, kappa, success; diy_fp dfp = double2diy_fp(v); diy_fp w = normalize_diy_fp(dfp); // normalize boundaries diy_fp t = { (dfp.f << 1) + 1, dfp.e - 1 }; diy_fp b_plus = normalize_diy_fp(t); diy_fp b_minus; diy_fp c_mk; // Cached power of ten: 10^-k uint64_t u64 = CAST_U64(v); assert(v > 0 && v <= 1.7976931348623157e308); // Grisu only handles strictly positive finite numbers. if (!(u64 & D64_FRACT_MASK) && (u64 & D64_EXP_MASK) != 0) { b_minus.f = (dfp.f << 2) - 1; b_minus.e = dfp.e - 2;} // lower boundary is closer? else { b_minus.f = (dfp.f << 1) - 1; b_minus.e = dfp.e - 1; } b_minus.f = b_minus.f << (b_minus.e - b_plus.e); b_minus.e = b_plus.e; mk = cached_pow(MIN_TARGET_EXP - DIYFP_FRACT_SIZE - w.e, &c_mk); w = multiply(w, c_mk); b_minus = multiply(b_minus, c_mk); b_plus = multiply(b_plus, c_mk); success = digit_gen(b_minus, w, b_plus, buffer, length, &kappa); *d_exp = kappa - mk; return success; } static int i_to_str(int val, char *str) { int len, i; char *s; char *begin = str; if (val < 0) { *str++ = '-'; val = -val; } s = str; for(;;) { int ni = val / 10; int digit = val - ni*10; *s++ = (char)('0' + digit); if (ni == 0) { break; } val = ni; } *s = '\0'; len = (int)(s - str); for(i = 0; i < len/2; ++i) { char ch = str[i]; str[i] = str[len-1-i]; str[len-1-i] = ch; } return (int)(s - begin); } int dtoa_grisu3(double v, char *dst) { int d_exp, len, success, decimals, i; uint64_t u64 = CAST_U64(v); char *s2 = dst; assert(dst); // Prehandle NaNs // Why size = 22? // 5 for "NaN()" // 16 for two hexadecimal integers at width 8 // 1 for null terminator if ((u64 << 1) > 0xFFE0000000000000ULL) return snprintf(dst, 22, "NaN(%08X%08X)", (uint32_t)(u64 >> 32), (uint32_t)u64); // Prehandle negative values. if ((u64 & D64_SIGN) != 0) { *s2++ = '-'; v = -v; u64 ^= D64_SIGN; } // Prehandle zero. if (!u64) { *s2++ = '0'; *s2 = '\0'; return (int)(s2 - dst); } // Prehandle infinity. if (u64 == D64_EXP_MASK) { *s2++ = 'i'; *s2++ = 'n'; *s2++ = 'f'; *s2 = '\0'; return (int)(s2 - dst); } success = grisu3(v, s2, &len, &d_exp); // If grisu3 was not able to convert the number to a string, then use old sprintf (suboptimal). // (Putative) rationale for size = 30: // 17 digits after decimal at most // 1 for the `.` // 1 for a possible `-`, if the number is negative // 5 for a possible e+308 if it chooses exponential form and uses the largest // exponent possible // 1 for null terminator // -- // 25 total so far // 5 left for displaying the value before the decimal (in the worst case, // which I'm not even sure is possible) // More context: when vroom calls dtoa_grisu3(), dst points to a buffer of // size 33 (at the time of writing), and that's where s2 starts out FWIW. if (!success) return snprintf(s2, 30, "%.17g", v) + (int)(s2 - dst); // handle whole numbers as integers if they are < 10^15 if (d_exp >= 0 && d_exp <= MAX(2, 15 - len)) { while(d_exp-- > 0) { s2[len++] = '0'; } s2[len] = '\0'; return (int)(s2+len-dst); } // We now have an integer string of form "151324135" and a base-10 exponent for that number. // Next, decide the best presentation for that string by whether to use a decimal point, or the scientific exponent notation 'e'. // We don't pick the absolute shortest representation, but pick a balance between readability and shortness, e.g.
// 1.545056189557677e-308 could be represented in a shorter form // 1545056189557677e-323 but that would be somewhat unreadable. decimals = MIN(-d_exp, MAX(1, len-1)); // mikkelfj: // fix zero prefix .1 => 0.1, important for JSON export. // prefer unscientific notation at same length: // -1.2345e-4 over -1.00012345, // -1.0012345 over -1.2345e-3 if (d_exp < 0 && (len + d_exp) > -3 && len <= -d_exp) { // mikkelfj: fix zero prefix .1 => 0.1, and short exponents 1.3e-2 => 0.013. memmove(s2 + 2 - d_exp - len, s2, len); s2[0] = '0'; s2[1] = '.'; for (i = 2; i < 2-d_exp-len; ++i) { s2[i] = '0'; } len += i; } else if (d_exp < 0 && len > 1) // Add decimal point? { for(i = 0; i < decimals; ++i) { s2[len-i] = s2[len-i-1]; } s2[len++ - decimals] = '.'; d_exp += decimals; // Need scientific notation as well? if (d_exp != 0) { s2[len++] = 'e'; len += i_to_str(d_exp, s2+len); } }// Add scientific notation? else if (d_exp < 0 || d_exp > 2) { s2[len++] = 'e'; len += i_to_str(d_exp, s2+len); } // Add zeroes instead of scientific notation? s2[len] = '\0'; // grisu3 doesn't null terminate, so ensure termination. return (int)(s2+len-dst); } readr/src/connection.cpp0000644000176200001440000000164714174704674015017 0ustar liggesusers#include "cpp11/R.hpp" #include "cpp11/function.hpp" #include "cpp11/raws.hpp" #include "cpp11/strings.hpp" #include <fstream> // Wrapper around R's readBin function SEXP read_bin(const cpp11::sexp& con, int bytes) { static auto readBin = cpp11::package("base")["readBin"]; return readBin(con, "raw", bytes); } // Read data from a connection in chunks and then combine into a single // raw vector. // [[cpp11::register]] std::string read_connection_(const cpp11::sexp& con, std::string filename, int chunk_size) { std::ofstream out(filename.c_str(), std::fstream::out | std::fstream::binary); SEXP chunk = read_bin(con, chunk_size); R_xlen_t chunk_len = Rf_xlength(chunk); while (chunk_len > 0) { std::copy( RAW(chunk), RAW(chunk) + Rf_xlength(chunk), std::ostream_iterator<char>(out)); chunk = read_bin(con, chunk_size); chunk_len = Rf_xlength(chunk); } return filename; } readr/src/TokenizerFwf.cpp0000644000176200001440000001713014174704674015277 0ustar liggesusers#include "cpp11/list.hpp" #include "cpp11/protect.hpp" #include "Tokenizer.h" #include "TokenizerFwf.h" #include "utils.h" #include "Source.h" #include <sstream> #include <utility> struct skip_t { SourceIterator begin; int lines; }; skip_t skip_comments( SourceIterator begin, SourceIterator end, const std::string& comment = "") { skip_t out; if (comment.length() == 0) { out.begin = begin; out.lines = 0; return out; } SourceIterator cur = begin; int skip = 0; while (starts_with_comment(cur, end, comment)) { // Skip rest of line while (cur != end && *cur != '\n' && *cur != '\r') { ++cur; } advanceForLF(&cur, end); ++cur; ++skip; } out.begin = cur; out.lines = skip; return out; } std::vector<bool> emptyCols_(SourceIterator begin, SourceIterator end, size_t n = 100) { std::vector<bool> is_white; size_t row = 0; size_t col = 0; for (SourceIterator cur = begin; cur != end; ++cur) { if (row > n) { break; } switch (*cur) { case '\n': case '\r': advanceForLF(&cur, end); col = 0; row++; break; case ' ': col++; break; default: // Make sure there's enough room if (col >= is_white.size()) { is_white.resize(col + 1, true); } is_white[col] = false; col++; } } return is_white; } [[cpp11::register]] cpp11::list whitespaceColumns(const cpp11::list& sourceSpec, int n, std::string comment) { SourcePtr source = Source::create(sourceSpec); skip_t s = skip_comments(source->begin(), source->end(),
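/* Worked example for emptyCols_() above and the begin/end pairing that
   follows (illustrative): for the two fixed-width lines "ab  cd" and
   "ef  gh", is_white ends up {false, false, true, true, false, false}, so
   whitespaceColumns() reports begin = {0, 4} and end = {2, 6}: two data
   columns separated by an all-blank gutter. */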
std::move(comment)); std::vector<bool> empty = emptyCols_(s.begin, source->end(), n); std::vector<int> begin; std::vector<int> end; bool in_col = false; for (size_t i = 0; i < empty.size(); ++i) { if (in_col && empty[i]) { end.push_back(i); in_col = false; } else if (!in_col && !empty[i]) { begin.push_back(i); in_col = true; } } if (in_col) { end.push_back(empty.size()); } using namespace cpp11::literals; return cpp11::writable::list( {"begin"_nm = begin, "end"_nm = end, "skip"_nm = s.lines}); } // TokenizerFwf -------------------------------------------------------------- #include "TokenizerFwf.h" TokenizerFwf::TokenizerFwf( const std::vector<int>& beginOffset, const std::vector<int>& endOffset, std::vector<std::string> NA, const std::string& comment, bool trimWS, bool skipEmptyRows) : beginOffset_(beginOffset), endOffset_(endOffset), NA_(std::move(NA)), cols_(beginOffset.size()), comment_(comment), moreTokens_(false), hasComment_(!comment.empty()), trimWS_(trimWS), skipEmptyRows_(skipEmptyRows) { if (beginOffset_.size() != endOffset_.size()) { cpp11::stop( "Begin (%i) and end (%i) specifications must have equal length", beginOffset_.size(), endOffset_.size()); } if (beginOffset_.empty()) { cpp11::stop("Zero-length begin and end specifications not supported"); } // File is assumed to be ragged (last column can have variable width) // when the last element of endOffset_ is NA isRagged_ = endOffset_[endOffset_.size() - 1L] == NA_INTEGER; max_ = 0; for (int j = 0; j < (cols_ - static_cast<int>(isRagged_)); ++j) { if (endOffset_[j] <= beginOffset_[j]) { cpp11::stop( "Begin offset (%i) must be smaller than end offset (%i)", beginOffset_[j], endOffset_[j]); } if (beginOffset_[j] < 0) { cpp11::stop("Begin offset (%i) must be greater than 0", beginOffset_[j]); } if (endOffset_[j] < 0) { cpp11::stop("End offset (%i) must be greater than 0", endOffset_[j]); } if (endOffset_[j] > max_) { max_ = endOffset_[j]; } } } void TokenizerFwf::tokenize(SourceIterator begin, SourceIterator end) { cur_ = begin; curLine_ = begin; begin_ = begin; end_ = end; row_ = 0; col_ = 0; moreTokens_ = true; } std::pair<double, size_t> TokenizerFwf::progress() { size_t bytes = cur_ - begin_; return std::make_pair(bytes / (double)(end_ - begin_), bytes); } Token TokenizerFwf::nextToken() { if (!moreTokens_) { return {TOKEN_EOF, 0, 0}; } // Check for comments only at start of line while (cur_ != end_ && col_ == 0 && (isComment(cur_) || (isEmpty() && skipEmptyRows_))) { // Skip rest of line while (cur_ != end_ && *cur_ != '\n' && *cur_ != '\r') { ++cur_; } advanceForLF(&cur_, end_); if (cur_ != end_) { ++cur_; } curLine_ = cur_; } // Find start of field SourceIterator fieldBegin = cur_; findBeginning: int skip = beginOffset_[col_] - (cur_ - curLine_); if (skip < 0) { // overlapping case fieldBegin += skip; } else if (skip > 0) { // skipped columns case for (int i = 0; i < skip; ++i) { if (fieldBegin == end_) { break; } if (*fieldBegin == '\n' || *fieldBegin == '\r') { std::stringstream ss1; ss1 << skip << " chars between fields"; std::stringstream ss2; ss2 << skip << " chars until end of line"; warn(row_, col_, ss1.str(), ss2.str()); row_++; col_ = 0; advanceForLF(&fieldBegin, end_); if (fieldBegin != end_) { fieldBegin++; } cur_ = curLine_ = fieldBegin; goto findBeginning; } fieldBegin++; } } if (fieldBegin == end_) { // need to warn here if col != 0/cols - 1 moreTokens_ = false; return {TOKEN_EOF, 0, 0}; } // Find end of field SourceIterator fieldEnd = fieldBegin; bool lastCol = (col_ == cols_ - 1); bool tooShort = false; bool hasNull = false; if (lastCol && isRagged_) { // Last
column is ragged, so read until end of line (ignoring width) while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') { if (*fieldEnd == '\0') { hasNull = true; } fieldEnd++; } } else { int width = endOffset_[col_] - beginOffset_[col_]; // Find the end of the field, stopping for newlines for (int i = 0; i < width; ++i) { if (fieldEnd == end_ || *fieldEnd == '\n' || *fieldEnd == '\r') { if (!(col_ == 0 && !skipEmptyRows_)) { std::stringstream ss1; ss1 << i << " chars"; std::stringstream ss2; ss2 << i; warn(row_, col_, ss1.str(), ss2.str()); } tooShort = true; break; } if (*fieldEnd == '\0') { hasNull = true; } fieldEnd++; } } Token t = fieldToken(fieldBegin, fieldEnd, hasNull); if (lastCol || tooShort) { row_++; col_ = 0; if (!(tooShort || isRagged_)) { // Proceed to the end of the line when you are possibly not there. // This is needed in case the last column in the file is not being read. while (fieldEnd != end_ && *fieldEnd != '\r' && *fieldEnd != '\n') { fieldEnd++; } } curLine_ = fieldEnd; advanceForLF(&curLine_, end_); if (curLine_ != end_) { curLine_++; } cur_ = curLine_; } else { col_++; cur_ = fieldEnd; } return t; } Token TokenizerFwf::fieldToken( SourceIterator begin, SourceIterator end, bool hasNull) { if (begin == end) { return {TOKEN_MISSING, row_, col_}; } Token t = Token(begin, end, row_, col_, hasNull); if (trimWS_) { t.trim(); } t.flagNA(NA_); return t; } bool TokenizerFwf::isComment(const char* cur) const { if (!hasComment_) { return false; } return starts_with_comment(cur, end_, comment_); } bool TokenizerFwf::isEmpty() const { return cur_ == end_ || *cur_ == '\r' || *cur_ == '\n'; } readr/src/SourceFile.h0000644000176200001440000000166514174704674014365 0ustar liggesusers#ifndef FASTREAD_SOURCEFILE_H_ #define FASTREAD_SOURCEFILE_H_ #include "Source.h" #include "cpp11/protect.hpp" #include "unicode_fopen.h" class SourceFile : public Source { mio::mmap_source source_; const char* begin_; const char* end_; public: SourceFile( const std::string& path, int skip = 0, bool skipEmptyRows = true, const std::string& comment = "", bool skipQuotes = true) { std::error_code error; source_ = make_mmap_source(path.c_str(), error); if (error) { cpp11::stop("Cannot read file %s: %s", path.c_str(), error.message().c_str()); } begin_ = source_.begin(); end_ = begin_ + source_.size(); // Skip byte order mark, if needed begin_ = skipBom(begin_, end_); // Skip lines, if needed begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); } const char* begin() { return begin_; } const char* end() { return end_; } }; #endif readr/src/Iconv.h0000644000176200001440000000102714174357220013372 0ustar liggesusers#ifndef READ_ICONV_H_ #define READ_ICONV_H_ #include "cpp11/R.hpp" #include <string> #include "R_ext/Riconv.h" #include <cstddef> class Iconv { void* cd_; std::string buffer_; public: Iconv(const std::string& from, const std::string& to = "UTF-8"); virtual ~Iconv(); SEXP makeSEXP(const char* start, const char* end, bool hasNull = true); std::string makeString(const char* start, const char* end); private: // Returns number of characters in buffer size_t convert(const char* start, const char* end); }; #endif readr/src/CollectorGuess.cpp0000644000176200001440000000711114174704674015615 0ustar liggesusers#include "cpp11/R.hpp" #include "cpp11/list.hpp" #include "cpp11/strings.hpp" #include "DateTime.h" #include "DateTimeParser.h" #include "LocaleInfo.h" #include "QiParsers.h" #include "utils.h" typedef bool (*canParseFun)(const std::string&, LocaleInfo* pLocale); bool canParse( const cpp11::strings& x,
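/* Illustrative inputs for the guessing predicates that follow: "T" and
   "false" satisfy isLogical(); "15" satisfies isInteger() and isDouble();
   "0123" fails them all (a leading zero not followed by the decimal mark)
   and falls through to character; "1,234" fails isDouble() but satisfies
   isNumber() when the locale's grouping mark is ','. collectorGuess()
   applies these checks from strictest to most flexible. */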
const canParseFun& canParse, LocaleInfo* pLocale) { for (const auto & i : x) { if (i == NA_STRING) { continue; } if (i.size() == 0) { continue; } if (!canParse(std::string(i), pLocale)) { return false; } } return true; } bool allMissing(const cpp11::strings& x) { for (const auto & i : x) { if (i != NA_STRING && i.size() > 0) { return false; } } return true; } bool isLogical(const std::string& x, LocaleInfo* /*unused*/) { const char* const str = x.data(); bool res = isLogical(str, str + x.size()); return res; } bool isNumber(const std::string& x, LocaleInfo* pLocale) { // Leading zero not followed by decimal mark if (x[0] == '0' && x.size() > 1 && x[1] != pLocale->decimalMark_) { return false; } double res = 0; std::string::const_iterator begin = x.begin(); std::string::const_iterator end = x.end(); bool ok = parseNumber( pLocale->decimalMark_, pLocale->groupingMark_, begin, end, res); return ok && begin == x.begin() && end == x.end(); } bool isInteger(const std::string& x, LocaleInfo* /*unused*/) { // Leading zero if (x[0] == '0' && x.size() > 1) { return false; } double res = 0; std::string::const_iterator begin = x.begin(); std::string::const_iterator end = x.end(); return parseInt(begin, end, res) && begin == end; } bool isDouble(const std::string& x, LocaleInfo* pLocale) { // Leading zero not followed by decimal mark if (x[0] == '0' && x.size() > 1 && x[1] != pLocale->decimalMark_) { return false; } double res = 0; const char* begin = x.c_str(); const char* end = begin + x.size(); return parseDouble(pLocale->decimalMark_, begin, end, res) && end == begin + x.size(); } bool isTime(const std::string& x, LocaleInfo* pLocale) { DateTimeParser parser(pLocale); parser.setDate(x.c_str()); return parser.parseLocaleTime(); } bool isDate(const std::string& x, LocaleInfo* pLocale) { DateTimeParser parser(pLocale); parser.setDate(x.c_str()); return parser.parseLocaleDate(); } static bool isDateTime(const std::string& x, LocaleInfo* pLocale) { DateTimeParser parser(pLocale); parser.setDate(x.c_str()); bool ok = parser.parseISO8601(); if (!ok) { return false; } if (!parser.compactDate()) { return true; } // Values like 00014567 are unlikely to be dates, so don't guess return parser.year() > 999; } [[cpp11::register]] std::string collectorGuess( const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger) { LocaleInfo locale(static_cast<cpp11::list>(locale_)); if (input.size() == 0) { return "character"; } if (allMissing(input)) { return "logical"; } // Work from strictest to most flexible if (canParse(input, isLogical, &locale)) { return "logical"; } if (guessInteger && canParse(input, isInteger, &locale)) { return "integer"; } if (canParse(input, isDouble, &locale)) { return "double"; } if (canParse(input, isNumber, &locale)) { return "number"; } if (canParse(input, isTime, &locale)) { return "time"; } if (canParse(input, isDate, &locale)) { return "date"; } if (canParse(input, isDateTime, &locale)) { return "datetime"; } // Otherwise can always parse as a character return "character"; } readr/src/datetime.cpp0000644000176200001440000000157414174704674014453 0ustar liggesusers#include "cpp11/doubles.hpp" #include "cpp11/integers.hpp" #include "cpp11/protect.hpp" #include "DateTime.h" [[cpp11::register]] cpp11::writable::doubles utctime_( const cpp11::integers& year, const cpp11::integers& month, const cpp11::integers& day, const cpp11::integers& hour, const cpp11::integers& min, const cpp11::integers& sec, const cpp11::doubles& psec) { int n = year.size(); if (month.size() != n || day.size() != n
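/* Note on utctime_(): the surrounding length check rejects ragged component
   vectors up front; each remaining row is assembled into a DateTime whose
   datetime() method yields POSIXct seconds since the epoch, so the
   components (2010, 10, 1, 5, 24, 30, 0) map to the value for
   2010-10-01 05:24:30 UTC (an illustrative example). */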
|| hour.size() != n || min.size() != n || sec.size() != n || psec.size() != n) { cpp11::stop("All inputs must be same length"); } cpp11::writable::doubles out(n); for (int i = 0; i < n; ++i) { DateTime dt( year[i], month[i], day[i], hour[i], min[i], sec[i], psec[i], "UTC"); out[i] = dt.datetime(); } out.attr("class") = {"POSIXct", "POSIXt"}; out.attr("tzone") = "UTC"; return out; } readr/src/LocaleInfo.h0000644000176200001440000000071214174704674014340 0ustar liggesusers#ifndef FASTREAD_LOCALINFO #define FASTREAD_LOCALINFO #include "Iconv.h" #include "cpp11/list.hpp" #include <string> #include <vector> class LocaleInfo { public: // LC_TIME std::vector<std::string> mon_, monAb_, day_, dayAb_, amPm_; std::string dateFormat_, timeFormat_; // LC_NUMERIC char decimalMark_, groupingMark_; // LC_MISC std::string tz_; std::string encoding_; Iconv encoder_; LocaleInfo(const cpp11::list& x); }; #endif readr/src/SourceString.h0000644000176200001440000000142214174357220014742 0ustar liggesusers#ifndef FASTREAD_SOURCESTRING_H_ #define FASTREAD_SOURCESTRING_H_ #include "cpp11/strings.hpp" #include "Source.h" class SourceString : public Source { cpp11::sexp string_; const char* begin_; const char* end_; public: SourceString( cpp11::strings x, int skip = 0, bool skipEmptyRows = true, const std::string& comment = "", bool skipQuotes = true) : string_(static_cast<SEXP>(x[0])) { begin_ = CHAR(string_); end_ = begin_ + Rf_xlength(string_); // Skip byte order mark, if needed begin_ = skipBom(begin_, end_); // Skip lines, if needed begin_ = skipLines(begin_, end_, skip, skipEmptyRows, comment, skipQuotes); } const char* begin() { return begin_; } const char* end() { return end_; } }; #endif readr/src/DateTimeParser.h0000644000176200001440000003040414174704674015177 0ustar liggesusers#ifndef FASTREAD_DATE_TIME_PARSER_H_ #define FASTREAD_DATE_TIME_PARSER_H_ #include "DateTime.h" #include "LocaleInfo.h" #include "QiParsers.h" #include "cpp11/protect.hpp" #include "utils.h" #include <algorithm> #include <cctype> // Parsing --------------------------------------------------------------------- class DateTimeParser { int sign_, year_, mon_, day_, hour_, min_, sec_; double psec_; int amPm_; bool compactDate_; // used for guessing int tzOffsetHours_, tzOffsetMinutes_; std::string tz_; LocaleInfo* pLocale_; std::string tzDefault_; const char* dateItr_; const char* dateEnd_; public: DateTimeParser(LocaleInfo* pLocale) : pLocale_(pLocale), tzDefault_(pLocale->tz_), dateItr_(NULL), dateEnd_(NULL) { reset(); } // Parse ISO8601 date time. In benchmarks this only seems ~30% faster than // parsing with a format string so it doesn't seem necessary to add individual // parsers for other common formats.
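/* Usage sketch for the parser below (illustrative; the LocaleInfo instance
   is assumed to have been built from the R side's locale() object): */
#if 0
DateTimeParser parser(&locale);
parser.setDate("2010-10-01T05:24:30Z");
if (parser.parseISO8601()) {
  DateTime dt = parser.makeDateTime(); // tz offset from %z/Z already folded in
  double since_epoch = dt.datetime();  // seconds since 1970-01-01T00:00:00Z
}
#endif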
bool parseISO8601(bool partial = true) { // Date: YYYY-MM-DD, YYYYMMDD if (!consumeInteger(4, &year_)) return false; if (consumeThisChar('-')) compactDate_ = false; if (!consumeInteger(2, &mon_)) return false; if (!compactDate_ && !consumeThisChar('-')) return false; if (!consumeInteger(2, &day_)) return false; if (isComplete()) return true; // Spec requires T, but common to use space instead char next; if (!consumeChar(&next)) return false; if (next != 'T' && next != ' ') return false; // hh:mm:ss.sss, hh:mm:ss, hh:mm, hh // hhmmss.sss, hhmmss, hhmm if (!consumeInteger(2, &hour_)) return false; consumeThisChar(':'); consumeInteger(2, &min_); consumeThisChar(':'); consumeSeconds(&sec_, &psec_); if (isComplete()) return true; // Has a timezone tz_ = "UTC"; if (!consumeTzOffset(&tzOffsetHours_, &tzOffsetMinutes_)) return false; return isComplete(); } bool parseLocaleTime() { return parse(pLocale_->timeFormat_); } bool parseLocaleDate() { return parse(pLocale_->dateFormat_); } // A flexible time parser for the most common formats bool parseTime() { if (!consumeInteger(2, &hour_, false)) return false; if (!consumeThisChar(':')) return false; if (!consumeInteger(2, &min_)) return false; consumeThisChar(':'); consumeSeconds(&sec_, NULL); consumeWhiteSpace(); consumeString(pLocale_->amPm_, &amPm_); consumeWhiteSpace(); return isComplete(); } bool parseDate() { // Date: YYYY-MM-DD, YYYY/MM/DD if (!consumeInteger(4, &year_)) return false; if (!consumeThisChar('-') && !consumeThisChar('/')) return false; if (!consumeInteger(2, &mon_)) return false; if (!consumeThisChar('-') && !consumeThisChar('/')) return false; if (!consumeInteger(2, &day_)) return false; return isComplete(); } bool isComplete() { return dateItr_ == dateEnd_; } void setDate(const char* date) { reset(); dateItr_ = date; dateEnd_ = date + strlen(date); } bool parse(const std::string& format) { consumeWhiteSpace(); // always consume leading whitespace std::string::const_iterator formatItr, formatEnd = format.end(); for (formatItr = format.begin(); formatItr != formatEnd; ++formatItr) { // Whitespace in format matches 0 or more whitespace in date if (std::isspace(*formatItr)) { consumeWhiteSpace(); continue; } // Any other characters must match exactly. if (*formatItr != '%') { if (!consumeThisChar(*formatItr)) return false; continue; } if (formatItr + 1 == formatEnd) cpp11::stop("Invalid format: trailing %%"); formatItr++; switch (*formatItr) { case 'Y': // year with century if (!consumeInteger(4, &year_)) return false; break; case 'y': // year without century if (!consumeInteger(2, &year_)) return false; year_ += (year_ < 69) ?
2000 : 1900; break; case 'm': // month if (!consumeInteger(2, &mon_, false)) return false; break; case 'b': // abbreviated month name if (!consumeString(pLocale_->monAb_, &mon_)) return false; break; case 'B': // month name if (!consumeString(pLocale_->mon_, &mon_)) return false; break; case 'd': // day if (!consumeInteger(2, &day_, false)) return false; break; case 'a': // abbreviated day of week if (!consumeString(pLocale_->dayAb_, &day_)) return false; break; case 'e': // day with optional leading space if (!consumeIntegerWithSpace(2, &day_)) return false; break; case 'h': // hour, unrestricted if (!consumeHours(&hour_, &sign_)) return false; break; case 'H': // hour, 0-23 if (!consumeInteger(2, &hour_, false)) return false; if (hour_ < 0 || hour_ > 23) { return false; } break; case 'I': // hour if (!consumeInteger(2, &hour_, false)) return false; if (hour_ < 1 || hour_ > 12) { return false; } hour_ %= 12; break; case 'M': // minute if (!consumeInteger(2, &min_)) return false; break; case 'S': // seconds (integer) if (!consumeSeconds(&sec_, NULL)) return false; break; case 'O': // seconds (double) if (formatItr + 1 == formatEnd || *(formatItr + 1) != 'S') cpp11::stop("Invalid format: %%O must be followed by %%S"); formatItr++; if (!consumeSeconds(&sec_, &psec_)) return false; break; case 'p': // AM/PM if (!consumeString(pLocale_->amPm_, &amPm_)) return false; break; case 'z': // time zone specification tz_ = "UTC"; if (!consumeTzOffset(&tzOffsetHours_, &tzOffsetMinutes_)) return false; break; case 'Z': // time zone name if (!consumeTzName(&tz_)) return false; break; // Extensions case '.': if (!consumeNonDigit()) return false; break; case '+': if (!consumeNonDigits()) return false; break; case '*': consumeNonDigits(); break; case 'A': // auto date / time if (formatItr + 1 == formatEnd) cpp11::stop("Invalid format: %%A must be followed by another letter"); formatItr++; switch (*formatItr) { case 'D': if (!parseDate()) return false; break; case 'T': if (!parseTime()) return false; break; default: cpp11::stop("Invalid %%A auto parser"); } break; // Compound formats case 'D': parse("%m/%d/%y"); break; case 'F': parse("%Y-%m-%d"); break; case 'R': parse("%H:%M"); break; case 'X': case 'T': parse("%H:%M:%S"); break; case 'x': parse("%y/%m/%d"); break; default: cpp11::stop("Unsupported format %%%s", std::string(1, *formatItr).c_str()); } } consumeWhiteSpace(); // always consume trailing whitespace return isComplete(); } DateTime makeDateTime() { DateTime dt(year_, mon_, day_, hour(), min_, sec_, psec_, tz_); if (tz_ == "UTC") dt.setOffset(-tzOffsetHours_ * 3600 - tzOffsetMinutes_ * 60); return dt; } DateTime makeDate() { DateTime dt(year_, mon_, day_, 0, 0, 0, 0, "UTC"); return dt; } DateTime makeTime() { DateTime dt( 0, 1, 1, sign_ * hour(), sign_ * min_, sign_ * sec_, sign_ * psec_, "UTC"); return dt; } bool compactDate() { return compactDate_; } int year() { return year_; } private: int hour() { if (hour_ == 12) { // 12 AM if (amPm_ == 1) { return hour_ - 12; } // 12 PM return hour_; } // Rest of PM if (amPm_ == 2) { return hour_ + 12; } // 24 hour time return hour_; } inline bool consumeHours(int* pHour, int* pSign) { if (dateItr_ == dateEnd_) return false; int sign = 1; if (*dateItr_ == '-') { sign = -1; ++dateItr_; } else if (*dateItr_ == '+') { ++dateItr_; } if (!consumeInteger(10, pHour, false)) return false; *pSign = sign; return true; } inline bool consumeSeconds(int* pSec, double* pPartialSec) { double sec; if (!consumeDouble(&sec)) return false; *pSec = (int)sec; if (pPartialSec != NULL) *pPartialSec = sec
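/* Worked example of the %I/%p handling above (illustrative): "07" read via
   %I stores hour_ = 7 and a later %p match of "PM" sets amPm_ = 2, so
   hour() yields 19; "12 AM" yields 0 and "12 PM" yields 12. Without %p
   (amPm_ == -1) the stored value is taken as 24-hour time. */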
- *pSec; return true; } inline bool consumeString(const std::vector<std::string>& haystack, int* pOut) { // Assumes `pOut` is 1-indexed // haystack is always in UTF-8 std::string needleUTF8 = pLocale_->encoder_.makeString(dateItr_, dateEnd_); for (size_t i = 0; i < haystack.size(); ++i) { if (istarts_with(needleUTF8, haystack[i])) { *pOut = i + 1; dateItr_ += haystack[i].size(); return true; } } return false; } inline bool consumeInteger(int n, int* pOut, bool exact = true) { if (dateItr_ == dateEnd_ || *dateItr_ == '-' || *dateItr_ == '+') return false; const char* start = dateItr_; const char* end = std::min(dateItr_ + n, dateEnd_); bool ok = parseInt(dateItr_, end, *pOut); return ok && (!exact || (dateItr_ - start) == n); } // Integer indexed from 1 with optional space inline bool consumeIntegerWithSpace(int n, int* pOut) { if (consumeThisChar(' ')) n--; return consumeInteger(n, pOut); } inline bool consumeDouble(double* pOut) { if (dateItr_ == dateEnd_ || *dateItr_ == '-' || *dateItr_ == '+') return false; const char* end = dateEnd_; bool ok = parseDouble(pLocale_->decimalMark_, dateItr_, end, *pOut); dateItr_ = end; return ok; } inline bool consumeWhiteSpace() { while (dateItr_ != dateEnd_ && std::isspace(*dateItr_)) dateItr_++; return true; } inline bool consumeNonDigit() { if (dateItr_ == dateEnd_ || std::isdigit(*dateItr_)) return false; dateItr_++; return true; } inline bool consumeNonDigits() { if (!consumeNonDigit()) return false; while (dateItr_ != dateEnd_ && !std::isdigit(*dateItr_)) dateItr_++; return true; } inline bool consumeChar(char* pOut) { if (dateItr_ == dateEnd_) return false; *pOut = *dateItr_++; return true; } inline bool consumeThisChar(char needed) { if (dateItr_ == dateEnd_ || *dateItr_ != needed) return false; dateItr_++; return true; } inline bool consumeAMPM(bool* pIsPM) { if (dateItr_ == dateEnd_) return false; if (consumeThisChar('A') || consumeThisChar('a')) { *pIsPM = false; } else if (consumeThisChar('P') || consumeThisChar('p')) { *pIsPM = true; } else { return false; } if (!(consumeThisChar('M') || consumeThisChar('m'))) return false; return true; } // ISO8601 style // Z // ±hh:mm // ±hhmm // ±hh inline bool consumeTzOffset(int* pHours, int* pMinutes) { if (consumeThisChar('Z')) return true; // Optional +/- (required for ISO8601 but we'll let it slide) int mult = 1; if (*dateItr_ == '+' || *dateItr_ == '-') { mult = (*dateItr_ == '-') ?
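/* Worked example for consumeTzOffset() (illustrative): "+05:30" parses to
   tzOffsetHours_ == 5 and tzOffsetMinutes_ == 30 ("+0530" and "+05" are
   accepted as well), and makeDateTime() then applies setOffset(-19800) to
   shift the local reading back to UTC; a bare "Z" leaves both fields 0. */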
-1 : 1; dateItr_++; } // Required hours if (!consumeInteger(2, pHours)) return false; // Optional colon and minutes consumeThisChar(':'); consumeInteger(2, pMinutes); *pHours *= mult; *pMinutes *= mult; return true; } inline bool consumeTzName(std::string* pOut) { const char* tzStart = dateItr_; while (dateItr_ != dateEnd_ && !std::isspace(*dateItr_)) dateItr_++; pOut->assign(tzStart, dateItr_); return tzStart != dateItr_; } void reset() { sign_ = 1; year_ = -1; mon_ = 1; day_ = 1; hour_ = 0; min_ = 0; sec_ = 0; psec_ = 0; amPm_ = -1; compactDate_ = true; tzOffsetHours_ = 0; tzOffsetMinutes_ = 0; tz_ = tzDefault_; } }; #endif readr/src/TokenizerLine.h0000644000176200001440000000350114174357220015075 0ustar liggesusers#ifndef FASTREAD_TOKENIZERLINE_H_ #define FASTREAD_TOKENIZERLINE_H_ #include "Token.h" #include "Tokenizer.h" #include "utils.h" class TokenizerLine : public Tokenizer { SourceIterator begin_, cur_, end_; std::vector<std::string> NA_; bool moreTokens_; bool skipEmptyRows_; int line_; public: TokenizerLine(std::vector<std::string> NA, bool skipEmptyRows) : NA_(NA), moreTokens_(false), skipEmptyRows_(skipEmptyRows) {} TokenizerLine() : moreTokens_(false), skipEmptyRows_(false) {} void tokenize(SourceIterator begin, SourceIterator end) { begin_ = begin; cur_ = begin; end_ = end; line_ = 0; moreTokens_ = true; } std::pair<double, size_t> progress() { size_t bytes = cur_ - begin_; return std::make_pair(bytes / (double)(end_ - begin_), bytes); } Token nextToken() { SourceIterator token_begin = cur_; bool hasNull = false; if (!moreTokens_) return Token(TOKEN_EOF, line_, 0); while (cur_ < end_) { Advance advance(&cur_); if (*cur_ == '\0') hasNull = true; if ((end_ - cur_) % 131072 == 0) { cpp11::check_user_interrupt(); } switch (*cur_) { case '\r': case '\n': { if (skipEmptyRows_ && token_begin == cur_) { ++cur_; advanceForLF(&cur_, end_); token_begin = cur_; continue; } Token t = Token(token_begin, advanceForLF(&cur_, end_), line_++, 0, hasNull); t.flagNA(NA_); return t; } default: break; } } // Reached end of Source: cur_ == end_ moreTokens_ = false; if (token_begin == end_) { return Token(TOKEN_EOF, line_++, 0); } else { Token t = Token(token_begin, end_, line_++, 0, hasNull); t.flagNA(NA_); return t; } } }; #endif readr/src/QiParsers.h0000644000176200001440000002171314174704674014232 0ustar liggesusers#ifndef FASTREAD_QI_PARSERS #define FASTREAD_QI_PARSERS #include "Rinternals.h" #include <algorithm> #include <cctype> #include <cerrno> #include <climits> #include <cmath> #include <cstdlib> /* An STL iterator-based string to floating point number conversion. This function was adapted from the C standard library of RetroBSD, which is based on Berkeley UNIX. This function and this function only is BSD license. https://retrobsd.googlecode.com/svn/stable/src/libc/stdlib/strtod.c */ inline double bsd_strtod(const char* begin, const char** endptr, const char decimal_mark) { if (begin == *endptr) { return NA_REAL; } if (*begin == 'n' || *begin == '?') { *endptr = begin; return NA_REAL; } int sign = 0, expSign = 0, i; double fraction, dblExp; const char* p; char c; /* Exponent read from "EX" field. */ int exp = 0; /* Exponent that derives from the fractional part. Under normal * circumstances, it is the negative of the number of digits in F. * However, if I is very long, the last digits of I get dropped * (otherwise a long I with a large negative exponent could cause an * unnecessary overflow on I alone). In this case, fracExp is * incremented one for each dropped digit. */ int fracExp = 0; /* Number of digits in mantissa.
*/ int mantSize; /* Number of mantissa digits BEFORE decimal point. */ int decPt; /* Temporarily holds location of exponent in str. */ const char* pExp; /* Largest possible base 10 exponent. * Any exponent larger than this will already * produce underflow or overflow, so there's * no need to worry about additional digits. */ static int maxExponent = 307; /* Table giving binary powers of 10. * Entry is 10^2^i. Used to convert decimal * exponents into floating-point numbers. */ static double powersOf10[] = { 1e1, 1e2, 1e4, 1e8, 1e16, 1e32, 1e64, 1e128, 1e256, }; #if 0 static double powersOf2[] = { 2, 4, 16, 256, 65536, 4.294967296e9, 1.8446744073709551616e19, //3.4028236692093846346e38, 1.1579208923731619542e77, 1.3407807929942597099e154, }; static double powersOf8[] = { 8, 64, 4096, 2.81474976710656e14, 7.9228162514264337593e28, //6.2771017353866807638e57, 3.9402006196394479212e115, 1.5525180923007089351e231, }; static double powersOf16[] = { 16, 256, 65536, 1.8446744073709551616e19, //3.4028236692093846346e38, 1.1579208923731619542e77, 1.3407807929942597099e154, }; #endif /* * Strip off leading blanks and check for a sign. */ p = begin; while (p != *endptr && (*p == ' ' || *p == '\t')) ++p; if (p != *endptr && *p == '-') { sign = 1; ++p; } else if (p != *endptr && *p == '+') ++p; /* If we don't have a digit or decimal point something is wrong, so return an * NA */ if (!(isdigit(*p) || *p == decimal_mark)) { *endptr = p; return NA_REAL; } /* * Count the number of digits in the mantissa (including the decimal * point), and also locate the decimal point. */ decPt = -1; for (mantSize = 0; p != *endptr; ++mantSize) { c = *p; if (!isdigit(c)) { if (c != decimal_mark || decPt >= 0) break; decPt = mantSize; } ++p; } /* * Now suck up the digits in the mantissa. Use two integers to * collect 9 digits each (this is faster than using floating-point). * If the mantissa has more than 18 digits, ignore the extras, since * they can't affect the value anyway. */ pExp = p; p -= mantSize; if (decPt < 0) decPt = mantSize; else --mantSize; /* One of the digits was the point. */ if (mantSize > 2 * 9) mantSize = 2 * 9; fracExp = decPt - mantSize; if (mantSize == 0) { fraction = 0.0; p = begin; goto done; } else { int frac1, frac2; for (frac1 = 0; mantSize > 9 && p != *endptr; --mantSize) { c = *p++; if (c == decimal_mark) c = *p++; frac1 = frac1 * 10 + (c - '0'); } for (frac2 = 0; mantSize > 0 && p != *endptr; --mantSize) { c = *p++; if (c == decimal_mark) c = *p++; frac2 = frac2 * 10 + (c - '0'); } fraction = (double)1000000000 * frac1 + frac2; } /* * Skim off the exponent. */ p = pExp; if (p != *endptr && (*p == 'E' || *p == 'e' || *p == 'S' || *p == 's' || *p == 'F' || *p == 'f' || *p == 'D' || *p == 'd' || *p == 'L' || *p == 'l')) { ++p; if (p != *endptr && *p == '-') { expSign = 1; ++p; } else if (p != *endptr && *p == '+') ++p; else if (!isdigit(*p)) { --p; goto done; } while (p != *endptr && isdigit(*p)) exp = exp * 10 + (*p++ - '0'); } if (expSign) exp = fracExp - exp; else exp = fracExp + exp; /* * Generate a floating-point number that represents the exponent. * Do this by processing the exponent one bit at a time to combine * many powers of 2 of 10. Then combine the exponent with the * fraction. */ if (exp < 0) { expSign = 1; exp = -exp; } else expSign = 0; if (exp > maxExponent) exp = maxExponent; dblExp = 1.0; for (i = 0; exp; exp >>= 1, ++i) if (exp & 01) dblExp *= powersOf10[i]; if (expSign) fraction /= dblExp; else fraction *= dblExp; done: if (p != *endptr) { *endptr = p; } return sign ? 
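/* Worked example of the accumulation above (illustrative): for "3.14159",
   mantSize == 6 once the decimal point is discounted and decPt == 1, so
   fracExp == -5; the digits collapse to fraction == 314159, the bit-by-bit
   exponent loop assembles dblExp == 1e5 from powersOf10, and
   fraction / dblExp recovers 3.14159. Inputs longer than 18 mantissa
   digits drop the extras and compensate through fracExp. */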
-fraction : fraction; } template <typename Iterator, typename Attr> inline bool parseDouble( const char decimalMark, Iterator& first, Iterator& last, Attr& res) { res = bsd_strtod(first, &last, decimalMark); return !ISNA(res); } enum NumberState { STATE_INIT, STATE_LHS, STATE_RHS, STATE_EXP, STATE_FIN }; // First and last are updated to point to first/last successfully parsed // character template <typename Iterator, typename Attr> inline bool parseNumber( char decimalMark, char groupingMark, Iterator& first, Iterator& last, Attr& res) { Iterator cur = first; // Advance to first non-character for (; cur != last; ++cur) { if (*cur == '-' || *cur == decimalMark || (*cur >= '0' && *cur <= '9')) break; } if (cur == last) { return false; } else { // Move first to start of number first = cur; } double sum = 0, denom = 1, exponent = 0; NumberState state = STATE_INIT; bool seenNumber = false, exp_init = true; double sign = 1.0, exp_sign = 1.0; for (; cur != last; ++cur) { if (state == STATE_FIN) break; switch (state) { case STATE_INIT: if (*cur == '-') { state = STATE_LHS; sign = -1.0; } else if (*cur == decimalMark) { state = STATE_RHS; } else if (*cur >= '0' && *cur <= '9') { seenNumber = true; state = STATE_LHS; sum = *cur - '0'; } else { goto end; } break; case STATE_LHS: if (*cur == groupingMark) { // do nothing } else if (*cur == decimalMark) { state = STATE_RHS; } else if (seenNumber && (*cur == 'e' || *cur == 'E')) { state = STATE_EXP; } else if (*cur >= '0' && *cur <= '9') { seenNumber = true; sum *= 10; sum += *cur - '0'; } else { goto end; } break; case STATE_RHS: if (*cur == groupingMark) { // do nothing } else if (seenNumber && (*cur == 'e' || *cur == 'E')) { state = STATE_EXP; } else if (*cur >= '0' && *cur <= '9') { seenNumber = true; denom *= 10; sum += (*cur - '0') / denom; } else { goto end; } break; case STATE_EXP: // negative/positive sign only allowed immediately after 'e' or 'E' if (*cur == '-' && exp_init) { exp_sign = -1.0; exp_init = false; } else if (*cur == '+' && exp_init) { // sign defaults to positive exp_init = false; } else if (*cur >= '0' && *cur <= '9') { exponent *= 10.0; exponent += *cur - '0'; exp_init = false; } else { goto end; } break; case STATE_FIN: goto end; } } end: // Set last to point to final character used last = cur; res = sign * sum; // If the number was in scientific notation, multiply by 10^exponent if (exponent) { res *= pow(10.0, exp_sign * exponent); } return seenNumber; } template <typename Iterator, typename Attr> inline bool parseInt(Iterator& first, Iterator& last, Attr& res) { char buf[64]; size_t expected_size = last - first; if (expected_size > sizeof(buf) - 1) { res = NA_INTEGER; return false; } std::copy(first, last, buf); buf[expected_size] = '\0'; long lres; char* endp; errno = 0; lres = strtol(buf, &endp, 10); size_t parsed_size = endp - buf; /* next can happen on a 64-bit platform */ if (lres > INT_MAX || lres < INT_MIN) lres = NA_INTEGER; if (errno == ERANGE) lres = NA_INTEGER; res = static_cast<Attr>(lres); first += parsed_size; return res != NA_INTEGER; } #endif readr/src/cpp11.cpp0000644000176200001440000004213514547554550013605 0ustar liggesusers// Generated by cpp11: do not edit by hand // clang-format off #include "cpp11/declarations.hpp" #include // CollectorGuess.cpp std::string collectorGuess(const cpp11::strings& input, const cpp11::list& locale_, bool guessInteger); extern "C" SEXP _readr_collectorGuess(SEXP input, SEXP locale_, SEXP guessInteger) { BEGIN_CPP11 return cpp11::as_sexp(collectorGuess(cpp11::as_cpp>(input), cpp11::as_cpp>(locale_), cpp11::as_cpp>(guessInteger))); END_CPP11 } // connection.cpp std::string
read_connection_(const cpp11::sexp& con, std::string filename, int chunk_size); extern "C" SEXP _readr_read_connection_(SEXP con, SEXP filename, SEXP chunk_size) { BEGIN_CPP11 return cpp11::as_sexp(read_connection_(cpp11::as_cpp>(con), cpp11::as_cpp>(filename), cpp11::as_cpp>(chunk_size))); END_CPP11 } // datetime.cpp cpp11::writable::doubles utctime_(const cpp11::integers& year, const cpp11::integers& month, const cpp11::integers& day, const cpp11::integers& hour, const cpp11::integers& min, const cpp11::integers& sec, const cpp11::doubles& psec); extern "C" SEXP _readr_utctime_(SEXP year, SEXP month, SEXP day, SEXP hour, SEXP min, SEXP sec, SEXP psec) { BEGIN_CPP11 return cpp11::as_sexp(utctime_(cpp11::as_cpp>(year), cpp11::as_cpp>(month), cpp11::as_cpp>(day), cpp11::as_cpp>(hour), cpp11::as_cpp>(min), cpp11::as_cpp>(sec), cpp11::as_cpp>(psec))); END_CPP11 } // parse.cpp cpp11::integers dim_tokens_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec); extern "C" SEXP _readr_dim_tokens_(SEXP sourceSpec, SEXP tokenizerSpec) { BEGIN_CPP11 return cpp11::as_sexp(dim_tokens_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(tokenizerSpec))); END_CPP11 } // parse.cpp std::vector count_fields_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, int n_max); extern "C" SEXP _readr_count_fields_(SEXP sourceSpec, SEXP tokenizerSpec, SEXP n_max) { BEGIN_CPP11 return cpp11::as_sexp(count_fields_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(tokenizerSpec), cpp11::as_cpp>(n_max))); END_CPP11 } // parse.cpp cpp11::list guess_header_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& locale_); extern "C" SEXP _readr_guess_header_(SEXP sourceSpec, SEXP tokenizerSpec, SEXP locale_) { BEGIN_CPP11 return cpp11::as_sexp(guess_header_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(tokenizerSpec), cpp11::as_cpp>(locale_))); END_CPP11 } // parse.cpp SEXP tokenize_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, int n_max); extern "C" SEXP _readr_tokenize_(SEXP sourceSpec, SEXP tokenizerSpec, SEXP n_max) { BEGIN_CPP11 return cpp11::as_sexp(tokenize_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(tokenizerSpec), cpp11::as_cpp>(n_max))); END_CPP11 } // parse.cpp SEXP parse_vector_(const cpp11::strings& x, const cpp11::list& collectorSpec, const cpp11::list& locale_, const std::vector& na, bool trim_ws); extern "C" SEXP _readr_parse_vector_(SEXP x, SEXP collectorSpec, SEXP locale_, SEXP na, SEXP trim_ws) { BEGIN_CPP11 return cpp11::as_sexp(parse_vector_(cpp11::as_cpp>(x), cpp11::as_cpp>(collectorSpec), cpp11::as_cpp>(locale_), cpp11::as_cpp&>>(na), cpp11::as_cpp>(trim_ws))); END_CPP11 } // read.cpp cpp11::strings read_file_(const cpp11::list& sourceSpec, const cpp11::list& locale_); extern "C" SEXP _readr_read_file_(SEXP sourceSpec, SEXP locale_) { BEGIN_CPP11 return cpp11::as_sexp(read_file_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(locale_))); END_CPP11 } // read.cpp cpp11::raws read_file_raw_(const cpp11::list& sourceSpec); extern "C" SEXP _readr_read_file_raw_(SEXP sourceSpec) { BEGIN_CPP11 return cpp11::as_sexp(read_file_raw_(cpp11::as_cpp>(sourceSpec))); END_CPP11 } // read.cpp cpp11::writable::strings read_lines_(const cpp11::list& sourceSpec, const cpp11::list& locale_, std::vector na, int n_max, bool skip_empty_rows, bool progress); extern "C" SEXP _readr_read_lines_(SEXP sourceSpec, SEXP locale_, SEXP na, SEXP n_max, SEXP skip_empty_rows, SEXP progress) { BEGIN_CPP11 return cpp11::as_sexp(read_lines_(cpp11::as_cpp>(sourceSpec), 
cpp11::as_cpp>(locale_), cpp11::as_cpp>>(na), cpp11::as_cpp>(n_max), cpp11::as_cpp>(skip_empty_rows), cpp11::as_cpp>(progress))); END_CPP11 } // read.cpp void read_lines_chunked_(const cpp11::list& sourceSpec, const cpp11::list& locale_, std::vector na, int chunkSize, const cpp11::environment& callback, bool skip_empty_rows, bool progress); extern "C" SEXP _readr_read_lines_chunked_(SEXP sourceSpec, SEXP locale_, SEXP na, SEXP chunkSize, SEXP callback, SEXP skip_empty_rows, SEXP progress) { BEGIN_CPP11 read_lines_chunked_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(locale_), cpp11::as_cpp>>(na), cpp11::as_cpp>(chunkSize), cpp11::as_cpp>(callback), cpp11::as_cpp>(skip_empty_rows), cpp11::as_cpp>(progress)); return R_NilValue; END_CPP11 } // read.cpp cpp11::list read_lines_raw_(const cpp11::list& sourceSpec, int n_max, bool progress); extern "C" SEXP _readr_read_lines_raw_(SEXP sourceSpec, SEXP n_max, SEXP progress) { BEGIN_CPP11 return cpp11::as_sexp(read_lines_raw_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(n_max), cpp11::as_cpp>(progress))); END_CPP11 } // read.cpp void read_lines_raw_chunked_(const cpp11::list& sourceSpec, int chunkSize, const cpp11::environment& callback, bool progress); extern "C" SEXP _readr_read_lines_raw_chunked_(SEXP sourceSpec, SEXP chunkSize, SEXP callback, SEXP progress) { BEGIN_CPP11 read_lines_raw_chunked_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(chunkSize), cpp11::as_cpp>(callback), cpp11::as_cpp>(progress)); return R_NilValue; END_CPP11 } // read.cpp cpp11::sexp read_tokens_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::strings& colNames, const cpp11::list& locale_, int n_max, bool progress); extern "C" SEXP _readr_read_tokens_(SEXP sourceSpec, SEXP tokenizerSpec, SEXP colSpecs, SEXP colNames, SEXP locale_, SEXP n_max, SEXP progress) { BEGIN_CPP11 return cpp11::as_sexp(read_tokens_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(tokenizerSpec), cpp11::as_cpp>(colSpecs), cpp11::as_cpp>(colNames), cpp11::as_cpp>(locale_), cpp11::as_cpp>(n_max), cpp11::as_cpp>(progress))); END_CPP11 } // read.cpp void read_tokens_chunked_(const cpp11::list& sourceSpec, const cpp11::environment& callback, int chunkSize, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::strings& colNames, const cpp11::list& locale_, const cpp11::sexp& spec, bool progress); extern "C" SEXP _readr_read_tokens_chunked_(SEXP sourceSpec, SEXP callback, SEXP chunkSize, SEXP tokenizerSpec, SEXP colSpecs, SEXP colNames, SEXP locale_, SEXP spec, SEXP progress) { BEGIN_CPP11 read_tokens_chunked_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(callback), cpp11::as_cpp>(chunkSize), cpp11::as_cpp>(tokenizerSpec), cpp11::as_cpp>(colSpecs), cpp11::as_cpp>(colNames), cpp11::as_cpp>(locale_), cpp11::as_cpp>(spec), cpp11::as_cpp>(progress)); return R_NilValue; END_CPP11 } // read.cpp cpp11::sexp melt_tokens_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& colSpecs, const cpp11::list& locale_, int n_max, bool progress); extern "C" SEXP _readr_melt_tokens_(SEXP sourceSpec, SEXP tokenizerSpec, SEXP colSpecs, SEXP locale_, SEXP n_max, SEXP progress) { BEGIN_CPP11 return cpp11::as_sexp(melt_tokens_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(tokenizerSpec), cpp11::as_cpp>(colSpecs), cpp11::as_cpp>(locale_), cpp11::as_cpp>(n_max), cpp11::as_cpp>(progress))); END_CPP11 } // read.cpp void melt_tokens_chunked_(const cpp11::list& sourceSpec, const cpp11::environment& callback, int chunkSize, const cpp11::list& 
tokenizerSpec, const cpp11::list& colSpecs, const cpp11::list& locale_, bool progress); extern "C" SEXP _readr_melt_tokens_chunked_(SEXP sourceSpec, SEXP callback, SEXP chunkSize, SEXP tokenizerSpec, SEXP colSpecs, SEXP locale_, SEXP progress) { BEGIN_CPP11 melt_tokens_chunked_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(callback), cpp11::as_cpp>(chunkSize), cpp11::as_cpp>(tokenizerSpec), cpp11::as_cpp>(colSpecs), cpp11::as_cpp>(locale_), cpp11::as_cpp>(progress)); return R_NilValue; END_CPP11 } // read.cpp std::vector guess_types_(const cpp11::list& sourceSpec, const cpp11::list& tokenizerSpec, const cpp11::list& locale_, int n); extern "C" SEXP _readr_guess_types_(SEXP sourceSpec, SEXP tokenizerSpec, SEXP locale_, SEXP n) { BEGIN_CPP11 return cpp11::as_sexp(guess_types_(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(tokenizerSpec), cpp11::as_cpp>(locale_), cpp11::as_cpp>(n))); END_CPP11 } // TokenizerFwf.cpp cpp11::list whitespaceColumns(const cpp11::list& sourceSpec, int n, std::string comment); extern "C" SEXP _readr_whitespaceColumns(SEXP sourceSpec, SEXP n, SEXP comment) { BEGIN_CPP11 return cpp11::as_sexp(whitespaceColumns(cpp11::as_cpp>(sourceSpec), cpp11::as_cpp>(n), cpp11::as_cpp>(comment))); END_CPP11 } // type_convert.cpp cpp11::sexp type_convert_col(const cpp11::strings& x, const cpp11::list& spec, const cpp11::list& locale_, int col, const std::vector& na, bool trim_ws); extern "C" SEXP _readr_type_convert_col(SEXP x, SEXP spec, SEXP locale_, SEXP col, SEXP na, SEXP trim_ws) { BEGIN_CPP11 return cpp11::as_sexp(type_convert_col(cpp11::as_cpp>(x), cpp11::as_cpp>(spec), cpp11::as_cpp>(locale_), cpp11::as_cpp>(col), cpp11::as_cpp&>>(na), cpp11::as_cpp>(trim_ws))); END_CPP11 } // write_delim.cpp void stream_delim_(const cpp11::list& df, const cpp11::sexp& connection, char delim, const std::string& na, bool col_names, bool bom, int quote_escape, const char* eol); extern "C" SEXP _readr_stream_delim_(SEXP df, SEXP connection, SEXP delim, SEXP na, SEXP col_names, SEXP bom, SEXP quote_escape, SEXP eol) { BEGIN_CPP11 stream_delim_(cpp11::as_cpp>(df), cpp11::as_cpp>(connection), cpp11::as_cpp>(delim), cpp11::as_cpp>(na), cpp11::as_cpp>(col_names), cpp11::as_cpp>(bom), cpp11::as_cpp>(quote_escape), cpp11::as_cpp>(eol)); return R_NilValue; END_CPP11 } // write.cpp void write_lines_(const cpp11::strings& lines, const cpp11::sexp& connection, const std::string& na, const std::string& sep); extern "C" SEXP _readr_write_lines_(SEXP lines, SEXP connection, SEXP na, SEXP sep) { BEGIN_CPP11 write_lines_(cpp11::as_cpp>(lines), cpp11::as_cpp>(connection), cpp11::as_cpp>(na), cpp11::as_cpp>(sep)); return R_NilValue; END_CPP11 } // write.cpp void write_lines_raw_(const cpp11::list& x, const cpp11::sexp& connection, const std::string& sep); extern "C" SEXP _readr_write_lines_raw_(SEXP x, SEXP connection, SEXP sep) { BEGIN_CPP11 write_lines_raw_(cpp11::as_cpp>(x), cpp11::as_cpp>(connection), cpp11::as_cpp>(sep)); return R_NilValue; END_CPP11 } // write.cpp void write_file_(const std::string& x, const cpp11::sexp& connection); extern "C" SEXP _readr_write_file_(SEXP x, SEXP connection) { BEGIN_CPP11 write_file_(cpp11::as_cpp>(x), cpp11::as_cpp>(connection)); return R_NilValue; END_CPP11 } // write.cpp void write_file_raw_(const cpp11::raws& x, const cpp11::sexp& connection); extern "C" SEXP _readr_write_file_raw_(SEXP x, SEXP connection) { BEGIN_CPP11 write_file_raw_(cpp11::as_cpp>(x), cpp11::as_cpp>(connection)); return R_NilValue; END_CPP11 } extern "C" { static const R_CallMethodDef CallEntries[] = { 
{"_readr_collectorGuess", (DL_FUNC) &_readr_collectorGuess, 3}, {"_readr_count_fields_", (DL_FUNC) &_readr_count_fields_, 3}, {"_readr_dim_tokens_", (DL_FUNC) &_readr_dim_tokens_, 2}, {"_readr_guess_header_", (DL_FUNC) &_readr_guess_header_, 3}, {"_readr_guess_types_", (DL_FUNC) &_readr_guess_types_, 4}, {"_readr_melt_tokens_", (DL_FUNC) &_readr_melt_tokens_, 6}, {"_readr_melt_tokens_chunked_", (DL_FUNC) &_readr_melt_tokens_chunked_, 7}, {"_readr_parse_vector_", (DL_FUNC) &_readr_parse_vector_, 5}, {"_readr_read_connection_", (DL_FUNC) &_readr_read_connection_, 3}, {"_readr_read_file_", (DL_FUNC) &_readr_read_file_, 2}, {"_readr_read_file_raw_", (DL_FUNC) &_readr_read_file_raw_, 1}, {"_readr_read_lines_", (DL_FUNC) &_readr_read_lines_, 6}, {"_readr_read_lines_chunked_", (DL_FUNC) &_readr_read_lines_chunked_, 7}, {"_readr_read_lines_raw_", (DL_FUNC) &_readr_read_lines_raw_, 3}, {"_readr_read_lines_raw_chunked_", (DL_FUNC) &_readr_read_lines_raw_chunked_, 4}, {"_readr_read_tokens_", (DL_FUNC) &_readr_read_tokens_, 7}, {"_readr_read_tokens_chunked_", (DL_FUNC) &_readr_read_tokens_chunked_, 9}, {"_readr_stream_delim_", (DL_FUNC) &_readr_stream_delim_, 8}, {"_readr_tokenize_", (DL_FUNC) &_readr_tokenize_, 3}, {"_readr_type_convert_col", (DL_FUNC) &_readr_type_convert_col, 6}, {"_readr_utctime_", (DL_FUNC) &_readr_utctime_, 7}, {"_readr_whitespaceColumns", (DL_FUNC) &_readr_whitespaceColumns, 3}, {"_readr_write_file_", (DL_FUNC) &_readr_write_file_, 2}, {"_readr_write_file_raw_", (DL_FUNC) &_readr_write_file_raw_, 2}, {"_readr_write_lines_", (DL_FUNC) &_readr_write_lines_, 4}, {"_readr_write_lines_raw_", (DL_FUNC) &_readr_write_lines_raw_, 3}, {NULL, NULL, 0} }; } extern "C" attribute_visible void R_init_readr(DllInfo* dll){ R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); R_useDynamicSymbols(dll, FALSE); R_forceSymbols(dll, TRUE); } readr/src/tzfile.h0000644000176200001440000001303414152512262013605 0ustar liggesusers#ifndef TZFILE_H #define TZFILE_H /* ** This file is in the public domain, so clarified as of ** 1996-06-05 by Arthur David Olson. */ /* ** This header is for use ONLY with the time conversion code. ** There is no guarantee that it will remain unchanged, ** or that it will remain at all. ** Do NOT copy it to any system include directory. ** Thank you! */ /* ** Information about time zone files. */ #ifndef TZDIR #define TZDIR "/usr/local/etc/zoneinfo" /* Time zone object file directory */ #endif /* !defined TZDIR */ #ifndef TZDEFAULT #define TZDEFAULT "UTC" // needs to be a valid timezone, PR#16503 #endif /* !defined TZDEFAULT */ /* We don't ship posixrules, which is usually a link to a USA timezeone. So choose one instead. */ #ifndef TZDEFRULES #define TZDEFRULES "America/New_York" #endif /* !defined TZDEFRULES */ /* ** Each file begins with. . . */ #define TZ_MAGIC "TZif" struct tzhead { char tzh_magic[4]; /* TZ_MAGIC */ char tzh_version[1]; /* '\0' or '2' or '3' as of 2013 */ char tzh_reserved[15]; /* reserved--must be zero */ char tzh_ttisgmtcnt[4]; /* coded number of trans. time flags */ char tzh_ttisstdcnt[4]; /* coded number of trans. time flags */ char tzh_leapcnt[4]; /* coded number of leap seconds */ char tzh_timecnt[4]; /* coded number of transition times */ char tzh_typecnt[4]; /* coded number of local time types */ char tzh_charcnt[4]; /* coded number of abbr. chars */ }; /* ** . . .followed by. . . 
** ** tzh_timecnt (char [4])s coded transition times a la time(2) ** tzh_timecnt (unsigned char)s types of local time starting at above ** tzh_typecnt repetitions of ** one (char [4]) coded UT offset in seconds ** one (unsigned char) used to set tm_isdst ** one (unsigned char) that's an abbreviation list index ** tzh_charcnt (char)s '\0'-terminated zone abbreviations ** tzh_leapcnt repetitions of ** one (char [4]) coded leap second transition times ** one (char [4]) total correction after above ** tzh_ttisstdcnt (char)s indexed by type; if TRUE, transition ** time is standard time, if FALSE, ** transition time is wall clock time ** if absent, transition times are ** assumed to be wall clock time ** tzh_ttisgmtcnt (char)s indexed by type; if TRUE, transition ** time is UT, if FALSE, ** transition time is local time ** if absent, transition times are ** assumed to be local time */ /* ** If tzh_version is '2' or greater, the above is followed by a second instance ** of tzhead and a second instance of the data in which each coded transition ** time uses 8 rather than 4 chars, ** then a POSIX-TZ-environment-variable-style string for use in handling ** instants after the last transition time stored in the file ** (with nothing between the newlines if there is no POSIX representation for ** such instants). ** ** If tz_version is '3' or greater, the above is extended as follows. ** First, the POSIX TZ string's hour offset may range from -167 ** through 167 as compared to the POSIX-required 0 through 24. ** Second, its DST start time may be January 1 at 00:00 and its stop ** time December 31 at 24:00 plus the difference between DST and ** standard time, indicating DST all year. */ /* ** In the current implementation, "tzset()" refuses to deal with files that ** exceed any of the limits below. */ #ifndef TZ_MAX_TIMES #define TZ_MAX_TIMES 1200 #endif /* !defined TZ_MAX_TIMES */ #ifndef TZ_MAX_TYPES #ifndef NOSOLAR #define TZ_MAX_TYPES 256 /* Limited by what (unsigned char)'s can hold */ #endif /* !defined NOSOLAR */ #ifdef NOSOLAR /* ** Must be at least 14 for Europe/Riga as of Jan 12 1995, ** as noted by Earl Chew. 
*/ #define TZ_MAX_TYPES 20 /* Maximum number of local time types */ #endif /* !defined NOSOLAR */ #endif /* !defined TZ_MAX_TYPES */ // increased from 50, http://mm.icann.org/pipermail/tz/2015-August/022623.html #ifndef TZ_MAX_CHARS #define TZ_MAX_CHARS 100 /* Maximum number of abbreviation characters */ /* (limited by what unsigned chars can hold) */ #endif /* !defined TZ_MAX_CHARS */ #ifndef TZ_MAX_LEAPS #define TZ_MAX_LEAPS 50 /* Maximum number of leap second corrections */ #endif /* !defined TZ_MAX_LEAPS */ #define SECSPERMIN 60 #define MINSPERHOUR 60 #define HOURSPERDAY 24 #define DAYSPERWEEK 7 #define DAYSPERNYEAR 365 #define DAYSPERLYEAR 366 #define SECSPERHOUR (SECSPERMIN * MINSPERHOUR) #define SECSPERDAY ((int_fast32_t)SECSPERHOUR * HOURSPERDAY) #define MONSPERYEAR 12 #define TM_SUNDAY 0 #define TM_MONDAY 1 #define TM_TUESDAY 2 #define TM_WEDNESDAY 3 #define TM_THURSDAY 4 #define TM_FRIDAY 5 #define TM_SATURDAY 6 #define TM_JANUARY 0 #define TM_FEBRUARY 1 #define TM_MARCH 2 #define TM_APRIL 3 #define TM_MAY 4 #define TM_JUNE 5 #define TM_JULY 6 #define TM_AUGUST 7 #define TM_SEPTEMBER 8 #define TM_OCTOBER 9 #define TM_NOVEMBER 10 #define TM_DECEMBER 11 #define TM_YEAR_BASE 1900 #define EPOCH_YEAR 1970 #define EPOCH_WDAY TM_THURSDAY #define isleap(y) (((y) % 4) == 0 && (((y) % 100) != 0 || ((y) % 400) == 0)) /* ** Since everything in isleap is modulo 400 (or a factor of 400), we know that ** isleap(y) == isleap(y % 400) ** and so ** isleap(a + b) == isleap((a + b) % 400) ** or ** isleap(a + b) == isleap(a % 400 + b % 400) ** This is true even if % means modulo rather than Fortran remainder ** (which is allowed by C89 but not C99). ** We use this to avoid addition overflow problems. */ #define isleap_sum(a, b) isleap((a) % 400 + (b) % 400) #endif /* !defined TZFILE_H */ readr/src/Token.h0000644000176200001440000000605714174704674013415 0ustar liggesusers#ifndef FASTREAD_TOKEN_H_ #define FASTREAD_TOKEN_H_ #include "cpp11/raws.hpp" #include "Iconv.h" #include "Source.h" #include "Tokenizer.h" #include enum TokenType { TOKEN_STRING, // a sequence of characters TOKEN_MISSING, // an missing value TOKEN_EMPTY, // an empty value TOKEN_EOF // end of file }; class Token { TokenType type_; SourceIterator begin_, end_; size_t row_, col_; bool hasNull_; Tokenizer* pTokenizer_; public: Token() : type_(TOKEN_EMPTY), begin_(0), end_(0), row_(0), col_(0), hasNull_(false), pTokenizer_(nullptr) {} Token(TokenType type, int row, int col) : type_(type), begin_(0), end_(0), row_(row), col_(col), hasNull_(false), pTokenizer_(nullptr) {} Token( SourceIterator begin, SourceIterator end, int row, int col, bool hasNull, Tokenizer* pTokenizer = NULL) : type_(TOKEN_STRING), begin_(begin), end_(end), row_(row), col_(col), hasNull_(hasNull), pTokenizer_(pTokenizer) { if (begin_ == end_) type_ = TOKEN_EMPTY; } std::string asString() const { switch (type_) { case TOKEN_STRING: { std::string buffer; SourceIterators string = getString(&buffer); return std::string(string.first, string.second); } case TOKEN_MISSING: return "[MISSING]"; case TOKEN_EMPTY: return "[EMPTY]"; case TOKEN_EOF: return "[EOF]"; } return ""; } SEXP asRaw() const { size_t n = (type_ == TOKEN_STRING) ? 
end_ - begin_ : 0; cpp11::writable::raws out(n); if (n > 0) memcpy(RAW(out), begin_, n); return out; } SEXP asSEXP(Iconv* pEncoder) const { switch (type_) { case TOKEN_STRING: { std::string buffer; SourceIterators string = getString(&buffer); return pEncoder->makeSEXP(string.first, string.second, hasNull_); } default: return NA_STRING; } } TokenType type() const { return type_; } SourceIterators getString(std::string* pOut) const { if (pTokenizer_ == NULL) return std::make_pair(begin_, end_); pTokenizer_->unescape(begin_, end_, pOut); return std::make_pair(pOut->data(), pOut->data() + pOut->size()); } size_t row() const { return row_; } size_t col() const { return col_; } bool hasNull() const { return hasNull_; } Token& trim() { while (begin_ != end_ && (*begin_ == ' ' || *begin_ == '\t')) begin_++; while (end_ != begin_ && (*(end_ - 1) == ' ' || *(end_ - 1) == '\t')) end_--; if (begin_ == end_) type_ = TOKEN_EMPTY; return *this; } Token& flagNA(const std::vector& NA) { std::vector::const_iterator it; for (it = NA.begin(); it != NA.end(); ++it) { if ((size_t)(end_ - begin_) != it->size()) continue; if (strncmp(begin_, it->data(), it->size()) == 0) { type_ = TOKEN_MISSING; break; } } return *this; } }; #endif readr/src/Iconv.cpp0000644000176200001440000000467714174704674013754 0ustar liggesusers#include "Iconv.h" #include "cpp11/protect.hpp" #include Iconv::Iconv(const std::string& from, const std::string& to) { if (from == "UTF-8") { cd_ = nullptr; } else { cd_ = Riconv_open(to.c_str(), from.c_str()); if (cd_ == (void*)-1) { if (errno == EINVAL) { cpp11::stop("Can't convert from %s to %s", from.c_str(), to.c_str()); } else { cpp11::stop("Iconv initialisation failed"); } } // Allocate space in buffer buffer_.resize(1024); } } Iconv::~Iconv() { if (cd_ != nullptr) { Riconv_close(cd_); cd_ = nullptr; } } size_t Iconv::convert(const char* start, const char* end) { size_t n = end - start; // Ensure buffer is big enough: one input byte can never generate // more than 4 output bytes size_t max_size = n * 4; if (buffer_.size() < max_size) { buffer_.resize(max_size); } char* outbuf = &buffer_[0]; size_t inbytesleft = n; size_t outbytesleft = max_size; size_t res = Riconv(cd_, &start, &inbytesleft, &outbuf, &outbytesleft); if (res == (size_t)-1) { switch (errno) { case EILSEQ: cpp11::stop("Invalid multibyte sequence"); case EINVAL: cpp11::stop("Incomplete multibyte sequence"); case E2BIG: cpp11::stop("Iconv buffer too small"); default: cpp11::stop("Iconv failed to convert for unknown reason"); } } return max_size - outbytesleft; } int my_strnlen(const char* s, int maxlen) { for (int n = 0; n < maxlen; ++n) { if (s[n] == '\0') { return n; } } return maxlen; } #if defined(__sun) #define readr_strnlen my_strnlen #else #define readr_strnlen strnlen #endif // To be safe, we need to check for nulls - this also needs to emit // a warning, but this behaviour is better than crashing SEXP safeMakeChar(const char* start, size_t n, bool hasNull) { size_t m = hasNull ? 
readr_strnlen(start, n) : n; if (m > INT_MAX) { cpp11::stop("R character strings are limited to 2^31-1 bytes"); } return Rf_mkCharLenCE(start, m, CE_UTF8); } SEXP Iconv::makeSEXP(const char* start, const char* end, bool hasNull) { if (cd_ == nullptr) { return safeMakeChar(start, end - start, hasNull); } int n = convert(start, end); return safeMakeChar(&buffer_[0], n, hasNull); } std::string Iconv::makeString(const char* start, const char* end) { if (cd_ == nullptr) { return std::string(start, end); } int n = convert(start, end); return std::string(&buffer_[0], n); } readr/vignettes/0000755000176200001440000000000014547603063013367 5ustar liggesusersreadr/vignettes/column-types.Rmd0000644000176200001440000000524514371264576016477 0ustar liggesusers--- title: "Column type" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Column type} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` This vignette provides an overview of column type specification with readr. Currently it focuses on how automatic guessing works, but over time we expect to cover more topics. ```{r setup} library(readr) ``` ## Automatic guessing If you don't explicitly specify column types with the `col_types` argument, readr will attempt to guess them using some simple heuristics. By default, it will inspect 1000 values, evenly spaced from the first to the last row. This is a heuristic designed to always be fast (no matter how large your file is) and, in our experience, does a good job in most cases. If needed, you can request that readr use more rows by supplying the `guess_max` argument. You can even supply `guess_max = Inf` to use every row to guess the column types. You might wonder why this isn't the default. That's because it's slow: it has to look at every column twice, once to determine the type and once to parse the value. In most cases, you're best off supplying `col_types` yourself. ### Legacy behavior Column type guessing was substantially worse in the first edition of readr (meaning, prior to v2.0.0), because it always looked at the first 1000 rows, and through some application of Murphy's Law, it appears that many real csv files have lots of empty values at the start, followed by more "excitement" later in the file. Let's demonstrate the problem with a slightly tricky file: the column `x` is mostly empty, but has some numeric data at the very end, in row 1001. ```{r} tricky_dat <- tibble::tibble( x = rep(c("", "2"), c(1000, 1)), y = "y" ) tfile <- tempfile("tricky-column-type-guessing-", fileext = ".csv") write_csv(tricky_dat, tfile) ``` The first edition parser doesn't guess the right type for `x` so the `2` becomes an `NA`: ```{r} df <- with_edition(1, read_csv(tfile)) tail(df) ``` For this specific case, we can fix the problem by marginally increasing `guess_max`: ```{r} df <- with_edition(1, read_csv(tfile, guess_max = 1001)) tail(df) ``` Unlike the second edition, we don't recommend using `guess_max = Inf` with the legacy parser, because the engine pre-allocates a large amount of memory in the face of this uncertainty. This means that reading with `guess_max = Inf` can be extremely slow and might even crash your R session.
Instead specify the `col_types`: ```{r} df <- with_edition(1, read_csv(tfile, col_types = list(x = col_double()))) tail(df) ``` ```{r} #| include: false file.remove(tfile) ``` readr/vignettes/readr.Rmd0000644000176200001440000002415714314603711015131 0ustar liggesusers--- title: "Introduction to readr" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to readr} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} library(readr) knitr::opts_chunk$set(collapse = TRUE, comment = "#>") ``` The key problem that readr solves is __parsing__ a flat file into a tibble. Parsing is the process of taking a text file and turning it into a rectangular tibble where each column is the appropriate part. Parsing takes place in three basic stages: 1. The flat file is parsed into a rectangular matrix of strings. 1. The type of each column is determined. 1. Each column of strings is parsed into a vector of a more specific type. It's easiest to learn how this works in the opposite order. Below, you'll learn how the: 1. __Vector parsers__ turn a character vector into a more specific type. 1. __Column specification__ describes the type of each column and the strategy readr uses to guess types so you don't need to supply them all. 1. __Rectangular parsers__ turn a flat file into a matrix of rows and columns. Each `parse_*()` is coupled with a `col_*()` function, which will be used in the process of parsing a complete tibble. ## Vector parsers It's easiest to learn the vector parsers using `parse_` functions. These all take a character vector and some options. They return a new vector the same length as the old, along with an attribute describing any problems. ### Atomic vectors `parse_logical()`, `parse_integer()`, `parse_double()`, and `parse_character()` are straightforward parsers that produce the corresponding atomic vector. ```{r} parse_integer(c("1", "2", "3")) parse_double(c("1.56", "2.34", "3.56")) parse_logical(c("true", "false")) ``` By default, readr expects `.` as the decimal mark and `,` as the grouping mark. You can override this default using `locale()`, as described in `vignette("locales")`. ### Flexible numeric parser `parse_integer()` and `parse_double()` are strict: the input string must be a single number with no leading or trailing characters. `parse_number()` is more flexible: it ignores non-numeric prefixes and suffixes, and knows how to deal with grouping marks. This makes it suitable for reading currencies and percentages: ```{r} parse_number(c("0%", "10%", "150%")) parse_number(c("$1,234.5", "$12.45")) ``` ### Date/times readr supports three types of date/time data: * dates: number of days since 1970-01-01. * times: number of seconds since midnight. * datetimes: number of seconds since midnight 1970-01-01. ```{r} parse_datetime("2010-10-01 21:45") parse_date("2010-10-01") parse_time("1:00pm") ``` Each function takes a `format` argument which describes the format of the string. If not specified, it uses a default value: * `parse_datetime()` recognises [ISO8601](https://en.wikipedia.org/wiki/ISO_8601) datetimes. * `parse_date()` uses the `date_format` specified by the `locale()`. The default value is `%AD` which uses an automatic date parser that recognises dates of the format `Y-m-d` or `Y/m/d`. * `parse_time()` uses the `time_format` specified by the `locale()`. The default value is `%At` which uses an automatic time parser that recognises times of the form `H:M` optionally followed by seconds and am/pm.
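For example, those flexible defaults mean that all of the following parse without an explicit `format` (a quick sketch of the behaviour described above; results not shown):

```r
# Both separator styles are recognised by the default %AD date parser
parse_date("2010-10-01")
parse_date("2010/10/01")

# The default %At time parser accepts an optional seconds component
parse_time("21:45")
parse_time("21:45:30")
```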
In most cases, you will need to supply a `format`, as documented in `parse_datetime()`: ```{r} parse_datetime("1 January, 2010", "%d %B, %Y") parse_datetime("02/02/15", "%m/%d/%y") ``` ### Factors When reading a column that has a known set of values, you can read directly into a factor. `parse_factor()` will generate a warning if a value is not in the supplied levels. ```{r} parse_factor(c("a", "b", "a"), levels = c("a", "b", "c")) parse_factor(c("a", "b", "d"), levels = c("a", "b", "c")) ``` ## Column specification It would be tedious if you had to specify the type of every column when reading a file. Instead, readr uses some heuristics to guess the type of each column. You can access these results yourself using `guess_parser()`: ```{r} guess_parser(c("a", "b", "c")) guess_parser(c("1", "2", "3")) guess_parser(c("1,000", "2,000", "3,000")) guess_parser(c("2001/10/10")) ``` The guessing policies are described in the documentation for the individual functions. Guesses are fairly strict. For example, we don't guess that currencies are numbers, even though we can parse them: ```{r} guess_parser("$1,234") parse_number("$1,234") ``` There are two parsers that will never be guessed: `col_skip()` and `col_factor()`. You will always need to supply these explicitly. You can see the specification that readr would generate for a column file by using `spec_csv()`, `spec_tsv()` and so on: ```{r} x <- spec_csv(readr_example("challenge.csv")) ``` For bigger files, you can often make the specification simpler by changing the default column type using `cols_condense()` ```{r} mtcars_spec <- spec_csv(readr_example("mtcars.csv")) mtcars_spec cols_condense(mtcars_spec) ``` By default readr only looks at the first 1000 rows. This keeps file parsing speedy, but can generate incorrect guesses. For example, in `challenge.csv` the column types change in row 1001, so readr guesses the wrong types. One way to resolve the problem is to increase the number of rows: ```{r} x <- spec_csv(readr_example("challenge.csv"), guess_max = 1001) ``` Another way is to manually specify the `col_type`, as described below. ## Rectangular parsers readr comes with five parsers for rectangular file formats: * `read_csv()` and `read_csv2()` for csv files * `read_tsv()` for tab separated files * `read_fwf()` for fixed-width files * `read_log()` for web log files Each of these functions first calls `spec_xxx()` (as described above), and then parses the file according to that column specification: ```{r} df1 <- read_csv(readr_example("challenge.csv")) ``` The rectangular parsing functions almost always succeed; they'll only fail if the format is severely messed up. Instead, readr will generate a data frame of problems. The first few will be printed out, and you can access them all with `problems()`: ```{r} problems(df1) ``` You've already seen one way of handling bad guesses: increasing the number of rows used to guess the type of each column. ```{r} df2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001) ``` Another approach is to manually supply the column specification. ### Overriding the defaults In the previous examples, you may have noticed that readr printed the column specification that it used to parse the file: ```{r} #> Parsed with column specification: #> cols( #> x = col_integer(), #> y = col_character() #> ) ``` You can also access it after the fact using `spec()`: ```{r} spec(df1) spec(df2) ``` (This also allows you to access the full column specification if you're reading a very wide file.
By default, readr will only print the specification of the first 20 columns.) If you want to manually specify the column types, you can start by copying and pasting this code, and then tweaking it to fix the parsing problems. ```{r} df3 <- read_csv( readr_example("challenge.csv"), col_types = list( x = col_double(), y = col_date(format = "") ) ) ``` In general, it's good practice to supply an explicit column specification. It is more work, but it ensures that you get warnings if the data changes in unexpected ways. To be really strict, you can use `stop_for_problems(df3)`. This will throw an error if there are any parsing problems, forcing you to fix those problems before proceeding with the analysis. ### Available column specifications The available specifications are: (with string abbreviations in brackets) * `col_logical()` [l], containing only `T`, `F`, `TRUE` or `FALSE`. * `col_integer()` [i], integers. * `col_double()` [d], doubles. * `col_character()` [c], everything else. * `col_factor(levels, ordered)` [f], a fixed set of values. * `col_date(format = "")` [D]: with the locale's `date_format`. * `col_time(format = "")` [t]: with the locale's `time_format`. * `col_datetime(format = "")` [T]: ISO8601 date times * `col_number()` [n], numbers containing the `grouping_mark` * `col_skip()` [_, -], don't import this column. * `col_guess()` [?], parse using the "best" type based on the input. Use the `col_types` argument to override the default choices. There are two ways to use it: * With a string: `"dc__d"`: read first column as double, second as character, skip the next two and read the last column as a double. (There's no way to use this form with types that take additional parameters.) * With a (named) list of col objects: ```r read_csv("iris.csv", col_types = list( Sepal.Length = col_double(), Sepal.Width = col_double(), Petal.Length = col_double(), Petal.Width = col_double(), Species = col_factor(c("setosa", "versicolor", "virginica")) )) ``` Or, with their abbreviations: ```r read_csv("iris.csv", col_types = list( Sepal.Length = "d", Sepal.Width = "d", Petal.Length = "d", Petal.Width = "d", Species = col_factor(c("setosa", "versicolor", "virginica")) )) ``` Any omitted columns will be parsed automatically, so the previous call will lead to the same result as: ```r read_csv("iris.csv", col_types = list( Species = col_factor(c("setosa", "versicolor", "virginica"))) ) ``` You can also set a default type that will be used instead of relying on the automatic detection for columns you don't specify: ```r read_csv("iris.csv", col_types = list( Species = col_factor(c("setosa", "versicolor", "virginica")), .default = col_double()) ) ``` If you only want to read specified columns, use `cols_only()`: ```r read_csv("iris.csv", col_types = cols_only( Species = col_factor(c("setosa", "versicolor", "virginica"))) ) ``` ### Output The output of all these functions is a tibble. Note that characters are never automatically converted to factors (i.e. no more `stringsAsFactors = FALSE`) and column names are left as is, not munged into valid R identifiers (i.e. there is no `check.names = TRUE`). Row names are never set. Attributes store the column specification (`spec()`) and any parsing problems (`problems()`).
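For example, both attributes are still available on the `df2` object created above (a short sketch; the output depends on the file):

```r
spec(df2)      # the column specification readr used
problems(df2)  # any parsing problems, as a tibble
```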
readr/vignettes/locales.Rmd0000644000176200001440000002063014510343737015455 0ustar liggesusers--- title: "Locales" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Locales} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} library(readr) knitr::opts_chunk$set(collapse = TRUE, comment = "#>") ``` The goal of readr's locales is to encapsulate common options that vary between languages and localities. This includes: * The names of months and days, used when parsing dates. * The default time zone, used when parsing datetimes. * The character encoding, used when reading non-ASCII strings. * Default date format, used when guessing column types. * The decimal and grouping marks, used when reading numbers. (Strictly speaking these are not locales in the usual technical sense of the word because they also contain information about time zones and encoding.) To create a new locale, you use the `locale()` function: ```{r} locale() ``` The rest of this vignette will explain what each of the options does. All of the parsing functions in readr take a `locale` argument. You'll most often use it with `read_csv()`, `read_fwf()` or `read_table()`. Readr is designed to work the same way across systems, so the default locale is English centric like R. If you're not in an English speaking country, this makes initial import a little harder, because you have to override the defaults. But the payoff is big: you can share your code and know that it will work on any other system. Base R takes a different philosophy. It uses system defaults, so typical data import is a little easier, but sharing code is harder. Rather than demonstrating the use of locales with `read_csv()` and friends, in this vignette I'm going to use the `parse_*()` functions. These work with a character vector instead of a file on disk, so they're easier to use in examples. They're also useful in their own right if you need to do custom parsing. See `type_convert()` if you need to apply multiple parsers to a data frame. ## Dates and times ### Names of months and days The first argument to `locale()` is `date_names`, and it controls what values are used for month and day names. The easiest way to specify it is with an ISO 639 language code: ```{r} locale("ko") # Korean locale("fr") # French ``` If you don't already know the code for your language, [Wikipedia](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) has a good list. Currently readr has `r length(date_names_langs())` languages available. You can list them all with `date_names_langs()`. Specifying a locale allows you to parse dates in other languages: ```{r} parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr")) parse_date("14 oct. 1979", "%d %b %Y", locale = locale("fr")) ``` For many languages, it's common to find that diacritics have been stripped so they can be stored as ASCII. You can tell the locale that with the `asciify` option: ```{r} parse_date("1 août 2015", "%d %B %Y", locale = locale("fr")) parse_date("1 aout 2015", "%d %B %Y", locale = locale("fr", asciify = TRUE)) ``` Note that the quality of the translations is variable, especially for the rarer languages. If you discover that they're not quite right for your data, you can create your own with `date_names()`.
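To get a feel for what a complete set of date names looks like before building your own, print one of the built-in sets with `date_names_lang()`:

```r
date_names_lang("fr")
```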
The following example creates a locale with Māori date names: ```{r} maori <- locale(date_names( day = c("Rātapu", "Rāhina", "Rātū", "Rāapa", "Rāpare", "Rāmere", "Rāhoroi"), mon = c("Kohi-tātea", "Hui-tanguru", "Poutū-te-rangi", "Paenga-whāwhā", "Haratua", "Pipiri", "Hōngongoi", "Here-turi-kōkā", "Mahuru", "Whiringa-ā-nuku", "Whiringa-ā-rangi", "Hakihea") )) ``` ### Timezones Unless otherwise specified, readr assumes that times are in UTC, the Universal Coordinated Time (this is a successor to GMT and for almost all intents is identical). UTC is most suitable for data because it doesn't have daylight savings - this avoids a whole class of potential problems. If your data isn't already in UTC, you'll need to supply a `tz` in the locale: ```{r} parse_datetime("2001-10-10 20:10") parse_datetime("2001-10-10 20:10", locale = locale(tz = "Pacific/Auckland")) parse_datetime("2001-10-10 20:10", locale = locale(tz = "Europe/Dublin")) ``` You can see a complete list of time zones with `OlsonNames()`. If you're American, note that "EST" is a Canadian time zone that does not have DST. It's not Eastern Standard Time! Instead use: * PST/PDT = "US/Pacific" * CST/CDT = "US/Central" * MST/MDT = "US/Mountain" * EST/EDT = "US/Eastern" (Note that there are more specific time zones for smaller areas that don't follow the same rules. For example, "US/Arizona", which mostly follows mountain time, but doesn't have daylight savings. If you're dealing with historical data, you might need an even more specific zone like "America/North_Dakota/New_Salem" - that will get you the most accurate time zones.) Note that these are only used as defaults. If individual times have timezones and you're using "%Z" (as name, e.g. "America/Chicago") or "%z" (as offset from UTC, e.g. "+0800"), they'll override the defaults. There's currently no good way to parse times that use US abbreviations. Note that once you have the date in R, changing the time zone just changes its printed representation - it still represents the same instants of time. If you've loaded non-UTC data, and want to display it as UTC, try this snippet of code: ```{r, eval = FALSE} is_datetime <- sapply(df, inherits, "POSIXct") df[is_datetime] <- lapply(df[is_datetime], function(x) { attr(x, "tzone") <- "UTC" x }) ``` ### Default formats Locales also provide default date and time formats. The date format is used when guessing column types. The default date format is `%AD`, a flexible YMD parser (see `?parse_date`): ```{r} str(parse_guess("2010-10-10")) str(parse_guess("2010/10/10")) ``` If you're an American, you might want to use your illogical date system: ```{r} str(parse_guess("01/31/2013")) str(parse_guess("01/31/2013", locale = locale(date_format = "%m/%d/%Y"))) ``` The time format is also used when guessing column types. The default time format is `%AT`, a flexible HMS parser (see `?parse_time`): ```{r} str(parse_guess("17:55:14")) str(parse_guess("5:55:14 PM")) # Example of a non-standard time str(parse_guess("h5m55s14 PM")) str(parse_guess("h5m55s14 PM", locale = locale(time_format = "h%Hm%Ms%S %p"))) ``` ## Character All readr functions yield strings encoded in UTF-8. This encoding is the most likely to give good results in the widest variety of settings. By default, readr assumes that your input is also in UTF-8. This is less likely to be the case, especially when you're working with older datasets.
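When an entire file is affected, the fix is usually a single argument at read time — a sketch, assuming a hypothetical `data.csv` saved in Latin-1:

```r
# Hypothetical file; the locale's encoding is the only readr-specific part
df <- read_csv("data.csv", locale = locale(encoding = "latin1"))
```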
The following code illustrates the problems with encodings: ```{r} library(stringi) x <- "Émigré cause célèbre déjà vu.\n" y <- stri_conv(x, "UTF-8", "latin1") # These strings look like they're identical: x y identical(x, y) # But they have different encodings: Encoding(x) Encoding(y) # That means while they print the same, their raw (binary) # representation is actually quite different: charToRaw(x) charToRaw(y) # readr expects strings to be encoded as UTF-8. If they're # not, you'll get weird characters parse_character(x) parse_character(y) # If you know the encoding, supply it: parse_character(y, locale = locale(encoding = "latin1")) ``` If you don't know what encoding the file uses, try [`guess_encoding()`](https://readr.tidyverse.org/reference/encoding.html). It's not 100% perfect (as it's fundamentally a heuristic), but should at least get you pointed in the right direction: ```{r} guess_encoding(x) guess_encoding(y) # Note that the first guess produces a valid string, but isn't correct: parse_character(y, locale = locale(encoding = "ISO-8859-2")) # But ISO-8859-1 is another name for latin1 parse_character(y, locale = locale(encoding = "ISO-8859-1")) ``` ## Numbers Some countries use the decimal point, while others use the decimal comma. The `decimal_mark` option controls which readr uses when parsing doubles: ```{r} parse_double("1,23", locale = locale(decimal_mark = ",")) ``` Additionally, when writing out big numbers, you might have `1,000,000`, `1.000.000`, `1 000 000`, or `1'000'000`. The grouping mark is ignored by the more flexible number parser: ```{r} parse_number("$1,234.56") parse_number("$1.234,56", locale = locale(decimal_mark = ",", grouping_mark = ".") ) # readr is smart enough to guess that if you're using , for decimals then # you're probably using . for grouping: parse_number("$1.234,56", locale = locale(decimal_mark = ",")) ``` readr/R/0000755000176200001440000000000014547554550011566 5ustar liggesusersreadr/R/melt_fwf.R0000644000176200001440000000543014174704674013516 0ustar liggesusers #' Return melted data for each token in a fixed width file #' #' `r lifecycle::badge("superseded")` #' This function has been superseded in readr and moved to [the meltr #' package](https://r-lib.github.io/meltr/). #' #' For certain non-rectangular data formats, it can be useful to parse the data #' into a melted format where each row represents a single token. #' #' `melt_fwf()` parses each token of a fixed width file into a single row, but #' it still requires that each field is in the same position in every row of #' the source file. #' #' @seealso [melt_table()] to melt fixed width files where each #' column is separated by whitespace, and [read_fwf()] for the conventional #' way to read rectangular data from fixed width files. #' @inheritParams read_fwf #' @param col_positions Column positions, as created by [fwf_empty()], #' [fwf_widths()] or [fwf_positions()]. To read in only selected fields, #' use [fwf_positions()]. If the width of the last column is variable (a #' ragged fwf file), supply the last end position as NA. #' @export #' @examples #' fwf_sample <- readr_example("fwf-sample.txt") #' cat(read_lines(fwf_sample)) #' #' # You can specify column positions in several ways: #' # 1. Guess based on position of empty columns #' melt_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) #' # 2. A vector of field widths #' melt_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) #' # 3.
Paired vectors of start and end positions #' melt_fwf(fwf_sample, fwf_positions(c(1, 30), c(10, 42), c("name", "ssn"))) #' # 4. Named arguments with start and end positions #' melt_fwf(fwf_sample, fwf_cols(name = c(1, 10), ssn = c(30, 42))) #' # 5. Named arguments with column widths #' melt_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) melt_fwf <- function(file, col_positions, locale = default_locale(), na = c("", "NA"), comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE) { if (!edition_first()) { lifecycle::deprecate_soft("2.0.0", what = "melt_fwf()", details = "Please use `meltr::melt_fwf()` instead") } ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) if (inherits(ds, "source_file") && empty_file(file)) { return(tibble::tibble( row = double(), col = double(), data_type = character(), value = character() )) } tokenizer <- tokenizer_fwf(as.integer(col_positions$begin), as.integer(col_positions$end), na = na, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) out <- melt_tokens(ds, tokenizer, locale_ = locale, n_max = if (n_max == Inf) -1 else n_max, progress = progress ) warn_problems(out) } readr/R/sysdata.rda0000644000176200001440000007256214152512262013724 0ustar liggesusersreadr/R/source.R datasource <- function(file, skip = 0, skip_empty_rows = FALSE, comment = "", skip_quote = TRUE) { if (inherits(file, "source")) { if (!missing(skip)) { file$skip <- skip } if (!missing(comment)) { file$comment <- comment } file } else if (is.connection(file)) { datasource_connection(file, skip, skip_empty_rows, comment, skip_quote) } else if (is.raw(file)) { datasource_raw(file, skip, skip_empty_rows, comment, skip_quote) } else if (is.character(file)) { if (length(file) > 1) { datasource_string(paste(file, collapse = "\n"), skip, skip_empty_rows, comment, skip_quote) } else if (grepl("\n", file)) { datasource_string(file, skip, skip_empty_rows, comment, skip_quote) } else { file <- standardise_path(file) if (is.connection(file)) { datasource_connection(file, skip, skip_empty_rows, comment, skip_quote) } else { datasource_file(file, skip, skip_empty_rows, comment, skip_quote) } } } else { stop("`file` must be a string, raw vector or a connection.", call. = FALSE) } } # Constructors ----------------------------------------------------------------- new_datasource <- function(type, x, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE, ...)
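# A datasource is a classed list bundling the underlying source with the
# options shared by every reader; the class (e.g. "source_file") is what
# lets downstream code dispatch on the kind of input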
{ structure(list(x, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote, ...), class = c(paste0("source_", type), "source") ) } datasource_string <- function(text, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE) { new_datasource("string", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote) } datasource_file <- function(path, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE, ...) { path <- check_path(path) new_datasource("file", path, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote, ...) } datasource_connection <- function(path, skip, skip_empty_rows = TRUE, comment = "", skip_quote = TRUE) { # We read the connection to a temporary file, then register a finalizer to # clean up the temp file after the datasource object is removed. file <- read_connection(path) env <- new.env(parent = emptyenv()) reg.finalizer(env, function(env) unlink(file)) datasource_file(file, skip, skip_empty_rows = skip_empty_rows, comment = comment, env = env, skip_quote = skip_quote) } datasource_raw <- function(text, skip, skip_empty_rows, comment, skip_quote = TRUE) { new_datasource("raw", text, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote) } # Helpers ---------------------------------------------------------------------- read_connection <- function(con, chunk_size = 64L * 1024L) { stopifnot(is.connection(con)) if (!isOpen(con)) { on.exit(close(con), add = TRUE) open(con, "rb") } read_connection_(con, tempfile(), chunk_size) } standardise_path <- function(path, input = TRUE) { if (!is.character(path)) { return(path) } if (length(path) > 1) { return(paste(path, collapse = "\n")) } if (grepl("\n", path)) { return(path) } if (is_url(path)) { if (requireNamespace("curl", quietly = TRUE)) { con <- curl::curl(path) } else { cli::cli_alert_warning("{.pkg curl} package not installed, falling back to using {.fn url}") con <- url(path) } ext <- tolower(tools::file_ext(path)) return( switch(ext, bz2 = , xz = { close(con) stop("Reading from remote `", ext, "` compressed files is not supported,\n", " download the files locally first.", call. = FALSE ) }, gz = gzcon(con), con ) ) } if (isTRUE(input)) { path <- check_path(path) compression <- detect_compression(path) } else { compression <- tools::file_ext(path) } switch(compression, gz = gzfile(path, ""), bz2 = bzfile(path, ""), xz = xzfile(path, ""), zip = zipfile(path, ""), { path <- normalizePath(path, mustWork = FALSE) # Use a file connection for output if (!isTRUE(input)) { file(path, "") } else { enc2utf8(path) } } ) } source_name <- function(x) { if (is.connection(x)) { "<connection>" } else if (is.raw(x)) { "<raw vector>" } else if (is.character(x)) { if (length(x) > 1 || grepl("\n", x)) { "literal data" } else { paste0("'", x, "'") } } else { "???" } } is_url <- function(path) { grepl("^((http|ftp)s?|sftp)://", path) } check_path <- function(path) { if (file.exists(path)) { return(normalizePath(path, "/", mustWork = FALSE)) } stop("'", path, "' does not exist", if (!is_absolute_path(path)) { paste0(" in current working directory ('", getwd(), "')") }, ".", call.
= FALSE ) } is_absolute_path <- function(path) { grepl("^(/|[A-Za-z]:|\\\\|~)", path) } zipfile <- function(path, open = "r") { files <- utils::unzip(path, list = TRUE) file <- files$Name[[1]] if (nrow(files) > 1) { suppressWarnings(cli::cli_alert_warning("Multiple files in zip: reading {.file '{file}'}")) } unz(path, file, open = open) } empty_file <- function(x) { is.character(x) && file.exists(x) && file.info(x, extra_cols = FALSE)$size == 0 } #' Returns values from the clipboard #' #' This is useful in the [read_delim()] functions to read from the clipboard. #' @seealso read_delim #' @export clipboard <- function() { if (edition_first()) { return(clipr::read_clip()) } I(paste0(clipr::read_clip(), collapse = "\n")) } detect_compression <- function(path) { bytes <- readBin(path, "raw", n = 6) if (length(bytes) >= 2 && bytes[[1]] == 0x1f && bytes[[2]] == 0x8b) { return("gz") } if (length(bytes) >= 6 && bytes[[1]] == 0xFD && bytes[[2]] == 0x37 && bytes[[3]] == 0x7A && bytes[[4]] == 0x58 && bytes[[5]] == 0x5A && bytes[[6]] == 0x00) { return("xz") } if (length(bytes) >= 3 && bytes[[1]] == 0x42 && bytes[[2]] == 0x5a && bytes[[3]] == 0x68) { return("bz2") } # normal zip if (length(bytes) >= 4 && bytes[[1]] == 0x50 && bytes[[2]] == 0x4B && bytes[[3]] == 0x03 && bytes[[4]] == 0x04) { return("zip") } # empty zip if (length(bytes) >= 4 && bytes[[1]] == 0x50 && bytes[[2]] == 0x4B && bytes[[3]] == 0x05 && bytes[[4]] == 0x06) { return("zip") } # spanned zip if (length(bytes) >= 4 && bytes[[1]] == 0x50 && bytes[[2]] == 0x4B && bytes[[3]] == 0x07 && bytes[[4]] == 0x08) { return("zip") } NA_character_ } readr/R/melt_delim.R0000644000176200001440000001752414304131171014012 0ustar liggesusers#' Return melted data for each token in a delimited file (including csv & tsv) #' #' `r lifecycle::badge("superseded")` #' This function has been superseded in readr and moved to [the meltr #' package](https://r-lib.github.io/meltr/). #' #' For certain non-rectangular data formats, it can be useful to parse the data #' into a melted format where each row represents a single token. #' #' `melt_csv()` and `melt_tsv()` are special cases of the general #' `melt_delim()`. They're useful for reading the most common types of #' flat file data, comma separated values and tab separated values, #' respectively. `melt_csv2()` uses `;` for the field separator and `,` for the #' decimal point. This is common in some European countries. #' @inheritParams read_delim #' @return A [tibble()] of four columns: #' * `row`, the row that the token comes from in the original file #' * `col`, the column that the token comes from in the original file #' * `data_type`, the data type of the token, e.g. `"integer"`, `"character"`, #' `"date"`, guessed in a similar way to the `guess_parser()` function. #' * `value`, the token itself as a character string, unchanged from its #' representation in the original file. #' #' If there are parsing problems, a warning tells you #' how many, and you can retrieve the details with [problems()]. #' @seealso [read_delim()] for the conventional way to read rectangular data #' from delimited files. 
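#'   [melt_fwf()] for the fixed-width equivalent.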
#' @export #' @examples #' # Input sources ------------------------------------------------------------- #' # Read from a path #' melt_csv(readr_example("mtcars.csv")) #' melt_csv(readr_example("mtcars.csv.zip")) #' melt_csv(readr_example("mtcars.csv.bz2")) #' \dontrun{ #' melt_csv("https://github.com/tidyverse/readr/raw/main/inst/extdata/mtcars.csv") #' } #' #' # Or directly from a string (must contain a newline) #' melt_csv("x,y\n1,2\n3,4") #' #' # To import empty cells as 'empty' rather than `NA` #' melt_csv("x,y\n,NA,\"\",''", na = "NA") #' #' # File types ---------------------------------------------------------------- #' melt_csv("a,b\n1.0,2.0") #' melt_csv2("a;b\n1,0;2,0") #' melt_tsv("a\tb\n1.0\t2.0") #' melt_delim("a|b\n1.0|2.0", delim = "|") #' @export melt_delim <- function(file, delim, quote = '"', escape_backslash = FALSE, escape_double = TRUE, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = FALSE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE) { if (!edition_first()) { lifecycle::deprecate_soft("2.0.0", what = "melt_delim()", details = "Please use `meltr::melt_delim()` instead") } if (!nzchar(delim)) { stop("`delim` must be at least one character, ", "use `melt_table()` for whitespace delimited input.", call. = FALSE ) } tokenizer <- tokenizer_delim(delim, quote = quote, escape_backslash = escape_backslash, escape_double = escape_double, na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) melt_delimited(file, tokenizer, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, progress = progress ) } #' @rdname melt_delim #' @export melt_csv <- function(file, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE) { if (!edition_first()) { lifecycle::deprecate_soft("2.0.0", what = "melt_csv()", details = "Please use `meltr::melt_csv()` instead") } tokenizer <- tokenizer_csv( na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) melt_delimited(file, tokenizer, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, progress = progress ) } #' @rdname melt_delim #' @export melt_csv2 <- function(file, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE) { if (!edition_first()) { lifecycle::deprecate_soft("2.0.0", what = "melt_csv2()", details = "Please use `meltr::melt_csv2()` instead") } if (locale$decimal_mark == ".") { cli::cli_alert_info("Using {.val ','} as decimal and {.val '.'} as grouping mark. Use {.fn read_delim} for more control.") locale$decimal_mark <- "," locale$grouping_mark <- "." 
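# With the flipped locale, "1,5" parses as 1.5 and "." becomes the
# grouping (thousands) mark.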
} tokenizer <- tokenizer_delim( delim = ";", na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) melt_delimited(file, tokenizer, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, progress = progress ) } #' @rdname melt_delim #' @export melt_tsv <- function(file, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, progress = show_progress(), skip_empty_rows = FALSE) { if (!edition_first()) { lifecycle::deprecate_soft("2.0.0", what = "melt_tsv()", details = "Please use `meltr::melt_tsv()` instead") } tokenizer <- tokenizer_tsv( na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) melt_delimited(file, tokenizer, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, progress = progress ) } # Helper functions for reading from delimited files ---------------------------- col_spec_melt <- structure(list( row = structure(list(), class = c( "collector_double", "collector" ) ), col = structure(list(), class = c( "collector_double", "collector" ) ), data_type = structure(list(), class = c( "collector_character", "collector" ) ), value = structure(list(), class = c( "collector_character", "collector" ) ) ), .Names = c("row", "col", "data_type", "value") ) melt_tokens <- function(data, tokenizer, locale_, n_max, progress) { if (n_max == Inf) { n_max <- -1 } melt_tokens_(data, tokenizer, col_spec_melt, locale_, n_max, progress) } melt_delimited <- function(file, tokenizer, locale = default_locale(), skip = 0, skip_empty_rows = FALSE, comment = "", n_max = Inf, progress = show_progress()) { name <- source_name(file) # If connection needed, read once. file <- standardise_path(file) if (is.connection(file)) { data <- datasource_connection(file, skip, skip_empty_rows = skip_empty_rows, comment) } else { if (empty_file(file)) { return(tibble::tibble( row = double(), col = double(), data_type = character(), value = character() )) } if (is.character(file) && identical(locale$encoding, "UTF-8")) { # When locale is not set, the file is probably marked with its correct encoding. # As default_locale() assumes UTF-8, the file should be encoded as UTF-8 for non-UTF-8 MBCS locales. data <- enc2utf8(file) } else { data <- file } } ds <- datasource(data, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment) out <- melt_tokens(ds, tokenizer, locale_ = locale, n_max = n_max, progress = progress ) warn_problems(out) } readr/R/col_types.R0000644000176200001440000003755614304131171013707 0ustar liggesusers#' Create column specification #' #' `cols()` includes all columns in the input data, guessing the column types #' as the default. `cols_only()` includes only the columns you explicitly #' specify, skipping the rest. In general you can substitute `list()` for #' `cols()` without changing the behavior. #' #' The available specifications are: (with string abbreviations in brackets) #' #' * `col_logical()` \[l\], containing only `T`, `F`, `TRUE` or `FALSE`. #' * `col_integer()` \[i\], integers. #' * `col_double()` \[d\], doubles. #' * `col_character()` \[c\], everything else. #' * `col_factor(levels, ordered)` \[f\], a fixed set of values. #' * `col_date(format = "")` \[D\]: with the locale's `date_format`. #' * `col_time(format = "")` \[t\]: with the locale's `time_format`.
#' * `col_datetime(format = "")` \[T\]: ISO8601 date times. #' * `col_number()` \[n\], numbers containing the `grouping_mark`. #' * `col_skip()` \[_, -\], don't import this column. #' * `col_guess()` \[?\], parse using the "best" type based on the input. #' #' @family parsers #' @param ... Either column objects created by `col_*()`, or their abbreviated #' character names (as described in the `col_types` argument of #' [read_delim()]). If you're only overriding a few columns, it's #' best to refer to columns by name. If not named, the column types are #' matched by position, so supply them in the same order as the columns. #' @param .default Any named columns not explicitly overridden in `...` #' will be read with this column type. #' @export #' @examples #' cols(a = col_integer()) #' cols_only(a = col_integer()) #' #' # You can also use the standard abbreviations #' cols(a = "i") #' cols(a = "i", b = "d", c = "_") #' #' # You can also use multiple sets of column definitions by combining #' # them like so: #' #' t1 <- cols( #' column_one = col_integer(), #' column_two = col_number() #' ) #' #' t2 <- cols( #' column_three = col_character() #' ) #' #' t3 <- t1 #' t3$cols <- c(t1$cols, t2$cols) #' t3 cols <- function(..., .default = col_guess()) { if (edition_first()) { col_types <- list(...) is_character <- vapply(col_types, is.character, logical(1)) col_types[is_character] <- lapply(col_types[is_character], col_concise) if (is.character(.default)) { .default <- col_concise(.default) } return(col_spec(col_types, .default)) } vroom::cols(..., .default = .default) } #' @export #' @rdname cols cols_only <- function(...) { cols(..., .default = col_skip()) } # col_spec ---------------------------------------------------------------- col_spec <- function(col_types, default = col_guess()) { stopifnot(is.list(col_types)) stopifnot(is.collector(default)) is_collector <- vapply(col_types, is.collector, logical(1)) if (any(!is_collector)) { stop("Some `col_types` are not S3 collector objects: ", paste(which(!is_collector), collapse = ", "), call. = FALSE ) } structure( list( cols = col_types, default = default ), class = "col_spec" ) } is.col_spec <- function(x) inherits(x, "col_spec") #' Generate a column specification #' #' This is most useful for generating a specification using the short form. #' @param x Input object #' @keywords internal #' @examples #' as.col_spec("cccnnn") #' @export as.col_spec <- function(x) UseMethod("as.col_spec") #' @export as.col_spec.character <- function(x) { if (is_named(x)) { return(as.col_spec(as.list(x))) } letters <- strsplit(x, "")[[1]] col_spec(lapply(letters, col_concise), col_guess()) } #' @export as.col_spec.NULL <- function(x) { col_spec(list()) } #' @export as.col_spec.list <- function(x) { do.call(cols, x) } #' @export as.col_spec.col_spec <- function(x) x #' @export as.col_spec.default <- function(x) { stop("`col_types` must be NULL, a list or a string", call. = FALSE) } type_to_col <- function(x, ...) { UseMethod("type_to_col") } #' @export type_to_col.default <- function(x, ...) { col_character() } #' @export type_to_col.logical <- function(x, ...) { col_logical() } #' @export type_to_col.integer <- function(x, ...) { col_integer() } #' @export type_to_col.double <- function(x, ...) { col_double() } #' @export type_to_col.factor <- function(x, ...) { col_factor(levels = levels(x), ordered = is.ordered(x), include_na = any(is.na(levels(x)))) } #' @export type_to_col.Date <- function(x, ...) { col_date() } #' @export type_to_col.POSIXct <- function(x, ...)
{ col_datetime() } #' @export type_to_col.hms <- function(x, ...) { col_time() } #' @export as.col_spec.data.frame <- function(x) { as.col_spec(lapply(x, type_to_col)) } col_to_short <- function(x, ...) { switch(class(x)[[1]], collector_character = "c", collector_date = "D", collector_datetime = "T", collector_double = "d", collector_factor = "f", collector_guess = "?", collector_integer = "i", collector_logical = "l", collector_number = "n", collector_skip = "-", collector_time = "t" ) } #' @export as.character.col_spec <- function(x, ...) { paste0( collapse = "", vapply(x$cols, col_to_short, character(1)) ) } #' @export print.col_spec <- function(x, n = Inf, condense = NULL, colour = crayon::has_color(), ...) { cat(format.col_spec(x, n = n, condense = condense, colour = colour, ...)) invisible(x) } #' @description #' `cols_condense()` takes a spec object and condenses its definition by setting #' the default column type to the most frequent type and only listing columns #' with a different type. #' @rdname spec #' @export cols_condense <- function(x) { types <- vapply(x$cols, function(xx) class(xx)[[1]], character(1)) counts <- table(types) most_common <- names(counts)[counts == max(counts)][[1]] x$default <- x$cols[types == most_common][[1]] x$cols <- x$cols[types != most_common] x } #' @export format.col_spec <- function(x, n = Inf, condense = NULL, colour = crayon::has_color(), ...) { if (n == 0) { return("") } # condense if cols >= n condense <- condense %||% (length(x$cols) >= n) if (isTRUE(condense)) { x <- cols_condense(x) } # truncate to minimum of n or length cols <- x$cols[seq_len(min(length(x$cols), n))] default <- NULL if (inherits(x$default, "collector_guess")) { fun_type <- "cols" } else if (inherits(x$default, "collector_skip")) { fun_type <- "cols_only" } else { fun_type <- "cols" type <- sub("^collector_", "", class(x$default)[[1]]) default <- paste0(".default = col_", type, "()") } cols_args <- c( default, vapply( seq_along(cols), function(i) { col_funs <- sub("^collector_", "col_", class(cols[[i]])[[1]]) args <- vapply(cols[[i]], deparse2, character(1), sep = "\n ") args <- paste(names(args), args, sep = " = ", collapse = ", ") col_funs <- paste0(col_funs, "(", args, ")") col_funs <- colourise_cols(col_funs, colour) col_names <- names(cols)[[i]] %||% "" # Need to handle unnamed columns and columns with non-syntactic names named <- col_names != "" non_syntactic <- !is_syntactic(col_names) & named col_names[non_syntactic] <- paste0("`", gsub("`", "\\\\`", col_names[non_syntactic]), "`") out <- paste0(col_names, " = ", col_funs) out[!named] <- col_funs[!named] out }, character(1) ) ) if (length(x$cols) == 0 && length(cols_args) == 0) { return(paste0(fun_type, "()\n")) } out <- paste0(fun_type, "(\n ", paste(collapse = ",\n ", cols_args)) if (length(x$cols) > n) { out <- paste0(out, "\n # ...
with ", length(x$cols) - n, " more columns") } out <- paste0(out, "\n)\n") out } colourise_cols <- function(cols, colourise = crayon::has_color()) { if (!isTRUE(colourise)) { return(cols) } fname <- sub("[(].*", "", cols) for (i in seq_along(cols)) { cols[[i]] <- switch(fname, col_skip = , col_guess = cols[[i]], col_character = , col_factor = crayon::red(cols[[i]]), col_logical = crayon::yellow(cols[[i]]), col_double = , col_integer = , col_number = crayon::green(cols[[i]]), col_date = , col_datetime = , col_time = crayon::blue(cols[[i]]) ) } cols } # Used in read_delim(), read_fwf() and type_convert() show_cols_spec <- function(spec, n = getOption("readr.num_columns", 20)) { if (n > 0) { cli_block(class = "readr_spec_message", { cli::cli_h1("Column specification") txt <- strsplit(format(spec, n = n, condense = NULL), "\n")[[1]] cli::cli_verbatim(txt) if (length(spec$cols) >= n) { cli::cli_alert_info("Use {.fn spec} for the full column specifications.") } }) } } # This allows str() on a tibble object to print a little nicer. #' @export str.col_spec <- function(object, ..., indent.str = "") { # Split the formatted column spec into strings specs <- strsplit(format(object), "\n")[[1]] cat( sep = "", "\n", # Append the current indentation string to the specs paste(indent.str, specs, collapse = "\n"), "\n" ) } #' Examine the column specifications for a data frame #' #' `spec()` extracts the full column specification from a tibble #' created by readr. #' #' @family parsers #' @param x The data frame object to extract from #' @return A col_spec object. #' @export #' @examples #' df <- read_csv(readr_example("mtcars.csv")) #' s <- spec(df) #' s #' #' cols_condense(s) spec <- function(x) { stopifnot(inherits(x, "tbl_df")) attr(x, "spec") } col_concise <- function(x) { switch(x, "_" = , "-" = col_skip(), "?" = col_guess(), c = col_character(), f = col_factor(), d = col_double(), i = col_integer(), l = col_logical(), n = col_number(), D = col_date(), T = col_datetime(), t = col_time(), stop("Unknown shortcut: ", x, call. = FALSE) ) } col_spec_standardise <- function(file, col_names = TRUE, col_types = NULL, guessed_types = NULL, comment = "", skip = 0, skip_empty_rows = TRUE, skip_quote = TRUE, guess_max = 1000, tokenizer = tokenizer_csv(), locale = default_locale(), drop_skipped_names = FALSE) { # Figure out the column names ----------------------------------------------- if (is.logical(col_names) && length(col_names) == 1) { ds_header <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows, skip_quote = skip_quote, comment = comment) if (col_names) { res <- guess_header(ds_header, tokenizer, locale) col_names <- res$header skip <- res$skip } else { n <- length(guess_header(ds_header, tokenizer, locale)$header) col_names <- paste0("X", seq_len(n)) } guessed_names <- TRUE } else if (is.character(col_names)) { guessed_names <- FALSE } else { stop("`col_names` must be TRUE, FALSE or a character vector", call. = FALSE) } missing_names <- is.na(col_names) if (any(missing_names)) { new_names <- paste0("X", seq_along(col_names)[missing_names]) col_names[missing_names] <- new_names warning( "Missing column names filled in: ", paste0( encodeString(new_names, quote = "'"), " [", which(missing_names), "]", collapse = ", " ), call. 
= FALSE ) } if (anyDuplicated(col_names)) { dups <- duplicated(col_names) old_names <- col_names col_names <- make.unique(col_names, sep = "_") warning( "Duplicated column names deduplicated: ", paste0( encodeString(old_names[dups], quote = "'"), " => ", encodeString(col_names[dups], quote = "'"), " [", which(dups), "]", collapse = ", " ), call. = FALSE ) } # Figure out column types ---------------------------------------------------- spec <- as.col_spec(col_types) type_names <- names(spec$cols) spec$skip <- skip if (length(spec$cols) == 0) { # no types specified so use defaults spec$cols <- rep(list(spec$default), length(col_names)) names(spec$cols) <- col_names } else if (is.null(type_names) && guessed_names) { # unnamed types & names guessed from header: match exactly if (length(spec$cols) != length(col_names)) { warning("Unnamed `col_types` should have the same length as `col_names`. ", "Using smaller of the two.", call. = FALSE ) n <- min(length(col_names), length(spec$cols)) spec$cols <- spec$cols[seq_len(n)] col_names <- col_names[seq_len(n)] } names(spec$cols) <- col_names } else if (is.null(type_names) && !guessed_names) { # unnamed types & names supplied: match non-skipped columns skipped <- vapply(spec$cols, inherits, "collector_skip", FUN.VALUE = logical(1) ) # Needed for read_fwf() because width generator functions have a name for # every column, even those that are skipped. Not needed for read_delim() if (drop_skipped_names) { col_names <- col_names[!skipped] } n_read <- sum(!skipped) n_names <- length(col_names) n_new <- abs(n_names - n_read) if (n_read < n_names) { warning("Insufficient `col_types`. Guessing ", n_new, " columns.", call. = FALSE ) spec$cols <- c(spec$cols, list(rep(col_guess(), n_new))) } else if (n_read > n_names) { warning("Insufficient `col_names`. Adding ", n_new, " names.", call. = FALSE ) col_names2 <- rep("", length(spec$cols)) col_names2[!skipped] <- c(col_names, paste0("X", seq_len(n_new) + n_names)) col_names <- col_names2 } else { col_names2 <- rep("", length(spec$cols)) col_names2[!skipped] <- col_names col_names <- col_names2 } names(spec$cols) <- col_names } else { # named types bad_types <- !(type_names %in% col_names) if (any(bad_types)) { warning("The following named parsers don't match the column names: ", paste0(type_names[bad_types], collapse = ", "), call. = FALSE ) spec$cols <- spec$cols[!bad_types] type_names <- type_names[!bad_types] } default_types <- !(col_names %in% type_names) if (any(default_types)) { defaults <- rep(list(spec$default), sum(default_types)) names(defaults) <- col_names[default_types] spec$cols[names(defaults)] <- defaults } spec$cols <- spec$cols[col_names] } # Guess any types that need to be guessed ------------------------------------ is_guess <- vapply(spec$cols, function(x) inherits(x, "collector_guess"), logical(1)) if (any(is_guess)) { if (is.null(guessed_types)) { ds <- datasource(file, skip = spec$skip, skip_empty_rows = skip_empty_rows, skip_quote = skip_quote, comment = comment) guessed_types <- guess_types(ds, tokenizer, locale, guess_max = guess_max) } # Need to be careful here: there might be more guesses than types/names guesses <- guessed_types[seq_along(spec$cols)][is_guess] spec$cols[is_guess] <- lapply(guesses, collector_find) } spec } check_guess_max <- function(guess_max, max_limit = .Machine$integer.max %/% 100) { if (length(guess_max) != 1 || !is.numeric(guess_max) || !is_integerish(guess_max) || is.na(guess_max) || guess_max < 0) { stop("`guess_max` must be a non-negative integer", call.
= FALSE) } if (guess_max > max_limit) { warning("`guess_max` is a very large value, setting to `", max_limit, "` to avoid exhausting memory", call. = FALSE ) guess_max <- max_limit } guess_max } guess_types <- function(datasource, tokenizer, locale, guess_max = 1000, max_limit = .Machine$integer.max %/% 100) { guess_max <- check_guess_max(guess_max, max_limit) guess_types_(datasource, tokenizer, locale, n = guess_max) } guess_header <- function(datasource, tokenizer, locale = default_locale()) { guess_header_(datasource, tokenizer, locale) } readr/R/POSIXct.R0000644000176200001440000000014214152512262013121 0ustar liggesusersPOSIXct <- function(x, tz = "UTC") { structure(x, class = c("POSIXct", "POSIXt"), tzone = tz) } readr/R/write.R0000644000176200001440000004044214304131171013024 0ustar liggesusers#' Write a data frame to a delimited file #' #' The `write_*()` family of functions are an improvement over analogous functions such #' as [write.csv()] because they are approximately twice as fast. Unlike [write.csv()], #' these functions do not include row names as a column in the written file. #' A generic function, `output_column()`, is applied to each variable #' to coerce columns to suitable output. #' #' @section Output: #' Factors are coerced to character. Doubles are formatted to a decimal string #' using the grisu3 algorithm. `POSIXct` values are formatted as ISO8601 with a #' UTC timezone. *Note: `POSIXct` objects in local or non-UTC timezones will be #' converted to UTC time before writing.* #' #' All columns are encoded as UTF-8. `write_excel_csv()` and `write_excel_csv2()` also include a #' \href{https://en.wikipedia.org/wiki/Byte_order_mark}{UTF-8 Byte order mark} #' which indicates to Excel the csv is UTF-8 encoded. #' #' `write_excel_csv2()` and `write_csv2()` were created to allow users with #' different locale settings to save .csv files using their default settings #' (e.g. `;` as the column separator and `,` as the decimal separator). #' This is common in some European countries. #' #' Values are only quoted if they contain a comma, quote or newline. #' #' The `write_*()` functions will automatically compress outputs if an appropriate extension is given. #' Three extensions are currently supported: `.gz` for gzip compression, `.bz2` for bzip2 compression and #' `.xz` for lzma compression. See the examples for more information. #' #' @param x A data frame or tibble to write to disk. #' @param file File or connection to write to. #' @param append If `FALSE`, will overwrite existing file. If `TRUE`, #' will append to existing file. In both cases, if the file does not exist a new #' file is created. #' @param col_names If `FALSE`, column names will not be included at the top of the file. If `TRUE`, #' column names will be included. If not specified, `col_names` will take the opposite value given to `append`. #' @param delim Delimiter used to separate values. Defaults to `" "` for `write_delim()`, `","` for `write_excel_csv()` and #' `";"` for `write_excel_csv2()`. Must be a single character. #' @param na String used for missing values. Defaults to NA. Missing values #' will never be quoted; strings with the same value as `na` will #' always be quoted. #' @param quote_escape `r lifecycle::badge("deprecated")` Use the `escape` #' argument instead. #' @param eol The end of line character to use. Most commonly either `"\n"` for #' Unix style newlines, or `"\r\n"` for Windows style newlines. #' @param path `r lifecycle::badge("deprecated")` Use the `file` argument #' instead.
#' @return `write_*()` returns the input `x` invisibly. #' @inheritParams vroom::vroom_write #' @inheritParams read_delim #' @references Florian Loitsch, Printing Floating-Point Numbers Quickly and #' Accurately with Integers, PLDI '10, #' #' @export #' @examples #' \dontshow{ #' .old_wd <- setwd(tempdir()) #' } #' # If only a file name is specified, write_()* will write #' # the file to the current working directory. #' write_csv(mtcars, "mtcars.csv") #' write_tsv(mtcars, "mtcars.tsv") #' #' # If you add an extension to the file name, write_()* will #' # automatically compress the output. #' write_tsv(mtcars, "mtcars.tsv.gz") #' write_tsv(mtcars, "mtcars.tsv.bz2") #' write_tsv(mtcars, "mtcars.tsv.xz") #' \dontshow{ #' setwd(.old_wd) #' } write_delim <- function(x, file, delim = " ", na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated()) { if (is_present(path)) { deprecate_warn("1.4.0", "write_delim(path = )", "write_delim(file = )") file <- path } if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } stopifnot(is.data.frame(x)) check_column_types(x) x_out <- x x[] <- lapply(x, output_column) if (edition_first()) { stream_delim(x, file, delim = delim, col_names = col_names, append = append, na = na, quote_escape = escape, eol = eol ) return(invisible(x_out)) } vroom::vroom_write(x, file, delim = delim, col_names = col_names, append = append, na = na, eol = eol, quote = quote, escape = escape, num_threads = num_threads, progress = progress ) invisible(x_out) } #' @rdname write_delim #' @export write_csv <- function(x, file, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated()) { if (is_present(path)) { deprecate_warn("1.4.0", "write_csv(path = )", "write_csv(file = )") file <- path } if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } write_delim(x, file, delim = ",", na = na, append = append, col_names = col_names, quote = quote, escape = escape, eol = eol, num_threads = num_threads, progress = progress ) } #' @rdname write_delim #' @export write_csv2 <- function(x, file, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated()) { if (is_present(path)) { deprecate_warn("1.4.0", "write_csv2(path = )", "write_csv2(file = )") file <- path } if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } x_out <- x x <- change_decimal_separator(x, decimal_mark = ",") write_delim(x, file, delim = ";", na = na, append = append, col_names = col_names, quote = quote, escape = escape, eol = eol, num_threads = num_threads, progress = progress ) invisible(x_out) } #' @rdname write_delim #' @export write_excel_csv <- function(x, file, na = "NA", append = FALSE, col_names = !append, delim = ",", quote = "all", escape = c("double", "backslash", "none"), eol = "\n", num_threads = readr_threads(), 
progress = show_progress(), path = deprecated(), quote_escape = deprecated()) { if (is_present(path)) { deprecate_warn("1.4.0", "write_excel_csv(path = )", "write_excel_csv(file = )") file <- path } if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } stopifnot(is.data.frame(x)) check_column_types(x) x_out <- x datetime_cols <- vapply(x, inherits, logical(1), "POSIXt") x[datetime_cols] <- lapply(x[datetime_cols], format, "%Y/%m/%d %H:%M:%S") x[] <- lapply(x, output_column) if (edition_first()) { stream_delim(x, file, delim, col_names = col_names, append = append, na = na, bom = !append, quote_escape = escape, eol = eol ) return(invisible(x_out)) } vroom::vroom_write(x, file, delim, col_names = col_names, append = append, na = na, bom = !append, quote = quote, escape = escape, eol = eol, num_threads = num_threads, progress = progress ) invisible(x_out) } #' @rdname write_delim #' @export write_excel_csv2 <- function(x, file, na = "NA", append = FALSE, col_names = !append, delim = ";", quote = "all", escape = c("double", "backslash", "none"), eol = "\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated()) { if (is_present(path)) { deprecate_warn("1.4.0", "write_excel_csv2(path = )", "write_excel_csv2(file = )") file <- path } if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } stopifnot(is.data.frame(x)) check_column_types(x) x_out <- x x <- change_decimal_separator(x, decimal_mark = ",") datetime_cols <- vapply(x, inherits, logical(1), "POSIXt") x[datetime_cols] <- lapply(x[datetime_cols], format, "%Y/%m/%d %H:%M:%S") x[] <- lapply(x, output_column) write_excel_csv(x, file, na, append, col_names, delim, quote = quote, escape = escape, eol = eol, num_threads = num_threads, progress = progress ) invisible(x_out) } #' @rdname write_delim #' @export write_tsv <- function(x, file, na = "NA", append = FALSE, col_names = !append, quote = "none", escape = c("double", "backslash", "none"), eol = "\n", num_threads = readr_threads(), progress = show_progress(), path = deprecated(), quote_escape = deprecated()) { if (is_present(path)) { deprecate_warn("1.4.0", "write_tsv(path = )", "write_tsv(file = )") file <- path } if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } write_delim(x, file, delim = "\t", na = na, append = append, col_names = col_names, quote = quote, escape = escape, eol = eol, num_threads = num_threads, progress = progress ) } #' Convert a data frame to a delimited string #' #' These functions are equivalent to [write_csv()] etc., but instead #' of writing to disk, they return a string. #' #' @return A string. #' @inheritSection write_delim Output #' @inheritParams write_delim #' @param x A data frame. 
#' @inherit write_delim references #' @examples #' # format_()* functions are useful for testing and reprexes #' cat(format_csv(mtcars)) #' cat(format_tsv(mtcars)) #' cat(format_delim(mtcars, ";")) #' #' # Specifying missing values #' df <- data.frame(x = c(1, NA, 3)) #' format_csv(df, na = "missing") #' #' # Quotes are automatically added as needed #' df <- data.frame(x = c("a ", '"', ",", "\n")) #' cat(format_csv(df)) #' @export format_delim <- function(x, delim, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\n", quote_escape = deprecated()) { stopifnot(is.data.frame(x)) check_column_types(x) if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } x[] <- lapply(x, output_column) if (edition_first()) { res <- stream_delim(df = x, file = NULL, delim = delim, col_names = col_names, append = append, na = na, quote_escape = escape, eol = eol) Encoding(res) <- "UTF-8" return(res) } res <- vroom::vroom_format(x, delim = delim, eol = eol, col_names = col_names, na = na, quote = quote, escape = escape) Encoding(res) <- "UTF-8" res } #' @export #' @rdname format_delim format_csv <- function(x, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\n", quote_escape = deprecated()) { if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } format_delim(x, delim = ",", na = na, append = append, col_names = col_names, eol = eol, quote = quote, escape = escape) } #' @export #' @rdname format_delim format_csv2 <- function(x, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\n", quote_escape = deprecated()) { if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } x <- change_decimal_separator(x, decimal_mark = ",") format_delim(x, delim = ";", na = na, append = append, col_names = col_names, eol = eol, quote = quote, escape = escape) } #' @export #' @rdname format_delim format_tsv <- function(x, na = "NA", append = FALSE, col_names = !append, quote = c("needed", "all", "none"), escape = c("double", "backslash", "none"), eol = "\n", quote_escape = deprecated()) { if (is_present(quote_escape)) { deprecate_soft("2.0.0", "write_delim(quote_escape = )", "write_delim(escape = )") escape <- quote_escape } format_delim(x, delim = "\t", na = na, append = append, col_names = col_names, eol = eol, quote = quote, escape = escape) } #' Preprocess column for output #' #' This is a generic function that is applied to each column before it is saved #' to disk. It provides a hook for S3 classes that need special handling. #' #' @keywords internal #' @param x A vector #' @export #' @examples #' # Most columns are not altered, but POSIXct are converted to ISO8601.
#' x <- parse_datetime("2016-01-01") #' str(output_column(x)) output_column <- function(x, name) { UseMethod("output_column") } #' @export output_column.default <- function(x, name) { if (!is.object(x) || "AsIs" %in% class(x)) { return(x) } as.character(x) } #' @export output_column.double <- function(x, name) { x } #' @export output_column.POSIXt <- function(x, name) { format(x, "%Y-%m-%dT%H:%M:%OSZ", tz = "UTC", justify = "none") } stream_delim <- function(df, file, append = FALSE, bom = FALSE, ..., quote_escape, eol) { quote_escape <- standardise_escape(quote_escape) file <- standardise_path(file, input = FALSE) if (is.null(file)) { out_file <- tempfile() con <- file(out_file, "wb") on.exit( { try(close(con), silent = TRUE) unlink(out_file) }, add = TRUE ) stream_delim_(df, con, ..., bom = bom, quote_escape = quote_escape, eol = eol) close(con) return(read_file(out_file)) } if (inherits(file, "connection") && !isOpen(file)) { on.exit(close(file), add = TRUE) if (isTRUE(append)) { open(file, "ab") } else { open(file, "wb") } } stream_delim_(df, file, ..., bom = bom, quote_escape = quote_escape, eol = eol) } change_decimal_separator <- function(x, decimal_mark = ",") { stopifnot(is.data.frame(x)) numeric_cols <- vapply(x, is.numeric, logical(1)) format_seps <- function(x, decimal_mark) { nas <- is.na(x) x <- format(x, decimal.mark = decimal_mark, trim = TRUE, digits = 15) x[nas] <- NA_character_ x } x[numeric_cols] <- lapply(x[numeric_cols], format_seps, decimal_mark) x } standardise_escape <- function(x) { if (identical(x, FALSE)) { x <- "none" } escape_types <- c("double" = 1L, "backslash" = 2L, "none" = 3L) escape <- match.arg(tolower(x), names(escape_types)) escape_types[escape] } check_column_types <- function(x) { is_bad_column <- vapply(x, function(xx) !is.null(dim(xx)), logical(1)) if (any(is_bad_column)) { cli_block(type = rlang::abort, { cli::cli_text("`x` must not contain list or matrix columns:") cli::cli_alert_danger("invalid columns at index(es): {paste0(which(is_bad_column), collapse = '\n')}") }) } } readr/R/read_log.R0000644000176200001440000000151514174704674013467 0ustar liggesusers#' Read common/combined log file into a tibble #' #' This is a fairly standard format for log files - it uses both quotes #' and square brackets for quoting, and there may be literal quotes embedded #' in a quoted string. The dash, "-", is used for missing values. #' #' @inheritParams read_delim #' @export #' @examples #' read_log(readr_example("example.log")) read_log <- function(file, col_names = FALSE, col_types = NULL, trim_ws = TRUE, skip = 0, n_max = Inf, show_col_types = should_show_types(), progress = show_progress()) { tokenizer <- tokenizer_log(trim_ws = trim_ws) read_delimited(file, tokenizer, col_names = col_names, col_types = col_types, skip = skip, n_max = n_max, progress = progress, show_col_types = show_col_types ) } readr/R/date-symbols.R0000644000176200001440000000452714304131171014301 0ustar liggesusers#' Create or retrieve date names #' #' When parsing dates, you often need to know how days of the week and #' months are represented as text. This pair of functions allows you to either #' create your own, or retrieve from a standard list. The standard list is #' derived from ICU (`http://site.icu-project.org`) via the stringi package. #' #' @param mon,mon_ab Full and abbreviated month names. #' @param day,day_ab Full and abbreviated week day names. Starts with Sunday. #' @param am_pm Names used for AM and PM.
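#' @return A "date_names" object: a list with components `mon`, `mon_ab`,
#'   `day`, `day_ab` and `am_pm`, all encoded as UTF-8.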
#' @export #' @examples #' date_names_lang("en") #' date_names_lang("ko") #' date_names_lang("fr") date_names <- function(mon, mon_ab = mon, day, day_ab = day, am_pm = c("AM", "PM")) { stopifnot(is.character(mon), length(mon) == 12) stopifnot(is.character(mon_ab), length(mon_ab) == 12) stopifnot(is.character(day), length(day) == 7) stopifnot(is.character(day_ab), length(day_ab) == 7) structure( list( mon = enc2utf8(mon), mon_ab = enc2utf8(mon_ab), day = enc2utf8(day), day_ab = enc2utf8(day_ab), am_pm = enc2utf8(am_pm) ), class = "date_names" ) } #' @export #' @rdname date_names #' @param language A BCP 47 locale, made up of a language and a region, #' e.g. `"en"` for American English. See `date_names_langs()` #' for a complete list of available locales. date_names_lang <- function(language) { check_string(language) symbols <- date_symbols[[language]] if (is.null(symbols)) { stop("Unknown language '", language, "'", call. = FALSE) } symbols } #' @export #' @rdname date_names date_names_langs <- function() { names(date_symbols) } #' @export print.date_names <- function(x, ...) { cat("<date_names>\n") if (identical(x$day, x$day_ab)) { day <- paste0(x$day, collapse = ", ") } else { day <- paste0(x$day, " (", x$day_ab, ")", collapse = ", ") } if (identical(x$mon, x$mon_ab)) { mon <- paste0(x$mon, collapse = ", ") } else { mon <- paste0(x$mon, " (", x$mon_ab, ")", collapse = ", ") } am_pm <- paste0(x$am_pm, collapse = "/") cat_wrap("Days: ", day) cat_wrap("Months: ", mon) cat_wrap("AM/PM: ", am_pm) } is.date_names <- function(x) inherits(x, "date_names") cat_wrap <- function(header, body) { body <- strwrap(body, exdent = nchar(header)) cat(header, paste(body, collapse = "\n"), "\n", sep = "") } readr/R/example.R0000644000176200001440000000100314174357220013325 0ustar liggesusers#' Get path to readr example #' #' readr comes bundled with a number of sample files in its `inst/extdata` #' directory. This function makes them easy to access. #' #' @param file Name of file. If `NULL`, the example files will be listed. #' @export #' @examples #' readr_example() #' readr_example("challenge.csv") readr_example <- function(file = NULL) { if (is.null(file)) { dir(system.file("extdata", package = "readr")) } else { system.file("extdata", file, package = "readr", mustWork = TRUE) } } readr/R/read_lines_chunked.R0000644000176200001440000000217014174704674015517 0ustar liggesusers#' Read lines from a file or string by chunk.
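#'
#' Each chunk of lines is passed to `callback` together with its starting
#' position. A minimal illustrative example (the tiny `chunk_size` is only
#' for demonstration):
#'
#' @examples
#' counts <- integer()
#' read_lines_chunked(
#'   readr_example("mtcars.csv"),
#'   SideEffectChunkCallback$new(function(x, pos) {
#'     counts <<- c(counts, length(x))
#'   }),
#'   chunk_size = 10
#' )
#' counts # lines seen in each chunk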
#' #' @inheritParams datasource #' @inheritParams read_delim_chunked #' @keywords internal #' @family chunked #' @export read_lines_chunked <- function(file, callback, chunk_size = 10000, skip = 0, locale = default_locale(), na = character(), progress = show_progress()) { if (empty_file(file)) { return(character()) } ds <- datasource(file, skip = skip, skip_empty_rows = FALSE) callback <- as_chunk_callback(callback) on.exit(callback$finally(), add = TRUE) read_lines_chunked_(ds, locale, na, chunk_size, callback, FALSE, progress) return(callback$result()) } #' @export #' @rdname read_lines_chunked read_lines_raw_chunked <- function(file, callback, chunk_size = 10000, skip = 0, progress = show_progress()) { if (empty_file(file)) { return(character()) } ds <- datasource(file, skip = skip, skip_empty_rows = FALSE) callback <- as_chunk_callback(callback) on.exit(callback$finally(), add = TRUE) read_lines_raw_chunked_(ds, chunk_size, callback, progress) return(callback$result()) } readr/R/melt_delim_chunked.R0000644000176200001440000000641014174704674015526 0ustar liggesusers# Generates the chunked definition from the melt_* definition generate_melt_chunked_fun <- function(x) { # nocov start args <- formals(x) # Remove n_max argument args <- args[names(args) != "n_max"] args <- append(args, alist(callback = , chunk_size = 10000), 1) b <- as.list(body(x)) # Change melt_delimited to melt_delimited_chunked b[[length(b)]][[1]] <- quote(melt_delimited_chunked) call_args <- as.list(b[[length(b)]]) # Remove the n_max argument call_args <- call_args[!names(call_args) == "n_max"] # add the callback and chunk_size arguments b[[length(b)]] <- as.call(append(call_args, alist(callback = callback, chunk_size = chunk_size), 2)) body(x) <- as.call(b) formals(x) <- args x } # nocov end # Generates the modified melt_delimited function generate_melt_delimited_chunked <- function(x) { # nocov start args <- formals(x) args <- args[names(args) != "n_max"] args <- append(args, alist(callback = , chunk_size = 10000), 1) b <- as.list(body(x)) for (i in seq_along(b)) { if (is.call(b[[i]]) && identical(b[[i]][[1]], as.symbol("<-")) && is.call(b[[i]][[3]]) && identical(b[[i]][[3]][[1]], quote(melt_tokens))) { # Change melt_tokens() to melt_tokens_chunked b[[i]][[3]][[1]] <- quote(melt_tokens_chunked) chunked_call <- as.list(b[[i]][[3]]) # Remove the n_max argument chunked_call <- chunked_call[!names(chunked_call) == "n_max"] # Add the callback and chunk_size arguments b[[i]] <- as.call(append(chunked_call, alist(callback = callback, chunk_size = chunk_size), 2)) # Remove additional calls b <- b[-seq(i + 1, length(b))] body(x) <- as.call(b) formals(x) <- args return(x) } } x } # nocov end melt_tokens_chunked <- function(data, callback, chunk_size, tokenizer, locale_, progress) { callback <- as_chunk_callback(callback) on.exit(callback$finally(), add = TRUE) melt_tokens_chunked_( data, callback, chunk_size, tokenizer, col_spec_melt, locale_, progress ) return(callback$result()) } melt_delimited_chunked <- generate_melt_delimited_chunked(melt_delimited) #' Melt a delimited file by chunks #' #' For certain non-rectangular data formats, it can be useful to parse the data #' into a melted format where each row represents a single token. #' #' `melt_delim_chunked()` and the specialisations `melt_csv_chunked()`, #' `melt_csv2_chunked()` and `melt_tsv_chunked()` read files by a chunk of rows #' at a time, executing a given function on one chunk before reading the next. 
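#'
#' The chunked variants are generated programmatically from the
#' corresponding `melt_*()` readers by the generator functions above, which
#' swap the `n_max` argument for `callback` and `chunk_size`.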
#' #' @inheritParams read_delim_chunked #' @param callback A callback function to call on each chunk. #' @param chunk_size The number of rows to include in each chunk. #' @keywords internal #' @family chunked #' @export #' @examples #' # Keep only the integer tokens #' f <- function(x, pos) subset(x, data_type == "integer") #' melt_csv_chunked(readr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) melt_delim_chunked <- generate_melt_chunked_fun(melt_delim) #' @rdname melt_delim_chunked #' @export melt_csv_chunked <- generate_melt_chunked_fun(melt_csv) #' @rdname melt_delim_chunked #' @export melt_csv2_chunked <- generate_melt_chunked_fun(melt_csv2) #' @rdname melt_delim_chunked #' @export melt_tsv_chunked <- generate_melt_chunked_fun(melt_tsv) readr/R/count_fields.R0000644000176200001440000000120614152512262014350 0ustar liggesusers#' Count the number of fields in each line of a file #' #' This is useful for diagnosing problems with functions that fail #' to parse correctly. #' #' @inheritParams datasource #' @param tokenizer A tokenizer that specifies how to break the `file` #' up into fields, e.g., [tokenizer_csv()], #' [tokenizer_fwf()]. #' @param n_max Optionally, maximum number of rows to count fields for. #' @export #' @examples #' count_fields(readr_example("mtcars.csv"), tokenizer_csv()) count_fields <- function(file, tokenizer, skip = 0, n_max = -1L) { ds <- datasource(file, skip = skip, skip_empty_rows = FALSE) count_fields_(ds, tokenizer, n_max) } readr/R/cpp11.R0000644000176200001440000000753514547554550012637 0ustar liggesusers# Generated by cpp11: do not edit by hand collectorGuess <- function(input, locale_, guessInteger) { .Call(`_readr_collectorGuess`, input, locale_, guessInteger) } read_connection_ <- function(con, filename, chunk_size) { .Call(`_readr_read_connection_`, con, filename, chunk_size) } utctime_ <- function(year, month, day, hour, min, sec, psec) { .Call(`_readr_utctime_`, year, month, day, hour, min, sec, psec) } dim_tokens_ <- function(sourceSpec, tokenizerSpec) { .Call(`_readr_dim_tokens_`, sourceSpec, tokenizerSpec) } count_fields_ <- function(sourceSpec, tokenizerSpec, n_max) { .Call(`_readr_count_fields_`, sourceSpec, tokenizerSpec, n_max) } guess_header_ <- function(sourceSpec, tokenizerSpec, locale_) { .Call(`_readr_guess_header_`, sourceSpec, tokenizerSpec, locale_) } tokenize_ <- function(sourceSpec, tokenizerSpec, n_max) { .Call(`_readr_tokenize_`, sourceSpec, tokenizerSpec, n_max) } parse_vector_ <- function(x, collectorSpec, locale_, na, trim_ws) { .Call(`_readr_parse_vector_`, x, collectorSpec, locale_, na, trim_ws) } read_file_ <- function(sourceSpec, locale_) { .Call(`_readr_read_file_`, sourceSpec, locale_) } read_file_raw_ <- function(sourceSpec) { .Call(`_readr_read_file_raw_`, sourceSpec) } read_lines_ <- function(sourceSpec, locale_, na, n_max, skip_empty_rows, progress) { .Call(`_readr_read_lines_`, sourceSpec, locale_, na, n_max, skip_empty_rows, progress) } read_lines_chunked_ <- function(sourceSpec, locale_, na, chunkSize, callback, skip_empty_rows, progress) { invisible(.Call(`_readr_read_lines_chunked_`, sourceSpec, locale_, na, chunkSize, callback, skip_empty_rows, progress)) } read_lines_raw_ <- function(sourceSpec, n_max, progress) { .Call(`_readr_read_lines_raw_`, sourceSpec, n_max, progress) } read_lines_raw_chunked_ <- function(sourceSpec, chunkSize, callback, progress) { invisible(.Call(`_readr_read_lines_raw_chunked_`, sourceSpec, chunkSize, callback, progress)) } read_tokens_ <- function(sourceSpec, tokenizerSpec,
colSpecs, colNames, locale_, n_max, progress) { .Call(`_readr_read_tokens_`, sourceSpec, tokenizerSpec, colSpecs, colNames, locale_, n_max, progress) } read_tokens_chunked_ <- function(sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, colNames, locale_, spec, progress) { invisible(.Call(`_readr_read_tokens_chunked_`, sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, colNames, locale_, spec, progress)) } melt_tokens_ <- function(sourceSpec, tokenizerSpec, colSpecs, locale_, n_max, progress) { .Call(`_readr_melt_tokens_`, sourceSpec, tokenizerSpec, colSpecs, locale_, n_max, progress) } melt_tokens_chunked_ <- function(sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, locale_, progress) { invisible(.Call(`_readr_melt_tokens_chunked_`, sourceSpec, callback, chunkSize, tokenizerSpec, colSpecs, locale_, progress)) } guess_types_ <- function(sourceSpec, tokenizerSpec, locale_, n) { .Call(`_readr_guess_types_`, sourceSpec, tokenizerSpec, locale_, n) } whitespaceColumns <- function(sourceSpec, n, comment) { .Call(`_readr_whitespaceColumns`, sourceSpec, n, comment) } type_convert_col <- function(x, spec, locale_, col, na, trim_ws) { .Call(`_readr_type_convert_col`, x, spec, locale_, col, na, trim_ws) } stream_delim_ <- function(df, connection, delim, na, col_names, bom, quote_escape, eol) { invisible(.Call(`_readr_stream_delim_`, df, connection, delim, na, col_names, bom, quote_escape, eol)) } write_lines_ <- function(lines, connection, na, sep) { invisible(.Call(`_readr_write_lines_`, lines, connection, na, sep)) } write_lines_raw_ <- function(x, connection, sep) { invisible(.Call(`_readr_write_lines_raw_`, x, connection, sep)) } write_file_ <- function(x, connection) { invisible(.Call(`_readr_write_file_`, x, connection)) } write_file_raw_ <- function(x, connection) { invisible(.Call(`_readr_write_file_raw_`, x, connection)) } readr/R/type_convert.R0000644000176200001440000000630114174704674014432 0ustar liggesusers#' Re-convert character columns in existing data frame #' #' This is useful if you need to do some manual munging - you can read the #' columns in as character, clean them up with (e.g.) regular expressions and #' then let readr take another stab at parsing it. The name is a homage to #' the base [utils::type.convert()]. #' #' @param df A data frame. #' @param col_types One of `NULL`, a [cols()] specification, or #' a string. See `vignette("readr")` for more details. #' #' If `NULL`, column types will be imputed using all rows. #' @inheritParams tokenizer_delim #' @inheritParams read_delim #' @inheritParams guess_parser #' @note `type_convert()` removes a 'spec' attribute, #' because it likely modifies the column data types. #' (see [spec()] for more information about column specifications).
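#' Columns of a type other than character are passed through unchanged.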
#' @export #' @examples #' df <- data.frame( #' x = as.character(runif(10)), #' y = as.character(sample(10)), #' stringsAsFactors = FALSE #' ) #' str(df) #' str(type_convert(df)) #' #' df <- data.frame(x = c("NA", "10"), stringsAsFactors = FALSE) #' str(type_convert(df)) #' #' # Type convert can be used to infer types from an entire dataset #' #' # first read the data as character #' data <- read_csv(readr_example("mtcars.csv"), #' col_types = list(.default = col_character()) #' ) #' str(data) #' # Then convert it with type_convert #' type_convert(data) type_convert <- function(df, col_types = NULL, na = c("", "NA"), trim_ws = TRUE, locale = default_locale(), guess_integer = FALSE) { stopifnot(is.data.frame(df)) is_character <- vapply(df, is.character, logical(1)) if (!any(is_character)) { warning("`type_convert()` only converts columns of type 'character'.\n- `df` has no columns of type 'character'", call. = FALSE) } char_cols <- df[is_character] col_types <- keep_character_col_types(df, col_types) guesses <- lapply( char_cols, guess_parser, locale = locale, na = na, guess_integer = guess_integer ) specs <- col_spec_standardise( col_types = col_types, col_names = names(char_cols), guessed_types = guesses ) if (is.null(col_types) && !is_testing()) { show_cols_spec(specs) } df[is_character] <- lapply(seq_along(char_cols), function(i) { type_convert_col(char_cols[[i]], specs$cols[[i]], which(is_character)[i], locale_ = locale, na = na, trim_ws = trim_ws ) }) attr(df, "spec") <- NULL df } keep_character_col_types <- function(df, col_types) { if (is.null(col_types)) { return(col_types) } is_character <- vapply(df, is.character, logical(1)) if (is.character(col_types)) { if (length(col_types) != 1) { stop("`col_types` must be a single string.", call. = FALSE) } if (nchar(col_types) != length(df)) { stop( "`df` and `col_types` must have consistent lengths:\n", " * `df` has length ", length(df), "\n", " * `col_types` has length ", nchar(col_types), call. = FALSE ) } idx <- which(is_character) col_types <- paste(substring(col_types, idx, idx), collapse = "") return(col_types) } char_cols <- names(df)[is_character] col_types$cols <- col_types$cols[names(col_types$cols) %in% char_cols] col_types } readr/R/read_fwf.R0000644000176200001440000001613014304131171013444 0ustar liggesusers #' Read a fixed width file into a tibble #' #' A fixed width file can be a very compact representation of numeric data. #' It's also very fast to parse, because every field is in the same place in #' every line. Unfortunately, it's painful to parse because you need to #' describe the length of every field. Readr aims to make it as easy as possible #' by providing a number of different ways to describe the field structure. #' - [fwf_empty()] - Guesses based on the positions of empty columns. #' - [fwf_widths()] - Supply the widths of the columns. #' - [fwf_positions()] - Supply paired vectors of start and end positions. #' - [fwf_cols()] - Supply named arguments of paired start and end positions or column widths. #' #' @seealso [read_table()] to read fixed width files where each #' column is separated by whitespace. #' #' @section Second edition changes: #' Comments are no longer looked for anywhere in the file. #' They are now only ignored at the start of a line. #' #' @inheritParams datasource #' @inheritParams tokenizer_fwf #' @inheritParams read_delim #' @param col_positions Column positions, as created by [fwf_empty()], #' [fwf_widths()] or [fwf_positions()]. To read in only selected fields, #' use [fwf_positions()]. 
If the width of the last column is variable (a #' ragged fwf file), supply the last end position as NA. #' @export #' @examples #' fwf_sample <- readr_example("fwf-sample.txt") #' writeLines(read_lines(fwf_sample)) #' #' # You can specify column positions in several ways: #' # 1. Guess based on position of empty columns #' read_fwf(fwf_sample, fwf_empty(fwf_sample, col_names = c("first", "last", "state", "ssn"))) #' # 2. A vector of field widths #' read_fwf(fwf_sample, fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))) #' # 3. Paired vectors of start and end positions #' read_fwf(fwf_sample, fwf_positions(c(1, 30), c(20, 42), c("name", "ssn"))) #' # 4. Named arguments with start and end positions #' read_fwf(fwf_sample, fwf_cols(name = c(1, 20), ssn = c(30, 42))) #' # 5. Named arguments with column widths #' read_fwf(fwf_sample, fwf_cols(name = 20, state = 10, ssn = 12)) read_fwf <- function(file, col_positions = fwf_empty(file, skip, n = guess_max), col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(n_max, 1000), progress = show_progress(), name_repair = "unique", num_threads = readr_threads(), show_col_types = should_show_types(), lazy = should_read_lazy(), skip_empty_rows = TRUE) { if (edition_first()) { ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) if (inherits(ds, "source_file") && empty_file(file)) { return(tibble::tibble()) } tokenizer <- tokenizer_fwf(col_positions$begin, col_positions$end, na = na, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows) spec <- col_spec_standardise( file, skip = skip, guess_max = guess_max, tokenizer = tokenizer, locale = locale, col_names = col_positions$col_names, col_types = col_types, drop_skipped_names = TRUE ) if (is.null(col_types) && !inherits(ds, "source_string") && !is_testing()) { show_cols_spec(spec) } out <- read_tokens(datasource(file, skip = spec$skip, skip_empty_rows = skip_empty_rows), tokenizer, spec$cols, names(spec$cols), locale_ = locale, n_max = if (n_max == Inf) -1 else n_max, progress = progress ) out <- name_problems(out, names(spec$cols), source_name(file)) attr(out, "spec") <- spec return(warn_problems(out)) } vroom::vroom_fwf(file, col_positions = col_positions, col_types = col_types, col_select = {{ col_select }}, id = id, .name_repair = name_repair, locale = locale, na = na, comment = comment, skip_empty_rows = skip_empty_rows, trim_ws = trim_ws, skip = skip, n_max = n_max, guess_max = guess_max, show_col_types = show_col_types, progress = progress, altrep = lazy, num_threads = num_threads ) } #' @rdname read_fwf #' @export #' @param n Number of lines the tokenizer will read to determine file structure. By default #' it is set to 100. fwf_empty <- function(file, skip = 0, skip_empty_rows = FALSE, col_names = NULL, comment = "", n = 100L) { if (edition_first()) { ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows) out <- whitespaceColumns(ds, comment = comment, n = n) out$end[length(out$end)] <- NA col_names <- fwf_col_names(col_names, length(out$begin)) out$col_names <- col_names return(out) } if (!missing(skip_empty_rows)) { lifecycle::deprecate_soft("2.0.0", "readr::fwf_empty(skip_empty_rows = )") } vroom::fwf_empty(file = file, skip = skip, col_names = col_names, comment = comment, n = n) } #' @rdname read_fwf #' @export #' @param widths Width of each field. Use NA as width of last field when #' reading a ragged fwf file. 
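For example (a sketch using the widths from this page's examples): #' `fwf_widths(c(20, 10, 12), c("name", "state", "ssn"))` describes three fields that are #' 20, 10, and 12 characters wide, while `fwf_widths(c(20, 10, NA), c("name", "state", "ssn"))` #' marks the final field as ragged.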
#' @param col_names Either NULL, or a character vector of column names. fwf_widths <- function(widths, col_names = NULL) { if (edition_first()) { pos <- cumsum(c(1L, abs(widths))) return(fwf_positions(pos[-length(pos)], pos[-1] - 1L, col_names)) } vroom::fwf_widths(widths = widths, col_names = col_names) } #' @rdname read_fwf #' @export #' @param start,end Starting and ending (inclusive) positions of each field. #' Use NA as the last end position when reading a ragged fwf file. fwf_positions <- function(start, end = NULL, col_names = NULL) { if (edition_first()) { stopifnot(length(start) == length(end)) col_names <- fwf_col_names(col_names, length(start)) return(tibble( begin = start - 1L, end = end, # -1 to change to 0 offset, +1 to be exclusive, col_names = as.character(col_names) )) } vroom::fwf_positions(start = start, end = end, col_names = col_names) } #' @rdname read_fwf #' @export #' @param ... If the first element is a data frame, #' then it must have all numeric columns and either one or two rows. #' The column names are the variable names. The column values are the #' variable widths if there is one row, and the variable start and end #' positions if there are two. The elements of `...` are used to construct a data frame #' with one or two rows as above. fwf_cols <- function(...) { if (edition_first()) { x <- lapply(list(...), as.integer) names(x) <- fwf_col_names(names(x), length(x)) x <- tibble::as_tibble(x) if (nrow(x) == 2) { res <- fwf_positions(as.integer(x[1, ]), as.integer(x[2, ]), names(x)) } else if (nrow(x) == 1) { res <- fwf_widths(as.integer(x[1, ]), names(x)) } else { stop("All variables must have either one (width) or two (start, end) values.", call. = FALSE ) } return(res) } vroom::fwf_cols(...) } fwf_col_names <- function(nm, n) { nm <- nm %||% rep("", n) nm_empty <- (nm == "") nm[nm_empty] <- paste0("X", seq_len(n))[nm_empty] nm } readr/R/encoding.R0000644000176200001440000000256714152512262013473 0ustar liggesusers#' Guess encoding of file #' #' Uses [stringi::stri_enc_detect()]: see the documentation there #' for caveats. #' #' @rdname encoding #' @param file A character string specifying an input as described in #' [datasource()], a raw vector, or a list of raw vectors. #' @inheritParams datasource #' @inheritParams read_lines #' @param threshold Only report guesses above this threshold of certainty. #' @return A tibble #' @export #' @examples #' guess_encoding(readr_example("mtcars.csv")) #' guess_encoding(read_lines_raw(readr_example("mtcars.csv"))) #' guess_encoding(read_file_raw(readr_example("mtcars.csv"))) #' #' guess_encoding("a\n\u00b5\u00b5") guess_encoding <- function(file, n_max = 1e4, threshold = 0.20) { if (!requireNamespace("stringi", quietly = TRUE)) { stop("stringi package required for encoding operations", call. = FALSE) } if (is.character(file)) { lines <- unlist(read_lines_raw(file, n_max = n_max)) } else if (is.raw(file)) { lines <- file } else if (is.list(file)) { lines <- unlist(file) } else { stop("Unknown input to `file`", call.
= FALSE) } if (stringi::stri_enc_isascii(lines)) { return(tibble::tibble(encoding = "ASCII", confidence = 1)) } guess <- stringi::stri_enc_detect(lines) df <- tibble::as_tibble(guess[[1]]) names(df) <- tolower(names(df)) df[df$confidence > threshold, c("encoding", "confidence")] } readr/R/read_delim_chunked.R0000644000176200001440000001453114304131171015460 0ustar liggesusers# Generates the modified read_delimited function generate_read_delimited_chunked <- function(x) { args <- formals(x) args <- args[names(args) != "n_max"] args <- append(args, alist(callback = , chunk_size = ), 1) # Change guess_max default to use chunk_size args$guess_max[[3]] <- quote(chunk_size) b <- as.list(body(x)) for (i in seq_along(b)) { if (is.call(b[[i]]) && identical(b[[i]][[1]], as.symbol("<-")) && is.call(b[[i]][[3]]) && identical(b[[i]][[3]][[1]], quote(read_tokens))) { # Change read_tokens() to read_tokens_chunked b[[i]][[3]][[1]] <- quote(read_tokens_chunked) chunked_call <- as.list(b[[i]][[3]]) # Remove the n_max argument chunked_call <- chunked_call[!names(chunked_call) == "n_max"] # Add the callback and chunk_size arguments b[[i]] <- as.call(append(chunked_call, alist(callback = callback, chunk_size = chunk_size, spec = spec), 2)) # Remove additional calls b <- b[-seq(i + 1, length(b))] body(x) <- as.call(b) formals(x) <- args return(x) } } x } read_tokens_chunked <- function(data, callback, chunk_size, tokenizer, col_specs, col_names, locale_, spec, progress) { callback <- as_chunk_callback(callback) on.exit(callback$finally(), add = TRUE) read_tokens_chunked_(data, callback, chunk_size, tokenizer, col_specs, col_names, locale_, spec, progress) return(callback$result()) } utils::globalVariables(c("callback", "chunk_size")) read_delimited_chunked <- generate_read_delimited_chunked(read_delimited) #' Read a delimited file by chunks #' #' @inheritParams read_delim #' @param callback A callback function to call on each chunk #' @param chunk_size The number of rows to include in each chunk #' @keywords internal #' @family chunked #' @export #' @details The number of lines in `file` can exceed the maximum integer value in R (~2 billion). 
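#' #' A callback used only for its side effects can also stop reading early: if the callback #' function returns `FALSE`, no further chunks are read (plain functions are wrapped in a #' [SideEffectChunkCallback]). A minimal sketch, with an illustrative file and chunk size: #' ```r #' stop_early <- function(x, pos) { #' print(pos) #' pos < 11 # once this returns FALSE, remaining chunks are skipped #' } #' read_csv_chunked(readr_example("mtcars.csv"), stop_early, chunk_size = 5) #' ```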
#' @examples #' # Cars with 3 gears #' f <- function(x, pos) subset(x, gear == 3) #' read_csv_chunked(readr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) read_delim_chunked <- function(file, callback, delim = NULL, chunk_size = 10000, quote = '"', escape_backslash = FALSE, escape_double = TRUE, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = FALSE, skip = 0, guess_max = chunk_size, progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE) { tokenizer <- tokenizer_delim(delim, quote = quote, escape_backslash = escape_backslash, escape_double = escape_double, na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) read_delimited_chunked(file, callback = callback, chunk_size = chunk_size, tokenizer = tokenizer, col_names = col_names, col_types = col_types, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, guess_max = guess_max, progress = progress, show_col_types = show_col_types ) } #' @rdname read_delim_chunked #' @export read_csv_chunked <- function(file, callback, chunk_size = 10000, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, guess_max = chunk_size, progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE) { tokenizer <- tokenizer_csv( na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) read_delimited_chunked(file, callback = callback, chunk_size = chunk_size, tokenizer = tokenizer, col_names = col_names, col_types = col_types, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, guess_max = guess_max, progress = progress, show_col_types = show_col_types ) } #' @rdname read_delim_chunked #' @export read_csv2_chunked <- function(file, callback, chunk_size = 10000, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, guess_max = chunk_size, progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE) { tokenizer <- tokenizer_delim( delim = ";", na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) read_delimited_chunked(file, callback = callback, chunk_size = chunk_size, tokenizer = tokenizer, col_names = col_names, col_types = col_types, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, guess_max = guess_max, progress = progress, show_col_types = show_col_types ) } #' @rdname read_delim_chunked #' @export read_tsv_chunked <- function(file, callback, chunk_size = 10000, col_names = TRUE, col_types = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, guess_max = chunk_size, progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE) { tokenizer <- tokenizer_tsv( na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) read_delimited_chunked(file, callback = callback, chunk_size = chunk_size, tokenizer = tokenizer, col_names = col_names, col_types = col_types, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, guess_max = 
guess_max, progress = progress, show_col_types = show_col_types ) } readr/R/rds.R0000644000176200001440000000363614304131171012466 0ustar liggesusers#' Read/write RDS files. #' #' Consistent wrapper around [saveRDS()] and [readRDS()]. #' `write_rds()` does not compress by default as space is generally cheaper #' than time. #' #' @param file The file path to read from/write to. #' @param refhook A function to handle reference objects. #' @export #' @examples #' temp <- tempfile() #' write_rds(mtcars, temp) #' read_rds(temp) #' \dontrun{ #' write_rds(mtcars, "compressed_mtc.rds", "xz", compression = 9L) #' } read_rds <- function(file, refhook = NULL) { con <- file(file) on.exit(close(con)) readRDS(con, refhook = refhook) } #' @param x R object to serialise. #' @param compress Compression method to use: "none", "gz", "bz2", or "xz". #' @param version Serialization format version to be used. The default value is 2 #' as it's compatible with R versions prior to 3.5.0. See [base::saveRDS()] #' for more details. #' @param ... Additional arguments to the connection function. For example, control #' the space-time trade-off of different compression methods with #' `compression`. See [connections()] for more details. #' @param path `r lifecycle::badge("deprecated")` Use the `file` argument #' instead. #' @param text If `TRUE`, a text representation is used, otherwise a binary representation is used. #' @return `write_rds()` returns `x`, invisibly. #' @rdname read_rds #' @export write_rds <- function(x, file, compress = c("none", "gz", "bz2", "xz"), version = 2, refhook = NULL, text = FALSE, path = deprecated(), ...) { if (is_present(path)) { deprecate_warn("1.4.0", "write_rds(path = )", "write_rds(file = )") file <- path } compress <- match.arg(compress) con <- switch(compress, none = file(file, ...), gz = gzfile(file, ...), bz2 = bzfile(file, ...), xz = xzfile(file, ...) ) on.exit(close(con), add = TRUE) saveRDS(x, con, version = version, refhook = refhook, ascii = text) invisible(x) } readr/R/callback.R0000644000176200001440000001141614547552156013450 0ustar liggesusersas_chunk_callback <- function(x) UseMethod("as_chunk_callback") as_chunk_callback.function <- function(x) { SideEffectChunkCallback$new(x) } as_chunk_callback.R6ClassGenerator <- function(x) { as_chunk_callback(x$new()) } as_chunk_callback.ChunkCallback <- function(x) { x } #' Callback classes #' #' These classes are used to define callback behaviors. #' #' \describe{ #' \item{ChunkCallback}{Callback interface definition; all callback functions should inherit from this class.} #' \item{SideEffectChunkCallback}{Callback function that is used only for side effects; no results are returned.} #' \item{DataFrameCallback}{Callback function that combines each result together at the end.} #' \item{AccumulateCallback}{ #' Callback function that accumulates a single result. Requires the parameter `acc` to specify #' the initial value of the accumulator. The parameter `acc` is `NULL` by default.
#' } #' } #' @usage NULL #' @format NULL #' @name callback #' @keywords internal #' @family chunked #' @examples #' ## If given a regular function it is converted to a SideEffectChunkCallback #' #' # view structure of each chunk #' read_lines_chunked(readr_example("mtcars.csv"), str, chunk_size = 5) #' #' # Print starting line of each chunk #' f <- function(x, pos) print(pos) #' read_lines_chunked(readr_example("mtcars.csv"), SideEffectChunkCallback$new(f), chunk_size = 5) #' #' # If combined results are desired you can use the DataFrameCallback #' #' # Cars with 3 gears #' f <- function(x, pos) subset(x, gear == 3) #' read_csv_chunked(readr_example("mtcars.csv"), DataFrameCallback$new(f), chunk_size = 5) #' #' # The ListCallback can be used for more flexible output #' f <- function(x, pos) x$mpg[x$hp > 100] #' read_csv_chunked(readr_example("mtcars.csv"), ListCallback$new(f), chunk_size = 5) #' #' # The AccumulateCallback accumulates results from each chunk #' f <- function(x, pos, acc) sum(x$mpg) + acc #' read_csv_chunked(readr_example("mtcars.csv"), AccumulateCallback$new(f, acc = 0), chunk_size = 5) #' @export ChunkCallback <- R6::R6Class("ChunkCallback", private = list( callback = NULL ), public = list( initialize = function(callback) NULL, receive = function(data, index) NULL, continue = function() TRUE, result = function() NULL, finally = function() NULL ) ) #' @usage NULL #' @format NULL #' @rdname callback #' @export SideEffectChunkCallback <- R6::R6Class("SideEffectChunkCallback", inherit = ChunkCallback, private = list( cancel = FALSE ), public = list( initialize = function(callback) { check_callback_fun(callback) private$callback <- callback }, receive = function(data, index) { result <- private$callback(data, index) private$cancel <- identical(result, FALSE) }, continue = function() { !private$cancel } ) ) #' @usage NULL #' @format NULL #' @rdname callback #' @export DataFrameCallback <- R6::R6Class("DataFrameCallback", inherit = ChunkCallback, private = list( results = list() ), public = list( initialize = function(callback) { private$callback <- callback }, receive = function(data, index) { result <- private$callback(data, index) private$results <- c(private$results, list(result)) }, result = function() { do.call(`rbind`, private$results) }, finally = function() { private$results <- list() } ) ) #' @usage NULL #' @format NULL #' @rdname callback #' @export ListCallback <- R6::R6Class("ListCallback", inherit = ChunkCallback, private = list( results = list() ), public = list( initialize = function(callback) { private$callback <- callback }, receive = function(data, index) { result <- private$callback(data, index) private$results <- c(private$results, list(result)) }, result = function() { private$results }, finally = function() { private$results <- list() } ) ) #' @usage NULL #' @format NULL #' @rdname callback #' @export AccumulateCallback <- R6::R6Class("AccumulateCallback", inherit = ChunkCallback, private = list( acc = NULL ), public = list( initialize = function(callback, acc = NULL) { check_callback_fun(callback, req_args = 3, message = "`callback` must have three or more arguments" ) private$acc <- acc private$callback <- callback }, receive = function(data, index) { private$acc <- private$callback(data, index, private$acc) }, result = function() { private$acc } ) ) check_callback_fun <- function(callback, req_args = 2, message = NULL) { if (is.null(message)) { message <- "`callback` must have two or more arguments" } n_args <- length(formals(callback)) if (n_args < 
req_args) { stop(message, call. = FALSE) } } readr/R/collectors.R0000644000176200001440000003500614315646511014056 0ustar liggesuserscollector <- function(type, ...) { structure(list(...), class = c(paste0("collector_", type), "collector")) } is.collector <- function(x) inherits(x, "collector") #' @export print.collector <- function(x, ...) { cat("<", class(x)[1], ">\n", sep = "") } collector_find <- function(name) { if (is.na(name)) { return(col_character()) } get(paste0("col_", name), envir = asNamespace("readr"))() } #' Parse a character vector. #' #' @family parsers #' @param x Character vector of elements to parse. #' @param collector Column specification. #' @inheritParams read_delim #' @inheritParams tokenizer_delim #' @keywords internal #' @export #' @examples #' x <- c("1", "2", "3", "NA") #' parse_vector(x, col_integer()) #' parse_vector(x, col_double()) parse_vector <- function(x, collector, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { stopifnot(is.character(x)) if (is.character(collector)) { collector <- collector_find(collector) } warn_problems(parse_vector_(x, collector, na = na, locale_ = locale, trim_ws = trim_ws)) } #' Parse logicals, integers, and reals #' #' Use `parse_*()` if you have a character vector you want to parse. Use #' `col_*()` in conjunction with a `read_*()` function to parse the #' values as they're read in. #' #' @name parse_atomic #' @aliases NULL #' @param x Character vector of values to parse. #' @inheritParams tokenizer_delim #' @inheritParams read_delim #' @family parsers #' @examples #' parse_integer(c("1", "2", "3")) #' parse_double(c("1", "2", "3.123")) #' parse_number("$1,123,456.00") #' #' # Use locale to override default decimal and grouping marks #' es_MX <- locale("es", decimal_mark = ",") #' parse_number("$1.123.456,00", locale = es_MX) #' #' # Invalid values are replaced with missing values with a warning. #' x <- c("1", "2", "3", "-") #' parse_double(x) #' # Or flag values as missing #' parse_double(x, na = "-") NULL #' @rdname parse_atomic #' @export parse_logical <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { parse_vector(x, col_logical(), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_atomic #' @export parse_integer <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { parse_vector(x, col_integer(), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_atomic #' @export parse_double <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { parse_vector(x, col_double(), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_atomic #' @export parse_character <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { parse_vector(x, col_character(), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_atomic #' @export col_logical <- function() { collector("logical") } #' @rdname parse_atomic #' @export col_integer <- function() { collector("integer") } #' @rdname parse_atomic #' @export col_double <- function() { collector("double") } #' @rdname parse_atomic #' @export col_character <- function() { collector("character") } #' Skip a column #' #' Use this function to ignore a column when reading in a file. #' To skip all columns not otherwise specified, use [cols_only()]. 
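#' #' For example, a minimal sketch that reads column `x` but skips column `y`: #' ```r #' read_csv(I("x,y\n1,2\n3,4"), col_types = cols(x = col_double(), y = col_skip())) #' ```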
#' #' @family parsers #' @export col_skip <- function() { collector("skip") } #' Parse numbers, flexibly #' #' This parses the first number it finds, dropping any non-numeric characters #' before the first number and all characters after the first number. The #' grouping mark specified by the locale is ignored inside the number. #' #' @inheritParams parse_atomic #' @inheritParams tokenizer_delim #' @inheritParams read_delim #' @return A numeric vector (double) of parsed numbers. #' @family parsers #' @export #' @examples #' ## These all return 1000 #' parse_number("$1,000") ## leading `$` and grouping character `,` ignored #' parse_number("euro1,000") ## leading non-numeric euro ignored #' parse_number("t1000t1000") ## only parses first number found #' #' parse_number("1,234.56") #' ## explicit locale specifying European grouping and decimal marks #' parse_number("1.234,56", locale = locale(decimal_mark = ",", grouping_mark = ".")) #' ## SI/ISO 31-0 standard spaces for number grouping #' parse_number("1 234.56", locale = locale(decimal_mark = ".", grouping_mark = " ")) #' #' ## Specifying strings for NAs #' parse_number(c("1", "2", "3", "NA")) #' parse_number(c("1", "2", "3", "NA", "Nothing"), na = c("NA", "Nothing")) parse_number <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { parse_vector(x, col_number(), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_number #' @export col_number <- function() { collector("number") } #' Parse using the "best" type #' #' `parse_guess()` returns the parsed vector; `guess_parser()` #' returns the name of the parser. These functions use a number of heuristics #' to determine which type of vector is "best". Generally they try to err on #' the side of safety, as it's straightforward to override the parsing choice #' if needed. #' #' @inheritParams parse_atomic #' @inheritParams tokenizer_delim #' @inheritParams read_delim #' @family parsers #' @export #' @examples #' # Logical vectors #' parse_guess(c("FALSE", "TRUE", "F", "T")) #' #' # Integers and doubles #' parse_guess(c("1", "2", "3")) #' parse_guess(c("1.6", "2.6", "3.4")) #' #' # Numbers containing grouping mark #' guess_parser("1,234,566") #' parse_guess("1,234,566") #' #' # ISO 8601 date times #' guess_parser(c("2010-10-10")) #' parse_guess(c("2010-10-10")) parse_guess <- function(x, na = c("", "NA"), locale = default_locale(), trim_ws = TRUE, guess_integer = FALSE) { parse_vector(x, guess_parser(x, locale, guess_integer = guess_integer, na = na), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_guess #' @export col_guess <- function() { collector("guess") } #' @rdname parse_guess #' @param guess_integer If `TRUE`, guess integer types for whole numbers; if #' `FALSE`, guess numeric type for all numbers. #' @export guess_parser <- function(x, locale = default_locale(), guess_integer = FALSE, na = c("", "NA")) { x[x %in% na] <- NA_character_ stopifnot(is.locale(locale)) collectorGuess(x, locale, guessInteger = guess_integer) } #' Parse factors #' #' `parse_factor()` is similar to [factor()], but generates a warning if #' `levels` have been specified and some elements of `x` are not found in those #' `levels`. #' #' @param levels Character vector of the allowed levels. When `levels = NULL` #' (the default), `levels` are discovered from the unique values of `x`, in #' the order in which they appear in `x`. #' @param ordered Is it an ordered factor?
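#' For example (a sketch): `parse_factor(c("low", "high"), levels = c("low", "high"), ordered = TRUE)` #' returns an ordered factor in which `low < high`.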
#' @param include_na If `TRUE` and `x` contains at least one `NA`, then `NA` #' is included in the levels of the constructed factor. #' #' @inheritParams parse_atomic #' @inheritParams tokenizer_delim #' @inheritParams read_delim #' @family parsers #' @export #' @examples #' # discover the levels from the data #' parse_factor(c("a", "b")) #' parse_factor(c("a", "b", "-99")) #' parse_factor(c("a", "b", "-99"), na = c("", "NA", "-99")) #' parse_factor(c("a", "b", "-99"), na = c("", "NA", "-99"), include_na = FALSE) #' #' # provide the levels explicitly #' parse_factor(c("a", "b"), levels = letters[1:5]) #' #' x <- c("cat", "dog", "caw") #' animals <- c("cat", "dog", "cow") #' #' # base::factor() silently converts elements that do not match any levels to #' # NA #' factor(x, levels = animals) #' #' # parse_factor() generates same factor as base::factor() but throws a warning #' # and reports problems #' parse_factor(x, levels = animals) parse_factor <- function(x, levels = NULL, ordered = FALSE, na = c("", "NA"), locale = default_locale(), include_na = TRUE, trim_ws = TRUE) { parse_vector(x, col_factor(levels, ordered, include_na), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_factor #' @export col_factor <- function(levels = NULL, ordered = FALSE, include_na = FALSE) { if (!(is.null(levels) || is.character(levels))) { stop(sprintf("`levels` must be `NULL` or a character vector:\n- `levels` is a '%s'", class(levels)), call. = FALSE) } collector("factor", levels = levels, ordered = ordered, include_na = include_na) } # More complex ------------------------------------------------------------ #' Parse date/times #' #' @section Format specification: #' `readr` uses a format specification similar to [strptime()]. #' There are three types of element: #' #' 1. Date components are specified with "%" followed by a letter. For example #' "%Y" matches a 4 digit year, "%m" matches a 2 digit month and "%d" matches #' a 2 digit day. Month and day default to `1` (i.e. Jan 1st) if not present, #' for example if only a year is given. #' 2. Whitespace is any sequence of zero or more whitespace characters. #' 3. Any other character is matched exactly. #' #' `parse_datetime()` recognises the following format specifications: #' #' * Year: "%Y" (4 digits). "%y" (2 digits); 00-69 -> 2000-2069, 70-99 -> #' 1970-1999. #' * Month: "%m" (2 digits), "%b" (abbreviated name in current locale), "%B" #' (full name in current locale). #' * Day: "%d" (2 digits), "%e" (optional leading space), "%a" (abbreviated #' name in current locale). #' * Hour: "%H" or "%I" or "%h"; use I (and not H) with AM/PM; use h (and not H) #' if your times represent durations longer than one day. #' * Minutes: "%M" #' * Seconds: "%S" (integer seconds), "%OS" (partial seconds) #' * Time zone: "%Z" (as name, e.g. "America/Chicago"), "%z" (as offset from #' UTC, e.g. "+0800") #' * AM/PM indicator: "%p". #' * Non-digits: "%." skips one non-digit character, "%+" skips one or more #' non-digit characters, "%*" skips any number of non-digit characters. #' * Automatic parsers: "%AD" parses with a flexible YMD parser, "%AT" parses #' with a flexible HMS parser. #' * Time since the Unix epoch: "%s" decimal seconds since the Unix epoch. #' * Shortcuts: "%D" = "%m/%d/%y", "%F" = "%Y-%m-%d", "%R" = "%H:%M", "%T" = #' "%H:%M:%S", "%x" = "%y/%m/%d". #' #' @section ISO8601 support: #' #' Currently, readr does not support all of ISO8601. Missing features: #' #' * Week & weekday specifications, e.g. "2013-W05", "2013-W05-10".
#' * Ordinal dates, e.g. "2013-095". #' * Using a comma instead of a period as the decimal separator. #' #' The parser is also a little laxer than ISO8601: #' #' * Dates and times can be separated with a space, not just T. #' * Mostly correct specifications like "2009-05-19 14:" and "200912-01" work. #' #' @param x A character vector of dates to parse. #' @param format A format specification, as described below. If set to "", #' date times are parsed as ISO8601; dates and times use the date and #' time formats specified in the [locale()]. #' #' Unlike [strptime()], the format specification must match #' the complete string. #' @inheritParams read_delim #' @inheritParams tokenizer_delim #' @return A [POSIXct()] vector with `tzone` attribute set to #' `tz`. Elements that could not be parsed (or did not generate valid #' dates) will be set to `NA`, and a warning message will inform #' you of the total number of failures. #' @family parsers #' @export #' @examples #' # Format strings -------------------------------------------------------- #' parse_datetime("01/02/2010", "%d/%m/%Y") #' parse_datetime("01/02/2010", "%m/%d/%Y") #' # Handle any separator #' parse_datetime("01/02/2010", "%m%.%d%.%Y") #' #' # Dates look the same, but internally they use the number of days since #' # 1970-01-01 instead of the number of seconds. This avoids a whole lot #' # of trouble related to time zones, so use dates if you can. #' parse_date("01/02/2010", "%d/%m/%Y") #' parse_date("01/02/2010", "%m/%d/%Y") #' #' # You can parse timezones from strings (as listed in OlsonNames()) #' parse_datetime("2010/01/01 12:00 US/Central", "%Y/%m/%d %H:%M %Z") #' # Or from offsets #' parse_datetime("2010/01/01 12:00 -0600", "%Y/%m/%d %H:%M %z") #' #' # Use the locale parameter to control the default time zone #' # (but note UTC is considerably faster than other options) #' parse_datetime("2010/01/01 12:00", "%Y/%m/%d %H:%M", #' locale = locale(tz = "US/Central") #' ) #' parse_datetime("2010/01/01 12:00", "%Y/%m/%d %H:%M", #' locale = locale(tz = "US/Eastern") #' ) #' #' # Unlike strptime, the format specification must match the complete #' # string (ignoring leading and trailing whitespace).
This avoids common #' # errors: #' strptime("01/02/2010", "%d/%m/%y") #' parse_datetime("01/02/2010", "%d/%m/%y") #' #' # Failures ------------------------------------------------------------- #' parse_datetime("01/01/2010", "%d/%m/%Y") #' parse_datetime(c("01/ab/2010", "32/01/2010"), "%d/%m/%Y") #' #' # Locales -------------------------------------------------------------- #' # By default, readr expects English date/times, but that's easy to change #' parse_datetime("1 janvier 2015", "%d %B %Y", locale = locale("fr")) #' parse_datetime("1 enero 2015", "%d %B %Y", locale = locale("es")) #' #' # ISO8601 -------------------------------------------------------------- #' # With separators #' parse_datetime("1979-10-14") #' parse_datetime("1979-10-14T10") #' parse_datetime("1979-10-14T10:11") #' parse_datetime("1979-10-14T10:11:12") #' parse_datetime("1979-10-14T10:11:12.12345") #' #' # Without separators #' parse_datetime("19791014") #' parse_datetime("19791014T101112") #' #' # Time zones #' us_central <- locale(tz = "US/Central") #' parse_datetime("1979-10-14T1010", locale = us_central) #' parse_datetime("1979-10-14T1010-0500", locale = us_central) #' parse_datetime("1979-10-14T1010Z", locale = us_central) #' # Your current time zone #' parse_datetime("1979-10-14T1010", locale = locale(tz = "")) parse_datetime <- function(x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { parse_vector(x, col_datetime(format), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_datetime #' @export parse_date <- function(x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { parse_vector(x, col_date(format), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_datetime #' @export parse_time <- function(x, format = "", na = c("", "NA"), locale = default_locale(), trim_ws = TRUE) { parse_vector(x, col_time(format), na = na, locale = locale, trim_ws = trim_ws) } #' @rdname parse_datetime #' @export col_datetime <- function(format = "") { collector("datetime", format = format) } #' @rdname parse_datetime #' @export col_date <- function(format = "") { collector("date", format = format) } #' @rdname parse_datetime #' @export col_time <- function(format = "") { collector("time", format = format) } readr/R/tokenizer.R0000644000176200001440000001205314304131171013701 0ustar liggesusers#' Tokenize a file/string. #' #' Turns input into a character vector. Usually the tokenization is done purely #' in C++, and never exposed to R (because that requires a copy). This function #' is useful for testing, or when a file doesn't parse correctly and you want #' to see the underlying tokens. #' #' @inheritParams datasource #' @param tokenizer A tokenizer specification. #' @param skip Number of lines to skip before reading data. #' @param n_max Optionally, maximum number of rows to tokenize. #' @keywords internal #' @export #' @examples #' tokenize("1,2\n3,4,5\n\n6") #' #' # Only tokenize first two lines #' tokenize("1,2\n3,4,5\n\n6", n_max = 2) tokenize <- function(file, tokenizer = tokenizer_csv(), skip = 0, n_max = -1L) { ds <- datasource(file, skip = skip, skip_empty_rows = FALSE) tokenize_(ds, tokenizer, n_max) } #' Tokenizers. #' #' Explicitly create tokenizer objects. Usually you will not call these #' functions, but will instead use one of the user-friendly wrappers like #' [read_csv()]. #' #' @keywords internal #' @name Tokenizers #' @examples #' tokenizer_csv() NULL #' @export #' @rdname Tokenizers #' @param comment A string used to identify comments.
Any text after the #' comment characters will be silently ignored. #' @param na Character vector of strings to interpret as missing values. Set this #' option to `character()` to indicate no missing values. #' @param quoted_na `r lifecycle::badge("deprecated")` Should missing values #' inside quotes be treated as missing values (the default) or as strings. This #' parameter is soft deprecated as of readr 2.0.0. #' @param delim Single character used to separate fields within a record. #' @param quote Single character used to quote strings. #' @param trim_ws Should leading and trailing whitespace (ASCII spaces and tabs) be trimmed from #' each field before parsing it? #' @param escape_double Does the file escape quotes by doubling them? #' i.e. If this option is `TRUE`, the value `""""` represents #' a single quote, `\"`. #' @param escape_backslash Does the file use backslashes to escape special #' characters? This is more general than `escape_double` as backslashes #' can be used to escape the delimiter character, the quote character, or #' to add special characters like `\\n`. #' @param skip_empty_rows Should blank rows be ignored altogether? i.e. If this #' option is `TRUE` then blank rows will not be represented at all. If it is #' `FALSE` then they will be represented by `NA` values in all the columns. tokenizer_delim <- function(delim, quote = '"', na = "NA", quoted_na = TRUE, comment = "", trim_ws = TRUE, escape_double = TRUE, escape_backslash = FALSE, skip_empty_rows = TRUE) { structure( list( delim = delim, quote = quote, na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws, escape_double = escape_double, escape_backslash = escape_backslash, skip_empty_rows = skip_empty_rows ), class = "tokenizer_delim" ) } #' @export #' @rdname Tokenizers tokenizer_csv <- function(na = "NA", quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip_empty_rows = TRUE) { tokenizer_delim( delim = ",", na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, escape_double = TRUE, escape_backslash = FALSE, skip_empty_rows = skip_empty_rows ) } #' @export #' @rdname Tokenizers tokenizer_tsv <- function(na = "NA", quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip_empty_rows = TRUE) { tokenizer_delim( delim = "\t", na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, escape_double = TRUE, escape_backslash = FALSE, skip_empty_rows = skip_empty_rows ) } #' @export #' @rdname Tokenizers tokenizer_line <- function(na = character(), skip_empty_rows = TRUE) { structure(list(na = na, skip_empty_rows = skip_empty_rows), class = "tokenizer_line" ) } #' @export #' @rdname Tokenizers tokenizer_log <- function(trim_ws) { structure(list(trim_ws = trim_ws), class = "tokenizer_log") } #' @export #' @rdname Tokenizers #' @param begin,end Begin and end offsets for each field. These are C++ #' offsets so the first column is column zero, and the ranges are #' [begin, end) (i.e. inclusive-exclusive).
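#' For example (a sketch): a field occupying 1-based character positions 1 to 20 inclusive #' corresponds to `begin = 0` and `end = 20` here; [fwf_positions()] applies this shift for you, #' turning `fwf_positions(c(1, 30), c(20, 42))` into `begin = c(0, 29)` and `end = c(20, 42)`.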
tokenizer_fwf <- function(begin, end, na = "NA", comment = "", trim_ws = TRUE, skip_empty_rows = TRUE) { structure(list( begin = as.integer(begin), end = as.integer(end), na = na, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ), class = "tokenizer_fwf" ) } #' @export #' @rdname Tokenizers tokenizer_ws <- function(na = "NA", comment = "", skip_empty_rows = TRUE) { structure(list(na = na, comment = comment, skip_empty_rows = skip_empty_rows), class = "tokenizer_ws" ) } readr/R/locale.R0000644000176200001440000001044514304131171013131 0ustar liggesusers#' Create locales #' #' A locale object tries to capture all the defaults that can vary between #' countries. You set the locale once, and the details are automatically #' passed down to the column parsers. The defaults have been chosen to #' match R (i.e. US English) as closely as possible. See #' `vignette("locales")` for more details. #' #' @param date_names Character representations of day and month names. Either #' the language code as a string (passed on to [date_names_lang()]) #' or an object created by [date_names()]. #' @param date_format,time_format Default date and time formats. #' @param decimal_mark,grouping_mark Symbols used to indicate the decimal #' place, and to chunk larger numbers. Decimal mark can only be `,` or #' `.`. #' @param tz Default tz. This is used both for input (if the time zone isn't #' present in individual strings), and for output (to control the default #' display). The default is to use "UTC", a time zone that does not use #' daylight savings time (DST) and hence is typically most useful for data. #' The absence of time zones makes it approximately 50x faster to generate #' UTC times than any other time zone. #' #' Use `""` to use the system default time zone, but beware that this #' will not be reproducible across systems. #' #' For a complete list of possible time zones, see [OlsonNames()]. #' Americans, note that "EST" is a Canadian time zone that does not have #' DST. It is *not* Eastern Standard Time. It's better to use #' "US/Eastern", "US/Central" etc. #' @param encoding Default encoding. This only affects how the file is #' read; readr always converts the output to UTF-8. #' @param asciify Should diacritics be stripped from date names and converted to #' ASCII? This is useful if you're dealing with ASCII data where the correct #' spellings have been lost. Requires the \pkg{stringi} package. #' @export #' @examples #' locale() #' locale("fr") #' #' # South American locale #' locale("es", decimal_mark = ",") locale <- function(date_names = "en", date_format = "%AD", time_format = "%AT", decimal_mark = ".", grouping_mark = ",", tz = "UTC", encoding = "UTF-8", asciify = FALSE) { if (is.character(date_names)) { date_names <- date_names_lang(date_names) } stopifnot(is.date_names(date_names)) if (asciify) { date_names[] <- lapply(date_names, stringi::stri_trans_general, id = "latin-ascii") } if (missing(grouping_mark) && !missing(decimal_mark)) { grouping_mark <- if (decimal_mark == ".") "," else "." } else if (missing(decimal_mark) && !missing(grouping_mark)) { decimal_mark <- if (grouping_mark == ".") "," else "." } stopifnot(decimal_mark %in% c(".", ",")) check_string(grouping_mark) if (decimal_mark == grouping_mark) { stop("`decimal_mark` and `grouping_mark` must be different", call.
= FALSE) } tz <- check_tz(tz) check_encoding(encoding) structure( list( date_names = date_names, date_format = date_format, time_format = time_format, decimal_mark = decimal_mark, grouping_mark = grouping_mark, tz = tz, encoding = encoding ), class = "locale" ) } is.locale <- function(x) inherits(x, "locale") #' @export print.locale <- function(x, ...) { cat("\n") cat("Numbers: ", prettyNum(123456.78, big.mark = x$grouping_mark, decimal.mark = x$decimal_mark, digits = 8 ), "\n", sep = "") cat("Formats: ", x$date_format, " / ", x$time_format, "\n", sep = "") cat("Timezone: ", x$tz, "\n", sep = "") cat("Encoding: ", x$encoding, "\n", sep = "") print(x$date_names) } #' @export #' @rdname locale default_locale <- function() { loc <- getOption("readr.default_locale") if (is.null(loc)) { loc <- locale() options("readr.default_locale" = loc) } loc } check_tz <- function(x) { check_string(x, nm = "tz") if (identical(x, "")) { x <- Sys.timezone() if (identical(x, "") || identical(x, NA_character_)) { x <- "UTC" } } if (x %in% tzdb::tzdb_names()) { x } else { stop("Unknown TZ ", x, call. = FALSE) } } check_encoding <- function(x) { check_string(x, nm = "encoding") if (tolower(x) %in% tolower(iconvlist())) { return(TRUE) } stop("Unknown encoding ", x, call. = FALSE) } readr/R/read_delim.R0000644000176200001440000005011314510343737013767 0ustar liggesusers#' @useDynLib readr, .registration = TRUE NULL #' Read a delimited file (including CSV and TSV) into a tibble #' #' `read_csv()` and `read_tsv()` are special cases of the more general #' `read_delim()`. They're useful for reading the most common types of #' flat file data, comma separated values and tab separated values, #' respectively. `read_csv2()` uses `;` for the field separator and `,` for the #' decimal point. This format is common in some European countries. #' @inheritParams datasource #' @inheritParams tokenizer_delim #' @inheritParams vroom::vroom #' @param col_names Either `TRUE`, `FALSE` or a character vector #' of column names. #' #' If `TRUE`, the first row of the input will be used as the column #' names, and will not be included in the data frame. If `FALSE`, column #' names will be generated automatically: X1, X2, X3 etc. #' #' If `col_names` is a character vector, the values will be used as the #' names of the columns, and the first row of the input will be read into #' the first row of the output data frame. #' #' Missing (`NA`) column names will generate a warning, and be filled #' in with dummy names `...1`, `...2` etc. Duplicate column names #' will generate a warning and be made unique, see `name_repair` to control #' how this is done. #' @param col_types One of `NULL`, a [cols()] specification, or #' a string. See `vignette("readr")` for more details. #' #' If `NULL`, all column types will be inferred from `guess_max` rows of the #' input, interspersed throughout the file. This is convenient (and fast), #' but not robust. If the guessed types are wrong, you'll need to increase #' `guess_max` or supply the correct types yourself. #' #' Column specifications created by [list()] or [cols()] must contain #' one column specification for each column. If you only want to read a #' subset of the columns, use [cols_only()]. #' #' Alternatively, you can use a compact string representation where each #' character represents one column: #' - c = character #' - i = integer #' - n = number #' - d = double #' - l = logical #' - f = factor #' - D = date #' - T = date time #' - t = time #' - ? 
= guess #' - _ or - = skip #' #' By default, reading a file without a column specification will print a #' message showing what `readr` guessed they were. To remove this message, #' set `show_col_types = FALSE` or set `options(readr.show_col_types = FALSE)`. #' @param id The name of a column in which to store the file path. This is #' useful when reading multiple input files and there is data in the file #' paths, such as the data collection date. If `NULL` (the default) no extra #' column is created. #' @param show_col_types If `FALSE`, do not show the guessed column types. If #' `TRUE` always show the column types, even if they are supplied. If `NULL` #' (the default) only show the column types if they are not explicitly supplied #' by the `col_types` argument. #' @param locale The locale controls defaults that vary from place to place. #' The default locale is US-centric (like R), but you can use #' [locale()] to create your own locale that controls things like #' the default time zone, encoding, decimal mark, big mark, and day/month #' names. #' @param skip Number of lines to skip before reading data. If `comment` is #' supplied any commented lines are ignored _after_ skipping. #' @param n_max Maximum number of lines to read. #' @param guess_max Maximum number of lines to use for guessing column types. #' Will never use more than the number of lines read. #' See `vignette("column-types", package = "readr")` for more details. #' @param progress Display a progress bar? By default it will only display #' in an interactive session and not while knitting a document. The automatic #' progress bar can be disabled by setting option `readr.show_progress` to #' `FALSE`. #' @param lazy Read values lazily? By default, this is `FALSE`, because there #' are special considerations when reading a file lazily that have tripped up #' some users. Specifically, things get tricky when reading and then writing #' back into the same file. But, in general, lazy reading (`lazy = TRUE`) has #' many benefits, especially for interactive use and when your downstream work #' only involves a subset of the rows or columns. #' #' Learn more in [should_read_lazy()] and in the documentation for the #' `altrep` argument of [vroom::vroom()]. #' @param num_threads The number of processing threads to use for initial #' parsing and lazy reading of data. If your data contains newlines within #' fields the parser should automatically detect this and fall back to using #' one thread only. However if you know your file has newlines within quoted #' fields it is safest to set `num_threads = 1` explicitly. #' @param name_repair Handling of column names. The default behaviour is to #' ensure column names are `"unique"`. Various repair strategies are #' supported: #' * `"minimal"`: No name repair or checks, beyond basic existence of names. #' * `"unique"` (default value): Make sure names are unique and not empty. #' * `"check_unique"`: No name repair, but check they are `unique`. #' * `"unique_quiet"`: Repair with the `unique` strategy, quietly. #' * `"universal"`: Make the names `unique` and syntactic. #' * `"universal_quiet"`: Repair with the `universal` strategy, quietly. #' * A function: Apply custom name repair (e.g., `name_repair = make.names` #' for names in the style of base R). #' * A purrr-style anonymous function, see [rlang::as_function()]. #' #' This argument is passed on as `repair` to [vctrs::vec_as_names()]. #' See there for more details on these terms and the strategies used #' to enforce them. 
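#' #' For example, a minimal sketch of a custom repair function (a purrr-style lambda, passed #' through to [vctrs::vec_as_names()]): #' ```r #' read_csv(I("x,x\n1,2"), name_repair = ~ make.unique(.x)) #' ```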
#' #' @return A [tibble()]. If there are parsing problems, a warning will alert you. #' You can retrieve the full details by calling [problems()] on your dataset. #' @export #' @examples #' # Input sources ------------------------------------------------------------- #' # Read from a path #' read_csv(readr_example("mtcars.csv")) #' read_csv(readr_example("mtcars.csv.zip")) #' read_csv(readr_example("mtcars.csv.bz2")) #' \dontrun{ #' # Including remote paths #' read_csv("https://github.com/tidyverse/readr/raw/main/inst/extdata/mtcars.csv") #' } #' #' # Read from multiple file paths at once #' continents <- c("africa", "americas", "asia", "europe", "oceania") #' filepaths <- vapply( #' paste0("mini-gapminder-", continents, ".csv"), #' FUN = readr_example, #' FUN.VALUE = character(1) #' ) #' read_csv(filepaths, id = "file") #' #' # Or directly from a string with `I()` #' read_csv(I("x,y\n1,2\n3,4")) #' #' # Column selection----------------------------------------------------------- #' # Pass column names or indexes directly to select them #' read_csv(readr_example("chickens.csv"), col_select = c(chicken, eggs_laid)) #' read_csv(readr_example("chickens.csv"), col_select = c(1, 3:4)) #' #' # Or use the selection helpers #' read_csv( #' readr_example("chickens.csv"), #' col_select = c(starts_with("c"), last_col()) #' ) #' #' # You can also rename specific columns #' read_csv( #' readr_example("chickens.csv"), #' col_select = c(egg_yield = eggs_laid, everything()) #' ) #' #' # Column types -------------------------------------------------------------- #' # By default, readr guesses the columns types, looking at `guess_max` rows. #' # You can override with a compact specification: #' read_csv(I("x,y\n1,2\n3,4"), col_types = "dc") #' #' # Or with a list of column types: #' read_csv(I("x,y\n1,2\n3,4"), col_types = list(col_double(), col_character())) #' #' # If there are parsing problems, you get a warning, and can extract #' # more details with problems() #' y <- read_csv(I("x\n1\n2\nb"), col_types = list(col_double())) #' y #' problems(y) #' #' # Column names -------------------------------------------------------------- #' # By default, readr duplicate name repair is noisy #' read_csv(I("x,x\n1,2\n3,4")) #' #' # Same default repair strategy, but quiet #' read_csv(I("x,x\n1,2\n3,4"), name_repair = "unique_quiet") #' #' # There's also a global option that controls verbosity of name repair #' withr::with_options( #' list(rlib_name_repair_verbosity = "quiet"), #' read_csv(I("x,x\n1,2\n3,4")) #' ) #' #' # Or use "minimal" to turn off name repair #' read_csv(I("x,x\n1,2\n3,4"), name_repair = "minimal") #' #' # File types ---------------------------------------------------------------- #' read_csv(I("a,b\n1.0,2.0")) #' read_csv2(I("a;b\n1,0;2,0")) #' read_tsv(I("a\tb\n1.0\t2.0")) #' read_delim(I("a|b\n1.0|2.0"), delim = "|") read_delim <- function(file, delim = NULL, quote = '"', escape_backslash = FALSE, escape_double = TRUE, col_names = TRUE, col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, comment = "", trim_ws = FALSE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), name_repair = "unique", num_threads = readr_threads(), progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy()) { if (!is.null(delim) && !nzchar(delim)) { stop("`delim` must be at least one character, ", "use `read_table()` for whitespace delimited input.", call. 
= FALSE ) } if (edition_first()) { tokenizer <- tokenizer_delim(delim, quote = quote, escape_backslash = escape_backslash, escape_double = escape_double, na = na, quoted_na = quoted_na, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) return(read_delimited(file, tokenizer, col_names = col_names, col_types = col_types, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, guess_max = guess_max, progress = progress, show_col_types = show_col_types )) } if (!missing(quoted_na)) { lifecycle::deprecate_soft("2.0.0", "readr::read_delim(quoted_na = )") } vroom::vroom(file, delim = delim, col_names = col_names, col_types = col_types, col_select = {{ col_select }}, id = id, .name_repair = name_repair, skip = skip, n_max = n_max, na = na, quote = quote, comment = comment, skip_empty_rows = skip_empty_rows, trim_ws = trim_ws, escape_double = escape_double, escape_backslash = escape_backslash, locale = locale, guess_max = guess_max, progress = progress, altrep = lazy, show_col_types = show_col_types, num_threads = num_threads ) } #' @rdname read_delim #' @export read_csv <- function(file, col_names = TRUE, col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), name_repair = "unique", num_threads = readr_threads(), progress = show_progress(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy()) { if (edition_first()) { tokenizer <- tokenizer_csv( na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) return( read_delimited(file, tokenizer, col_names = col_names, col_types = col_types, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, guess_max = guess_max, progress = progress, show_col_types = show_col_types ) ) } if (!missing(quoted_na)) { lifecycle::deprecate_soft("2.0.0", "readr::read_csv(quoted_na = )") } vroom::vroom( file, delim = ",", col_names = col_names, col_types = col_types, col_select = {{ col_select }}, id = id, .name_repair = name_repair, skip = skip, n_max = n_max, na = na, quote = quote, comment = comment, skip_empty_rows = skip_empty_rows, trim_ws = trim_ws, escape_double = TRUE, escape_backslash = FALSE, locale = locale, guess_max = guess_max, show_col_types = show_col_types, progress = progress, altrep = lazy, num_threads = num_threads ) } #' @rdname read_delim #' @export read_csv2 <- function(file, col_names = TRUE, col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), progress = show_progress(), name_repair = "unique", num_threads = readr_threads(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy()) { if (locale$decimal_mark == ".") { cli::cli_alert_info("Using {.val ','} as decimal and {.val '.'} as grouping mark. Use {.fn read_delim} for more control.") locale$decimal_mark <- "," locale$grouping_mark <- "." 
} if (edition_first()) { tokenizer <- tokenizer_delim( delim = ";", na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) return(read_delimited(file, tokenizer, col_names = col_names, col_types = col_types, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, guess_max = guess_max, progress = progress, show_col_types = show_col_types )) } vroom::vroom(file, delim = ";", col_names = col_names, col_types = col_types, col_select = {{ col_select }}, id = id, .name_repair = name_repair, skip = skip, n_max = n_max, na = na, quote = quote, comment = comment, skip_empty_rows = skip_empty_rows, trim_ws = trim_ws, escape_double = TRUE, escape_backslash = FALSE, locale = locale, guess_max = guess_max, show_col_types = show_col_types, progress = progress, altrep = lazy, num_threads = num_threads ) } #' @rdname read_delim #' @export read_tsv <- function(file, col_names = TRUE, col_types = NULL, col_select = NULL, id = NULL, locale = default_locale(), na = c("", "NA"), quoted_na = TRUE, quote = "\"", comment = "", trim_ws = TRUE, skip = 0, n_max = Inf, guess_max = min(1000, n_max), progress = show_progress(), name_repair = "unique", num_threads = readr_threads(), show_col_types = should_show_types(), skip_empty_rows = TRUE, lazy = should_read_lazy()) { tokenizer <- tokenizer_tsv( na = na, quoted_na = quoted_na, quote = quote, comment = comment, trim_ws = trim_ws, skip_empty_rows = skip_empty_rows ) if (edition_first()) { return(read_delimited(file, tokenizer, col_names = col_names, col_types = col_types, locale = locale, skip = skip, skip_empty_rows = skip_empty_rows, comment = comment, n_max = n_max, guess_max = guess_max, progress = progress, show_col_types = show_col_types )) } vroom::vroom(file, delim = "\t", col_names = col_names, col_types = col_types, col_select = {{ col_select }}, id = id, .name_repair = name_repair, skip = skip, n_max = n_max, na = na, quote = quote, comment = comment, skip_empty_rows = skip_empty_rows, trim_ws = trim_ws, escape_double = TRUE, escape_backslash = FALSE, locale = locale, guess_max = guess_max, show_col_types = show_col_types, progress = progress, altrep = lazy, num_threads = num_threads ) } # Helper functions for reading from delimited files ---------------------------- read_tokens <- function(data, tokenizer, col_specs, col_names, locale_, n_max, progress) { if (n_max == Inf) { n_max <- -1 } read_tokens_(data, tokenizer, col_specs, col_names, locale_, n_max, progress) } read_delimited <- function(file, tokenizer, col_names = TRUE, col_types = NULL, locale = default_locale(), skip = 0, skip_empty_rows = TRUE, skip_quote = TRUE, comment = "", n_max = Inf, guess_max = min(1000, n_max), progress = show_progress(), show_col_types = should_show_types()) { name <- source_name(file) # If connection needed, read once. file <- standardise_path(file) if (is.connection(file)) { data <- datasource_connection(file, skip, skip_empty_rows, comment) if (empty_file(data[[1]])) { return(tibble::tibble()) } } else { if (!isTRUE(grepl("\n", file)[[1]]) && empty_file(file)) { return(tibble::tibble()) } if (is.character(file) && identical(locale$encoding, "UTF-8")) { # When locale is not set, file is probably marked as its correct encoding. # As default_locale() assumes file is UTF-8, file should be encoded as UTF-8 for non-UTF-8 MBCS locales. 
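# enc2utf8() re-encodes the string from its declared encoding to UTF-8.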
      data <- enc2utf8(file)
    } else {
      data <- file
    }
  }

  spec <- col_spec_standardise(
    data,
    skip = skip,
    skip_empty_rows = skip_empty_rows,
    skip_quote = skip_quote,
    comment = comment,
    guess_max = guess_max,
    col_names = col_names,
    col_types = col_types,
    tokenizer = tokenizer,
    locale = locale
  )

  ds <- datasource(data, skip = spec$skip, skip_empty_rows = skip_empty_rows, comment = comment, skip_quote = skip_quote)
  has_col_types <- !is.null(col_types)
  if (
    ((is.null(show_col_types) && !has_col_types) || isTRUE(show_col_types)) &&
      !inherits(ds, "source_string")
  ) {
    show_cols_spec(spec)
  }

  out <- read_tokens(ds, tokenizer, spec$cols, names(spec$cols),
    locale_ = locale,
    n_max = n_max, progress = progress
  )

  out <- name_problems(out, names(spec$cols), name)
  attr(out, "spec") <- spec
  warn_problems(out)
}

# Turn a read_*() function into the corresponding spec_*() function:
# read no rows (n_max = 0), guess from up to 1000 rows, and return the
# column specification instead of the data.
generate_spec_fun <- function(f) {
  formals(f)$n_max <- 0
  formals(f)$guess_max <- 1000
  formals(f)$col_types <- list()

  # Wrap the original body so it runs under edition 1 and its result is
  # passed through spec().
  old_body <- body(f)
  body(f) <- rlang::inject(quote(spec(with_edition(1, (function() !!old_body)()))))

  f
}

#' Generate a column specification
#'
#' When printed, only the first 20 columns are printed by default. To override
#' this, set `options(readr.num_columns)` (a value of 0 turns off printing).
#'
#' @return The `col_spec` generated for the file.
#' @inheritParams read_delim
#' @export
#' @examples
#' # Input sources -------------------------------------------------------------
#' # Retrieve specs from a path
#' spec_csv(system.file("extdata/mtcars.csv", package = "readr"))
#' spec_csv(system.file("extdata/mtcars.csv.zip", package = "readr"))
#'
#' # Or directly from a string (must contain a newline)
#' spec_csv(I("x,y\n1,2\n3,4"))
#'
#' # Column types --------------------------------------------------------------
#' # By default, readr guesses the column types, looking at 1000 rows
#' # throughout the file.
#' # You can specify the number of rows used with guess_max.
#' spec_csv(system.file("extdata/mtcars.csv", package = "readr"), guess_max = 20)
spec_delim <- generate_spec_fun(read_delim)

#' @rdname spec_delim
#' @export
spec_csv <- generate_spec_fun(read_csv)

#' @rdname spec_delim
#' @export
spec_csv2 <- generate_spec_fun(read_csv2)

#' @rdname spec_delim
#' @export
spec_tsv <- generate_spec_fun(read_tsv)
readr/R/problems.R0000644000176200001440000000577014462256076013533 0ustar liggesusers#' Retrieve parsing problems
#'
#' Readr functions will only throw an error if parsing fails in an unrecoverable
#' way. However, there are lots of potential problems that you might want to
#' know about - these are stored in the `problems` attribute of the
#' output, which you can easily access with this function.
#' `stop_for_problems()` will throw an error if there are any parsing
#' problems: this is useful for automated scripts where you want to throw
#' an error as soon as you encounter a problem.
#'
#' @param x A data frame (from `read_*()`) or a vector (from `parse_*()`).
#' @return A data frame with one row for each problem and four columns:
#'   \item{row,col}{Row and column of problem}
#'   \item{expected}{What readr expected to find}
#'   \item{actual}{What it actually got}
#' @export
#' @examples
#' x <- parse_integer(c("1X", "blah", "3"))
#' problems(x)
#'
#' y <- parse_integer(c("1", "2", "3"))
#' problems(y)
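#'
#' # A sketch of stop_for_problems() (these calls are illustrative): it
#' # errors when any problems were found, so wrap the failing call in try().
#' try(stop_for_problems(x))
#' stop_for_problems(y) # no problems, returns y invisibly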
problems <- local({
  no_problems <- tibble::tibble(
    row = integer(),
    col = integer(),
    expected = character(),
    actual = character()
  )

  function(x = .Last.value) {
    problems <- probs(x)

    if (is.null(problems)) {
      return(invisible(no_problems))
    }

    if (inherits(problems, "tbl_df")) {
      return(problems)
    }

    vroom::problems(x)
  }
})

#' @export
#' @rdname problems
stop_for_problems <- function(x) {
  n <- n_problems(x)
  if (n == 0) {
    return(invisible(x))
  }

  stop(n, " parsing failure", if (n > 1) "s", call. = FALSE)
}

probs <- function(x) {
  attr(suppressWarnings(x), "problems")
}

n_problems <- function(x) {
  probs <- problems(x)
  if (is.null(probs)) 0 else nrow(probs)
}

problem_rows <- function(x) {
  if (n_problems(x) == 0) {
    return(x[0, , drop = FALSE])
  }

  probs <- problems(x)
  x[unique(probs$row), , drop = FALSE]
}

warn_problems <- function(x) {
  n <- n_problems(x)
  if (n == 0) {
    return(x)
  }

  probs <- as.data.frame(attr(x, "problems"))
  many_problems <- nrow(probs) > 5

  probs_f <- format(utils::head(probs, 5), justify = "left")
  probs_f[probs_f == "NA"] <- "--"
  probs_f <- rbind(names(probs), probs_f)
  probs_f <- lapply(probs_f, format, justify = "right")

  if (many_problems) {
    # nchar fails with non-ascii characters, so encode characters beforehand.
    width <- vapply(probs_f, function(x) max(nchar(encodeString(x))), integer(1))
    dots <- vapply(width, function(i) paste(rep(".", i), collapse = ""),
      FUN.VALUE = character(1)
    )

    probs_f <- Map(c, probs_f, dots)
  }

  probs_f <- do.call(paste, c(probs_f, list(sep = " ", collapse = "\n")))

  warning(n, " parsing failure", if (n > 1) "s", ".\n", probs_f, "\n",
    if (many_problems) "See problems(...) for more details.\n",
    call. = FALSE, immediate. = TRUE, noBreaks. = TRUE
  )

  x
}

name_problems <- function(x, all_colnames, name = "input") {
  if (n_problems(x) == 0) {
    return(x)
  }

  problems <- problems(x)
  problems$file <- name
  problems$col <- all_colnames[problems$col]
  attr(x, "problems") <- problems

  x
}
readr/R/read_table.R0000644000176200001440000000527014304131171013764 0ustar liggesusers#' Read whitespace-separated columns into a tibble
#'
#' @description
#' `read_table()` is designed to read the type of textual
#' data where each column is separated by one (or more) columns of space.
#'
#' `read_table()` is like [read.table()]: it allows any number of whitespace
#' characters between columns, and the lines can be of different lengths.
#'
#' `spec_table()` returns the column specifications rather than a data frame.
#'
#' @seealso [read_fwf()] to read fixed width files where each column
#'   is not separated by whitespace. `read_fwf()` is also useful for reading
#'   tabular data with non-standard formatting.
#' @inheritParams datasource
#' @inheritParams tokenizer_fwf
#' @inheritParams read_delim
#' @export
#' @examples
#' ws <- readr_example("whitespace-sample.txt")
#' writeLines(read_lines(ws))
#' read_table(ws)
read_table <- function(file,
                       col_names = TRUE, col_types = NULL,
                       locale = default_locale(), na = "NA", skip = 0,
                       n_max = Inf, guess_max = min(n_max, 1000),
                       progress = show_progress(), comment = "",
                       show_col_types = should_show_types(),
                       skip_empty_rows = TRUE) {
  tokenizer <- tokenizer_ws(
    na = na, comment = comment,
    skip_empty_rows = skip_empty_rows
  )
  read_delimited(file, tokenizer,
    col_names = col_names, col_types = col_types,
    locale = locale, skip = skip,
    skip_empty_rows = skip_empty_rows,
    skip_quote = FALSE,
    comment = comment, n_max = n_max,
    guess_max = guess_max, progress = progress,
    show_col_types = show_col_types
  )
}

#' Read whitespace-separated columns into a tibble
#'
#' @description
#' `r lifecycle::badge("deprecated")`
#'
#' This function is deprecated because we renamed it to [read_table()] and
#' removed the old `read_table` function, which was too strict for most cases
#' and was analogous to just using `read_fwf()`.
#' @keywords internal
#' @export
read_table2 <- function(file,
                        col_names = TRUE, col_types = NULL,
                        locale = default_locale(), na = "NA", skip = 0,
                        n_max = Inf, guess_max = min(n_max, 1000),
                        progress = show_progress(), comment = "",
                        skip_empty_rows = TRUE) {
  lifecycle::deprecate_soft("2.0.0", "read_table2()", "read_table()")

  read_table(
    file = file,
    col_names = col_names,
    col_types = col_types,
    locale = locale,
    na = na,
    skip = skip,
    n_max = n_max,
    guess_max = guess_max,
    progress = progress,
    comment = comment,
    skip_empty_rows = skip_empty_rows
  )
}

#' @rdname spec_delim
#' @export
spec_table <- generate_spec_fun(read_table)
readr/R/file.R0000644000176200001440000000351014304131171012610 0ustar liggesusers#' Read/write a complete file
#'
#' `read_file()` reads a complete file into a single object: either a
#' character vector of length one, or a raw vector. `write_file()` takes a
#' single string, or a raw vector, and writes it exactly as is. Raw vectors
#' are useful when dealing with binary data, or if you have text data with
#' unknown encoding.
#'
#' @inheritParams datasource
#' @inheritParams read_delim
#' @return
#' `read_file()`: A length 1 character vector.
#' `read_file_raw()`: A raw vector.
#' @param x A single string, or a raw vector to write to disk.
#' @export
#' @examples
#' read_file(file.path(R.home("doc"), "AUTHORS"))
#' read_file_raw(file.path(R.home("doc"), "AUTHORS"))
#'
#' tmp <- tempfile()
#'
#' x <- format_csv(mtcars[1:6, ])
#' write_file(x, tmp)
#' identical(x, read_file(tmp))
#'
#' read_lines(I(x))
read_file <- function(file, locale = default_locale()) {
  if (empty_file(file)) {
    return("")
  }

  ds <- datasource(file, skip_empty_rows = FALSE)
  read_file_(ds, locale)
}

#' @export
#' @rdname read_file
read_file_raw <- function(file) {
  if (empty_file(file)) {
    return(raw())
  }

  ds <- datasource(file, skip_empty_rows = FALSE)
  read_file_raw_(ds)
}

#' @inherit write_lines
#' @param path `r lifecycle::badge("deprecated")` Use the `file` argument
#'   instead.
#' @rdname read_file
#' @export
write_file <- function(x, file, append = FALSE, path = deprecated()) {
  if (is_present(path)) {
    deprecate_warn("1.4.0", "write_file(path = )", "write_file(file = )")
    file <- path
  }
  force(x)

  file <- standardise_path(file, input = FALSE)

  if (!isOpen(file)) {
    on.exit(close(file), add = TRUE)
    if (isTRUE(append)) {
      open(file, "ab")
    } else {
      open(file, "wb")
    }
  }

  if (is.raw(x)) {
    write_file_raw_(x, file)
  } else {
    write_file_(x, file)
  }

  invisible(x)
}
readr/R/melt_table.R0000644000176200001440000000666014304131171014006 0ustar liggesusers#' Return melted data for each token in a whitespace-separated file
#'
#' @description
#' `r lifecycle::badge("superseded")`
#' This function has been superseded in readr and moved to [the meltr
#' package](https://r-lib.github.io/meltr/).
#'
#' For certain non-rectangular data formats, it can be useful to parse the data
#' into a melted format where each row represents a single token.
#'
#' `melt_table()` and `melt_table2()` are designed to read the type of textual
#' data where each column is separated by one (or more) columns of space.
#'
#' `melt_table2()` allows any number of whitespace characters between columns,
#' and the lines can be of different lengths.
#'
#' `melt_table()` is more strict: each line must be the same length,
#' and each field is in the same position in every line. It first finds empty
#' columns and then parses like a fixed width file.
#'
#' @seealso [melt_fwf()] to melt fixed width files where each column
#'   is not separated by whitespace. `melt_fwf()` is also useful for reading
#'   tabular data with non-standard formatting. [read_table()] is the
#'   conventional way to read tabular data from whitespace-separated files.
#' @inheritParams read_table
#' @export
#' @examples
#' fwf <- readr_example("fwf-sample.txt")
#' writeLines(read_lines(fwf))
#' melt_table(fwf)
#'
#' ws <- readr_example("whitespace-sample.txt")
#' writeLines(read_lines(ws))
#' melt_table2(ws)
melt_table <- function(file,
                       locale = default_locale(), na = "NA", skip = 0,
                       n_max = Inf, guess_max = min(n_max, 1000),
                       progress = show_progress(), comment = "",
                       skip_empty_rows = FALSE) {
  if (!edition_first()) {
    lifecycle::deprecate_soft("2.0.0", what = "melt_table()", details = "Please use `meltr::melt_table()` instead")
  }
  ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows)
  if (inherits(ds, "source_file") && empty_file(file)) {
    return(tibble::tibble(
      row = double(), col = double(),
      data_type = character(), value = character()
    ))
  }

  local_edition(1)

  columns <- fwf_empty(ds, skip = skip, skip_empty_rows = skip_empty_rows, n = guess_max, comment = comment)
  tokenizer <- tokenizer_fwf(columns$begin, columns$end,
    na = na, comment = comment,
    skip_empty_rows = skip_empty_rows
  )

  ds <- datasource(file = ds, skip = skip, skip_empty_rows = skip_empty_rows)

  out <- melt_tokens(ds, tokenizer,
    locale_ = locale, n_max = n_max,
    progress = progress
  )
  warn_problems(out)
}

#' @rdname melt_table
#' @export
melt_table2 <- function(file,
                        locale = default_locale(), na = "NA", skip = 0,
                        n_max = Inf, progress = show_progress(), comment = "",
                        skip_empty_rows = FALSE) {
  if (!edition_first()) {
    lifecycle::deprecate_soft("2.0.0", what = "melt_table2()", details = "Please use `meltr::melt_table2()` instead")
  }

  ds <- datasource(file, skip = skip, skip_empty_rows = skip_empty_rows)
  if (inherits(ds, "source_file") && empty_file(file)) {
    return(tibble::tibble(
      row = double(), col = double(),
      data_type = character(), value = character()
    ))
  }

  tokenizer <- tokenizer_ws(
    na = na, comment = comment,
    skip_empty_rows = skip_empty_rows
  )

  ds <- datasource(file = ds, skip = skip, skip_empty_rows = skip_empty_rows)

  melt_delimited(ds, tokenizer,
    locale = locale, skip = skip,
    comment = comment, n_max = n_max,
    progress = progress
  )
}
readr/R/edition.R0000644000176200001440000000363314315646511013341 0ustar liggesusersedition_set <- function(edition) {
  edition <- as.integer(edition)
  stopifnot(edition %in% c(1L, 2L))
  old <- edition_get()
  options("readr.edition" = edition)
  invisible(old)
}

#' Retrieve the currently active edition
#'
#' @returns An integer corresponding to the currently active edition.
#' @export
#' @examples
#' edition_get()
edition_get <- function() {
  getOption("readr.edition", 2L)
}

#' Temporarily change the active readr edition
#'
#' `with_edition()` allows you to change the active edition of readr for a given
#' block of code. `local_edition()` allows you to change the active edition of
#' readr until the end of the current function or file.
#'
#' @export
#' @param edition Should be a single integer, such as `1` or `2`.
#' @param env Environment that controls scope of changes. For expert use only.
#' @param code Code to run with the changed edition.
#'
#' @examples
#' with_edition(1, edition_get())
#' with_edition(2, edition_get())
#'
#' # readr 1e and 2e behave differently when input rows have different
#' # numbers of fields
#' with_edition(1, read_csv("1,2\n3,4,5", col_names = c("X", "Y", "Z")))
#' with_edition(2, read_csv("1,2\n3,4,5", col_names = c("X", "Y", "Z")))
#'
#' # local_edition() applies in a specific scope, for example, inside a function
#' read_csv_1e <- function(...) {
#'   local_edition(1)
#'   read_csv(...)
#' }
#' read_csv("1,2\n3,4,5", col_names = c("X", "Y", "Z")) # 2e behaviour
#' read_csv_1e("1,2\n3,4,5", col_names = c("X", "Y", "Z")) # 1e behaviour
#' read_csv("1,2\n3,4,5", col_names = c("X", "Y", "Z")) # 2e behaviour
with_edition <- function(edition, code) {
  local_edition(edition)
  code
}

#' @rdname with_edition
#' @export
local_edition <- function(edition, env = parent.frame()) {
  rlang::check_installed("withr")
  old <- edition_set(edition)
  withr::defer(edition_set(old), envir = env)
}

edition_first <- function() {
  edition_get() == 1L
}
readr/NEWS.md0000644000176200001440000012302514547603015012455 0ustar liggesusers# readr 2.1.5

* No major user-facing changes. Patch release with housekeeping changes and internal changes requested by CRAN around format specification in compiled code.

# readr 2.1.4

* No user-facing changes. Patch release with internal changes requested by CRAN.

# readr 2.1.3

* Help files below `man/` have been re-generated, so that they give rise to valid HTML5. (This is the impetus for this release, to keep the package safely on CRAN.)

* `mini-gapminder-africa.csv` and friends are new example datasets accessible via `readr_example()`, which have been added to illustrate reading multiple files at once, into a single data frame, as sketched below.

# readr 2.1.2

* `read_table()`, `read_log()`, and `read_delim_chunked()` (and friends) gain the `show_col_types` argument found elsewhere. All `read_*()` functions now respect the `show_col_types` argument or option, even when using the first edition parsing engine (#1331).

* `show_progress()` uses `rlang::is_interactive()` instead of `base::interactive()` (#1356).

* `read_builtin()` does more argument checking, so that we catch obviously malformed input before passing along to `utils::data()` (#1361).

* `chickens.csv` and `whitespace-sample.txt` are new example datasets accessible via `readr_example()` (#1354).
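A minimal sketch of reading the new mini-gapminder files (added in 2.1.3 above) into a single data frame; the `"file"` id column name is illustrative:

```r
files <- c(
  readr_example("mini-gapminder-africa.csv"),
  readr_example("mini-gapminder-americas.csv")
)
read_csv(files, id = "file", show_col_types = FALSE)
```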
# readr 2.1.1

* Jenny Bryan is now the maintainer.

* Fix buffer overflow when trying to parse an integer from a field that is over 64 characters long (#1326)

# readr 2.1.0

* All readr functions again read eagerly by default. Unfortunately many users experienced frustration from the drawbacks of lazy reading, in particular locking files on Windows, so it was decided to disable lazy reading by default. However, `options(readr.read_lazy = TRUE)` can be used to make lazy reading the default if desired.

* New `readr.read_lazy` global option to control if readr reads files lazily or not (#1266)
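For example (a sketch; the option and the `lazy` argument both assume readr >= 2.1.0):

```r
# opt in to lazy reading for the whole session
options(readr.read_lazy = TRUE)

# or opt in for a single call
x <- read_csv(readr_example("mtcars.csv"), lazy = TRUE)
```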
However, if you know you want to be able to delete the file after reading it is best to pass `lazy = FALSE` when reading the file. ### Reading multiple files at once Edition two has built-in support for reading sets of files with the same columns into one output table in a single command. Just pass the filenames to be read in the same vector to the reading function. First we generate some files to read by splitting the nycflights dataset by airline. ```{r} library(nycflights13) purrr::iwalk( split(flights, flights$carrier), ~ { .x$carrier[[1]]; vroom::vroom_write(.x, glue::glue("flights_{.y}.tsv"), delim = "\t") } ) ``` Then we can efficiently read them into one tibble by passing the filenames directly to readr. ```{r} files <- fs::dir_ls(glob = "flights*tsv") files readr::read_tsv(files) ``` If the filenames contain data, such as the date when the sample was collected, use `id` argument to include the paths as a column in the data. You will likely have to post-process the paths to keep only the relevant portion for your use case. ### Delimiter guessing Edition two supports automatic guessing of delimiters. Because of this you can now use `read_delim()` without specifying a `delim` argument in many cases. ```{r} x <- read_delim(readr_example("mtcars.csv")) ``` ### Literal data In edition one the reading functions treated any input with a newline in it or vectors of length > 1 as literal data. In edition two vectors of length > 1 are now assumed to correspond to multiple files. Because of this we now have a more explicit way to represent literal data, by putting `I()` around the input. ```{r} readr::read_csv(I("a,b\n1,2")) ``` ### License changes We are systematically re-licensing tidyverse and r-lib packages to use the MIT license, to make our package licenses as clear and permissive as possible. To this end the readr and vroom packages are now released under the MIT license. ### Deprecated or superseded functions and features * `melt_csv()`, `melt_delim()`, `melt_tsv()` and `melt_fwf()` have been superseded by functions in the same name in the meltr package. The versions in readr have been deprecated. These functions rely on the first edition parsing code and would be challenging to update to the new parser. When the first edition parsing code is eventually removed from readr they will be removed. * `read_table2()` has been renamed to `read_table()`, as most users expect `read_table()` to work like `utils::read.table()`. If you want the previous strict behavior of the `read_table()` you can use `read_fwf()` with `fwf_empty()` directly (#717). * Normalizing newlines in files with just carriage returns `\r` is no longer supported. The last major OS to use only CR as the newline was 'classic' Mac OS, which had its final release in 2001. ### Other second edition changes * `read_*_chunked()` functions now include their specification as an attribute (#1143) * All `read_*()` functions gain a `col_select` argument to more easily choose which columns to select. * All `read_*()` functions gain a `id` argument to optionally store the file paths when reading multiple files. * All `read_*()` functions gain a `name_repair` argument to control how column names are repaired. * All `read_*()` and `write_*()` functions gain a `num_threads` argument to control the number of processing threads they use (#1201) * All `write_*()` and `format_*()` functions gain `quote` and `escape` arguments, to explicitly control how fields are quoted and how double quotes are escaped. 
(#653, #759, #844, #993, #1018, #1083) * All `write_*()` functions gain a `progress` argument and display a progress bar when writing (#791). * write_excel_csv() now defaults to `quote = "all"` (#759) * write_tsv() now defaults to `quote = "none"` (#993) * `read_table()` now handles skipped lines with unpaired quotes properly (#1180) ## Additional features and fixes * The BH package is no longer a dependency. The boost C++ headers in BH have thousands of files, so can take a long time to extract and compiling them takes a great deal of memory, which made readr difficult to compile on systems with limited memory (#1147). * readr now uses the tzdb package when parsing date-times (@DavisVaughan, r-lib/vroom#273) * Chunked readers now support files with more than `INT_MAX` (~ 2 Billion) number of lines (#1177) * Memory no longer inadvertently leaks when reading memory from R connections (#1161) * Invalid date formats no longer can potentially crash R (#1151) * `col_factor()` now throws a more informative error message if given non-character levels (#1140) * `problems()` now takes `.Last.value` as its default argument. This lets you run `problems()` without an argument to see the problems in the previously read dataset. * `read_delim()` fails when sample of parsing problems contains non-ASCII characters (@hidekoji, #1136) * `read_log()` gains a `trim_ws` argument (#738) * `read_rds()` and `write_rds()` gain a `refhook` argument, to pass functions that handle references objects (#1206) * `read_rds()` can now read .Rds files from URLs (#1186) * `read_*()` functions gain a `show_col_types` argument, if set to `FALSE` this turns off showing the column types unconditionally. * `type_convert()` now throws a warning if the input has no character columns (#1020) * `write_csv()` now errors if given a matrix column (#1171) * `write_csv()` now again is able to write data with duplicated column names (#1169) * `write_file()` now forces its argument before opening the output file (#1158) # readr 1.4.0 ## Breaking changes * `write_*()` functions first argument is now `file` instead of `path`, for consistency with the `read_*()` functions. `path` has been deprecated and will be removed in a future version of readr (#1110, @brianrice2) * `write_*()` functions now output any NaN values in the same way as NA values, controlled by the `na=` argument. (#1082). ## New features * It is now possible to generate a column specification from any tibble (or data.frame) with `as.col_spec()` and convert any column specification to a short representation with `as.character()` s <- as.col_spec(iris) s #> cols( #> Sepal.Length = col_double(), #> Sepal.Width = col_double(), #> Petal.Length = col_double(), #> Petal.Width = col_double(), #> Species = col_factor(levels = c("setosa", "versicolor", "virginica"), ordered = FALSE, include_na = FALSE) #> ) as.character(s) #> [1] "ddddf" * The cli package is now used for all messages. * The runtime performance for tables with an extreme number of columns is greatly improved (#825) * Compressed files are now detected by magic numbers rather than by the file extension (#1125) * A memory leak when reading files is now fixed (#1092) * `write_*()` functions gain a `eol =` argument to control the end of line character used (#857). This allows writing of CSV files with Windows newlines (CRLF) if desired. * The Rcpp dependency has been removed in favor of cpp11. * The build system has been greatly simplified so should work on more systems. 
## Additional features and fixes * The full problem field is now displayed in the problems tibble, as intended (#444). * New `%h` placeholder for parsing unrestricted hours (<0 and >23) to support parsing durations (#549, @krlmlr). * `as.character.col_spec()` now handles logical columns as well (#1127) * `fwf_positions(end)` no longer has a default argument and must be specified (#996) * `guess_parser()` gains a `na` argument and removes NA values before guessing (#1041). * `parse_guess()` now passes the `na` argument to `guess_parser()` * `read_*` functions now close properly all connections, including on errors like HTTP errors when reading from a url (@cderv, #1050). * `read_delimited()` no longer mistakenly stats literal filenames (#1063) * `read_lines()` now ignores quotations when skipping lines (#991). * `read_lines(skip_empty_rows = TRUE)` no longer crashes if a file ends with an empty line (#968) * `write_*()` functions now invisibly return the input data frame unchanged, rather than a version with factors and dates converted to strings. (@jesse-ross, #975). * `write_csv2()` now formats decimal numbers more consistently with `utils::write.csv2()` (#1087) * `write_csv2()` and `format_csv2()` no longer pad number columns with whitespaces (@keesdeschepper, #1046). * `write_excel_csv()` no longer outputs a byte order mark when appending to a file (#1075). * Uses of `tibble::data_frame` updated to `tibble::tibble` ([tidyverse/dplyr#4069](https://github.com/tidyverse/dplyr/issues/4069), @thays42, #1124, @brianrice2) * `read_delimited()` now returns an empty `tibble::data_frame()` rather than signaling an error when given a connection with an empty file (@pralitp, #963). * More helpful error when trying to write out data frames with list columns (@ellessenne, #938) * `type_convert()` removes a 'spec' attribute, because the current columns likely have modified data types. The 'spec' attribute is set by functions like `read_delim()` (@jimhester, @wibeasley, #1032). * `write_rds()` now can specify the Rds version to use. The default value is 2 as it's compatible to R versions prior to 3.5.0 (@shrektan, #1001). * Fixes for issues related to variable initialization in C++ code (@michaelquinn32, ##1133). # readr 1.3.1 * Column specifications are now coloured when printed. This makes it easy to see at a glance when a column is input as a different type then the rest. Colouring can be disabled by setting `options(crayon.enabled = FALSE)`. * `as.col_spec()` can now use named character vectors, which makes `read_csv("file.csv", col_types = c(xyz = "c"))` equivalent to `read_csv("file.csv", col_types = cols(xyz = col_character())` * Fix skipping when single quotes are embedded in double quoted strings, and single quotes in skipped or commented lines (#944, #945). * Fix for compilation using custom architectures on macOS (#919) * Fix for valgrind errors (#941) # readr 1.3.0 ## Breaking Changes ### Blank line skipping readr's blank line skipping has been modified to be more consistent and to avoid edge cases that affected the behavior in 1.2.0. The skip parameter now behaves more similar to how it worked previous to readr 1.2.0, but in addition the parameter `skip_blank_rows` can be used to control if fully blank lines are skipped. (#923) ### tibble data frame subclass readr 1.3.0 returns results with a `spec_tbl_df` subclass. 
This differs from a regular tibble only that the `spec` attribute (which holds the column specification) is lost as soon as the object is subset (and a normal `tbl_df` object is returned). Historically `tbl_df`'s lost their attributes once they were subset. However recent versions of tibble retain the attributes when subetting, so the `spec_tbl_df` subclass is needed to ensure the previous behavior. This should only break compatibility if you are explicitly checking the class of the returned object. A way to get backwards compatible behavior is to call subset with no arguments on your object, e.g. `x[]`. ## Bugfixes * `hms` objects with NA values are now written without whitespace padding (#930). * `read_*()` functions now return `spec_tbl_df` objects, which differ from regular `tbl_df` objects only in that the `spec` attribute is removed (and they are demoted to regular `tbl_df` objects) as soon as they are subset (#934). * `write_csv2()` now properly respects the `na` argument (#928) * Fixes compilation with multiple architectures on linux (#922). * Fixes compilation with R < 3.3.0 # readr 1.2.1 This release skips the clipboard tests on CRAN servers # readr 1.2.0 ## Breaking Changes ### Integer column guessing readr functions no longer guess columns are of type integer, instead these columns are guessed as numeric. Because R uses 32 bit integers and 64 bit doubles all integers can be stored in doubles, guaranteeing no loss of information. This change was made to remove errors when numeric columns were incorrectly guessed as integers. If you know a certain column is an integer and would like to read them as such you can do so by specifying the column type explicitly with the `col_types` argument. ### Blank line skipping readr now always skips blank lines automatically when parsing, which may change the number of lines you need to pass to the `skip` parameter. For instance if your file had a one blank line then two more lines you want to skip previously you would pass `skip = 3`, now you only need to pass `skip = 2`. ## New features ### Melt functions There is now a family of `melt_*()` functions in readr. These functions store data in 'long' or 'melted' form, where each row corresponds to a single value in the dataset. This form is useful when your data is ragged and not rectangular. ``` r data <-"a,b,c 1,2 w,x,y,z" readr::melt_csv(data) #> # A tibble: 9 x 4 #> row col data_type value #> #> 1 1 1 character a #> 2 1 2 character b #> 3 1 3 character c #> 4 2 1 integer 1 #> 5 2 2 integer 2 #> 6 3 1 character w #> 7 3 2 character x #> 8 3 3 character y #> 9 3 4 character z ``` Thanks to Duncan Garmonsway (@nacnudus) for great work on the idea an implementation of the `melt_*()` functions! ### Connection improvements readr 1.2.0 changes how R connections are parsed by readr. In previous versions of readr the connections were read into an in-memory raw vector, then passed to the readr functions. This made reading connections from small to medium datasets fast, but also meant that the dataset had to fit into memory at least twice (once for the raw data, once for the parsed data). It also meant that reading could not begin until the full vector was read through the connection. Now we instead write the connection to a temporary file (in the R temporary directory), than parse that temporary file. This means connections may take a little longer to be read, but also means they will no longer need to fit into memory. It also allows the use of the chunked readers to process the data in parts. 
Future improvements to readr would allow it to parse data from connections in a streaming fashion, which would avoid many of the drawbacks of either method. ### Additional new features * `melt_*()` functions added for reading ragged data (#760, @nacnudus). * `AccumulateCallback` R6 class added to provide an example of accumulating values in a single result (#689, @blakeboswell). * `read_fwf()` can now accept overlapping field specifications (#692, @gergness) * `type_convert()` now allows character column specifications and also silently skips non-character columns (#369, #699) * The `parse_*()` functions and `read_fwf()` gain a `trim_ws` argument to control whether the fields should be trimmed before parsing (#636, #735). * `parse_number()` now parses numbers in scientific notation using `e` and `E` (#684, @sambrady3). * Add `write_excel_csv2()` function to allow writing csv files with comma as a decimal separator and semicolon as a column separator (#753, @olgamie). * `read_*()` files now support reading from the clipboard by using `clipboard()` (#656). * `write_file()` gains a `sep` argument, to specify the line separator (#665). * Allow files to be read via FTP over SSH by recognising `sftp` as a URL protocol (#707, @jdeboer). * `parse_date*() accepts `%a` for local day of week (#763, @tigertoes). * Added function `read_lines_raw_chunked()` (#710, @gergness) * `write_csv2()` added to complement `write_excel_csv2()` and allow writing csv file readable by `read_csv2()` (#870, @cderv). * `as.col_spec()` is now exported (#517). * `write*()` functions gain a `quote_escape` argument to control how quotes are escaped in the output (#854). * `read*()` functions now have a more informative error when trying to read a remote bz2 file (#891). * `spec_table2()` function added to correspond to `read_table2()` (#778, @mawds). * `parse_factor()` now has `levels = NULL` by default (#862, @mikmart). * `"f"` can now be used as a shortcode for `col_factor()` in `cols()` and the `col_types` argument to `read_delim()` and friends (#810, @mikmart). * Functions now read connections to a temporary file rather than to an in-memory object (#610, #76). ## Bug Fixes * `standardise_path()` now uses a case-insensitive comparison for the file extensions (#794). * `parse_guess()` now guesses logical types when given (lowercase) 'true' and 'false' inputs (#818). * `read_*()` now do not print a progress bar when running inside a RStudio notebook chunk (#793) * `read_table2()` now skips comments anywhere in the file (#908). * `parse_factor()` now handles the case of empty strings separately, so you can have a factor level that is an empty string (#864). * `read_delim()` now correctly reads quoted headers with embedded newlines (#784). * `fwf_positions()` now always returns `col_names` as a character (#797). * `format_*()` now explicitly marks it's output encoding as UTF-8 (#697). * `read_delim()` now ignores whitespace between the delimiter and quoted fields (#668). * `read_table2()` now properly ignores blank lines at the end of a file like `read_table()` and `read_delim()` (#657). * `read_delim()`, `read_table()` and `read_table()` now skip blank lines at the start of a file (#680, #747). * `guess_parser()` now guesses a logical type for columns which are all missing. This is useful when binding multiple files together where some files have missing columns. (#662). * Column guessing will now never guess an integer type. 
This avoids issues where double columns are incorrectly guessed as integers if they have only integer values in the first 1000 (#645, #652). * `read_*()` now converts string `file`s to UTF-8 before parsing, which is convenient for non-UTF-8 platforms in most cases (#730, @yutannihilation). * `write_csv()` writes integers up to 10^15 without scientific notation (#765, @zeehio) * `read_*()` no longer throws a "length of NULL cannot be changed" warning when trying to resize a skipped column (#750, #833). * `read_*()` now handles non-ASCII paths properly with R >=3.5.0 on Windows (#838, @yutannihilation). * `read*()`'s `trim_ws` parameter now trims both spaces and tabs (#767) # readr 1.1.1 * Point release for test compatibility with tibble v1.3.1. * Fixed undefined behavior in localtime.c when using `locale(tz = "")` after loading a timezone due to incomplete reinitialization of the global locale. # readr 1.1.0 ## New features ### Parser improvements * `parse_factor()` gains a `include_na` argument, to include `NA` in the factor levels (#541). * `parse_factor()` will now can accept `levels = NULL`, which allows one to generate factor levels based on the data (like stringsAsFactors = TRUE) (#497). * `parse_numeric()` now returns the full string if it contains no numbers (#548). * `parse_time()` now correctly handles 12 AM/PM (#579). * `problems()` now returns the file path in additional to the location of the error in the file (#581). * `read_csv2()` gives a message if it updates the default locale (#443, @krlmlr). * `read_delim()` now signals an error if given an empty delimiter (#557). * `write_*()` functions witting whole number doubles are no longer written with a trailing `.0` (#526). ### Whitespace / fixed width improvements * `fwf_cols()` allows for specifying the `col_positions` argument of `read_fwf()` with named arguments of either column positions or widths (#616, @jrnold). * `fwf_empty()` gains an `n` argument to control how many lines are read for whitespace to determine column structure (#518, @Yeedle). * `read_fwf()` gives error message if specifications have overlapping columns (#534, @gergness) * `read_table()` can now handle `pipe()` connections (#552). * `read_table()` can now handle files with many lines of leading comments (#563). * `read_table2()` which allows any number of whitespace characters as delimiters, a more exact replacement for `utils::read.table()` (#608). ## Writing to connections * `write_*()` functions now support writing to binary connections. In addition output filenames with `.gz`, `.bz2` or `.xz` will automatically open the appropriate connection and to write the compressed file. (#348) * `write_lines()` now accepts a list of raw vectors (#542). ## Miscellaneous features * `col_euro_double()`, `parse_euro_double()`, `col_numeric()`, and `parse_numeric()` have been removed. * `guess_encoding()` returns a tibble, and works better with lists of raw vectors (as returned by `read_lines_raw()`). * `ListCallback` R6 Class to provide a more flexible return type for callback functions (#568, @mmuurr) * `tibble::as.tibble()` now used to construct tibbles (#538). * `read_csv`, `read_csv2`, and `read_tsv` gain a `quote` argument, (#631, @noamross) ## Bugfixes * `parse_factor()` now converts data to UTF-8 based on the supplied locale (#615). * `read_*()` functions with the `guess_max` argument now throw errors on inappropriate inputs (#588). * `read_*_chunked()` functions now properly end the stream if `FALSE` is returned from the callback. 
* `read_delim()` and `read_fwf()` when columns are skipped using `col_types` now report the correct column name (#573, @cb4ds). * `spec()` declarations that are long now print properly (#597). * `read_table()` does not print `spec` when `col_types` is not `NULL` (#630, @jrnold). * `guess_encoding()` now returns a tibble for all ASCII input as well (#641). # readr 1.0.0 ## Column guessing The process by which readr guesses the types of columns has received a substantial overhaul to make it easier to fix problems when the initial guesses aren't correct, and to make it easier to generate reproducible code. Now column specifications are printing by default when you read from a file: ```R challenge <- read_csv(readr_example("challenge.csv")) #> Parsed with column specification: #> cols( #> x = col_integer(), #> y = col_character() #> ) ``` And you can extract those values after the fact with `spec()`: ```R spec(challenge) #> cols( #> x = col_integer(), #> y = col_character() #> ) ``` This makes it easier to quickly identify parsing problems and fix them (#314). If the column specification is long, the new `cols_condense()` is used to condense the spec by identifying the most common type and setting it as the default. This is particularly useful when only a handful of columns have a different type (#466). You can also generating an initial specification without parsing the file using `spec_csv()`, `spec_tsv()`, etc. Once you have figured out the correct column types for a file, it's often useful to make the parsing strict. You can do this either by copying and pasting the printed output, or for very long specs, saving the spec to disk with `write_rds()`. In production scripts, combine this with `stop_for_problems()` (#465): if the input data changes form, you'll fail fast with an error. You can now also adjust the number of rows that readr uses to guess the column types with `guess_max`: ```R challenge <- read_csv(readr_example("challenge.csv"), guess_max = 1500) #> Parsed with column specification: #> cols( #> x = col_double(), #> y = col_date(format = "") #> ) ``` You can now access the guessing algorithm from R. `guess_parser()` will tell you which parser readr will select for a character vector (#377). We've made a number of fixes to the guessing algorithm: * New example `extdata/challenge.csv` which is carefully created to cause problems with the default column type guessing heuristics. * Blank lines and lines with only comments are now skipped automatically without warning (#381, #321). * Single '-' or '.' are now parsed as characters, not numbers (#297). * Numbers followed by a single trailing character are parsed as character, not numbers (#316). * We now guess at times using the `time_format` specified in the `locale()`. We have made a number of improvements to the reification of the `col_types`, `col_names` and the actual data: * If `col_types` is too long, it is subsetted correctly (#372, @jennybc). * If `col_names` is too short, the added names are numbered correctly (#374, @jennybc). * Missing column name names are now given a default name (`X2`, `X7` etc) (#318). Duplicated column names are now deduplicated. Both changes generate a warning; to suppress it supply an explicit `col_names` (setting `skip = 1` if there's an existing ill-formed header). * `col_types()` accepts a named list as input (#401). ## Column parsing The date time parsers recognise three new format strings: * `%I` for 12 hour time format (#340). * `%AD` and `%AT` are "automatic" date and time parsers. 
They are both slightly less flexible than previous defaults. The automatic date parser requires a four digit year, and only accepts `-` and `/` as separators (#442). The flexible time parser now requires colons between hours and minutes and optional seconds (#424). `%y` and `%Y` are now strict and require 2 or 4 characters respectively. Date and time parsing functions received a number of small enhancements: * `parse_time()` returns `hms` objects rather than a custom `time` class (#409). It now correctly parses missing values (#398). * `parse_date()` returns a numeric vector (instead of an integer vector) (#357). * `parse_date()`, `parse_time()` and `parse_datetime()` gain an `na` argument to match all other parsers (#413). * If the format argument is omitted `parse_date()` or `parse_time()`, date and time formats specified in the locale will be used. These now default to `%AD` and `%AT` respectively. * You can now parse partial dates with `parse_date()` and `parse_datetime()`, e.g. `parse_date("2001", "%Y")` returns `2001-01-01`. `parse_number()` is slightly more flexible - it now parses numbers up to the first ill-formed character. For example `parse_number("-3-")` and `parse_number("...3...")` now return -3 and 3 respectively. We also fixed a major bug where parsing negative numbers yielded positive values (#308). `parse_logical()` now accepts `0`, `1` as well as lowercase `t`, `f`, `true`, `false`. ## New readers and writers * `read_file_raw()` reads a complete file into a single raw vector (#451). * `read_*()` functions gain a `quoted_na` argument to control whether missing values within quotes are treated as missing values or as strings (#295). * `write_excel_csv()` can be used to write a csv file with a UTF-8 BOM at the start, which forces Excel to read it as UTF-8 encoded (#375). * `write_lines()` writes a character vector to a file (#302). * `write_file()` to write a single character or raw vector to a file (#474). * Experimental support for chunked reading a writing (`read_*_chunked()`) functions. The API is unstable and subject to change in the future (#427). ## Minor features and bug fixes * Printing double values now uses an [implementation](https://github.com/juj/MathGeoLib/blob/master/src/Math/grisu3.c) of the [grisu3 algorithm](http://www.cs.tufts.edu/~nr/cs257/archive/florian-loitsch/printf.pdf) which speeds up writing of large numeric data frames by ~10X. (#432) '.0' is appended to whole number doubles, to ensure they will be read as doubles as well. (#483) * readr imports tibble so that you get consistent `tbl_df` behaviour (#317, #385). * New example `extdata/challenge.csv` which is carefully created to cause problems with the default column type guessing heuristics. * `default_locale()` now sets the default locale in `readr.default_locale` rather than regenerating it for each call. (#416). * `locale()` now automatically sets decimal mark if you set the grouping mark. It throws an error if you accidentally set decimal and grouping marks to the same character (#450). * All `read_*()` can read into long vectors, substantially increasing the number of rows you can read (#309). * All `read_*()` functions return empty objects rather than signaling an error when run on an empty file (#356, #441). 
* `read_delim()` gains a `trim_ws` argument (#312, noamross) * `read_fwf()` received a number of improvements: * `read_fwf()` now can now reliably read only a partial set of columns (#322, #353, #469) * `fwf_widths()` accepts negative column widths for compatibility with the `widths` argument in `read.fwf()` (#380, @leeper). * You can now read fixed width files with ragged final columns, by setting the final end position in `fwf_positions()` or final width in `fwf_widths()` to `NA` (#353, @ghaarsma). `fwf_empty()` does this automatically. * `read_fwf()` and `fwf_empty()` can now skip commented lines by setting a `comment` argument (#334). * `read_lines()` ignores embedded null's in strings (#338) and gains a `na` argument (#479). * `readr_example()` makes it easy to access example files bundled with readr. * `type_convert()` now accepts only `NULL` or a `cols` specification for `col_types` (#369). * `write_delim()` and `write_csv()` now invisibly return the input data frame (as documented, #363). * Doubles are parsed with `boost::spirit::qi::long_double` to work around a bug in the spirit library when parsing large numbers (#412). * Fix bug when detecting column types for single row files without headers (#333). # readr 0.2.2 * Fix bug when checking empty values for missingness (caused valgrind issue and random crashes). # readr 0.2.1 * Fixes so that readr works on Solaris. # readr 0.2.0 ## Internationalisation readr now has a strategy for dealing with settings that vary from place to place: locales. The default locale is still US centric (because R itself is), but you can now easily override the default timezone, decimal separator, grouping mark, day & month names, date format, and encoding. This has lead to a number of changes: * `read_csv()`, `read_tsv()`, `read_fwf()`, `read_table()`, `read_lines()`, `read_file()`, `type_convert()`, `parse_vector()` all gain a `locale` argument. * `locale()` controls all the input settings that vary from place-to-place. * `col_euro_double()` and `parse_euro_double()` have been deprecated. Use the `decimal_mark` parameter to `locale()` instead. * The default encoding is now UTF-8. To load files that are not in UTF-8, set the `encoding` parameter of the `locale()` (#40). New `guess_encoding()` function uses stringi to help you figure out the encoding of a file. * `parse_datetime()` and `parse_date()` with `%B` and `%b` use the month names (full and abbreviate) defined in the locale (#242). They also inherit the tz from the locale, rather than using an explicit `tz` parameter. See `vignette("locales")` for more details. ## File parsing improvements * `cols()` lets you pick the default column type for columns not otherwise explicitly named (#148). You can refer to parsers either with their full name (e.g. `col_character()`) or their one letter abbreviation (e.g. `c`). * `cols_only()` allows you to load only named columns. You can also choose to override the default column type in `cols()` (#72). * `read_fwf()` is now much more careful with new lines. If a line is too short, you'll get a warning instead of a silent mistake (#166, #254). Additionally, the last column can now be ragged: the width of the last field is silently extended until it hits the next line break (#146). This appears to be a common feature of "fixed" width files in the wild. * In `read_csv()`, `read_tsv()`, `read_delim()` etc: * `comment` argument allows you to ignore comments (#68). * `trim_ws` argument controls whether leading and trailing whitespace is removed. 
It defaults to `TRUE` (#137). * Specifying the wrong number of column names, or having rows with an unexpected number of columns, generates a warning, rather than an error (#189). * Multiple NA values can be specified by passing a character vector to `na` (#125). The default has been changed to `na = c("", "NA")`. Specifying `na = ""` now works as expected with character columns (#114). ## Column parsing improvements Readr gains `vignette("column-types")` which describes how the defaults work and how to override them (#122). * `parse_character()` gains better support for embedded nulls: any characters after the first null are dropped with a warning (#202). * `parse_integer()` and `parse_double()` no longer silently ignore trailing letters after the number (#221). * New `parse_time()` and `col_time()` allows you to parse times (hours, minutes, seconds) into number of seconds since midnight. If the format is omitted, it uses a flexible parser that looks for hours, then optional colon, then minutes, then optional colon, then optional seconds, then optional am/pm (#249). * `parse_date()` and `parse_datetime()`: * `parse_datetime()` no longer incorrectly reads partial dates (e.g. 19, 1900, 1900-01) (#136). These triggered common false positives and after re-reading the ISO8601 spec, I believe they actually refer to periods of time, and should not be translated in to a specific instant (#228). * Compound formats "%D", "%F", "%R", "%X", "%T", "%x" are now parsed correctly, instead of using the ISO8601 parser (#178, @kmillar). * "%." now requires a non-digit. New "%+" skips one or more non-digits. * You can now use `%p` to refer to AM/PM (and am/pm) (#126). * `%b` and `%B` formats (month and abbreviated month name) ignore case when matching (#219). * Local (non-UTC) times with and without daylight savings are now parsed correctly (#120, @andres-s). * `parse_number()` is a somewhat flexible numeric parser designed to read currencies and percentages. It only reads the first number from a string (using the grouping mark defined by the locale). * `parse_numeric()` has been deprecated because the name is confusing - it's a flexible number parser, not a parser of "numerics", as R collectively calls doubles and integers. Use `parse_number()` instead. As well as improvements to the parser, I've also made a number of tweaks to the heuristics that readr uses to guess column types: * New `parse_guess()` and `col_guess()` to explicitly guess column type. * Bumped up row inspection for column typing guessing from 100 to 1000. * The heuristics for guessing `col_integer()` and `col_double()` are stricter. Numbers with leading zeros now default to being parsed as text, rather than as integers/doubles (#266). * A column is guessed as `col_number()` only if it parses as a regular number when you ignoring the grouping marks. ## Minor improvements and bug fixes * Now use R's platform independent `iconv` wrapper, thanks to BDR (#149). * Pathological zero row inputs (due to empty input, `skip` or `n_max`) now return zero row data frames (#119). * When guessing field types, and there's no information to go on, use character instead of logical (#124, #128). * Concise `col_types` specification now understands `?` (guess) and `-` (skip) (#188). * `count_fields()` starts counting from 1, not 0 (#200). * `format_csv()` and `format_delim()` make it easy to render a csv or delimited file into a string. * `fwf_empty()` now works correctly when `col_names` supplied (#186, #222). 
* `parse_*()` gains a `na` argument that allows you to specify which values should be converted to missing. * `problems()` now reports column names rather than column numbers (#143). Whenever there is a problem, the first five problems are printing out in a warning message, so you can more easily see what's wrong. * `read_*()` throws a warning instead of an error is `col_types` specifies a non-existent column (#145, @alyst). * `read_*()` can read from a remote gz compressed file (#163). * `read_delim()` defaults to `escape_backslash = FALSE` and `escape_double = TRUE` for consistency. `n_max` also affects the number of rows read to guess the column types (#224). * `read_lines()` gains a progress bar. It now also correctly checks for interrupts every 500,000 lines so you can interrupt long running jobs. It also correctly estimates the number of lines in the file, considerably speeding up the reading of large files (60s -> 15s for a 1.5 Gb file). * `read_lines_raw()` allows you to read a file into a list of raw vectors, one element for each line. * `type_convert()` gains `NA` and `trim_ws` arguments, and removes missing values before determining column types. * `write_csv()`, `write_delim()`, and `write_rds()` all invisibly return their input so you can use them in a pipe (#290). * `write_delim()` generalises `write_csv()` to write any delimited format (#135). `write_tsv()` is a helpful wrapper for tab separated files. * Quotes are only used when they're needed (#116): when the string contains a quote, the delimiter, a new line or NA. * Double vectors are saved using same amount of precision as `as.character()` (#117). * New `na` argument that specifies how missing values should be written (#187) * POSIXt vectors are saved in a ISO8601 compatible format (#134). * No longer fails silently if it can't open the target for writing (#193, #172). * `write_rds()` and `read_rds()` wrap around `readRDS()` and `saveRDS()`, defaulting to no compression (#140, @nicolasCoutin). 
readr/MD50000644000176200001440000003235614547623042011677 0ustar liggesusers7b992f970fd08ae47ef81db804ef4c70 *DESCRIPTION 5da5931c29930c5c2f246bdb6ac533d5 *LICENSE 181390224b0751104ec490b388d80af4 *NAMESPACE 5e8805a99fe11680b9727e84637e4155 *NEWS.md 4d8cd06e6accb7e97c45d9f1ec38b95d *R/POSIXct.R c4a9882b366636c6162f3499172bb601 *R/callback.R ab06bb4ec284e06d33c5000f48375512 *R/col_types.R b273432f35cf1924d8c66ad67bdfffcc *R/collectors.R f4175876bbfc73fa541258c7df658431 *R/count_fields.R 42836a935ecb8e19bf0973aecc65281f *R/cpp11.R 664734ec05fef8083e8e99e0969cdc9d *R/date-symbols.R 6a5c4b4be33ce436d738b615e6dfbd45 *R/edition.R 48ae89f3562f4ad46358974e056eb81a *R/encoding.R 1f721fafa9f86a652cd502c015b264f0 *R/example.R f6276d456b16d74e2ce78e0b0a4ca8ae *R/file.R c57b571a150eb7d68cfb9906b2c0b120 *R/lines.R d3b384b9ec9000344ff264aacb14189d *R/locale.R 71c78eb5e7320b10d438c6014f87d255 *R/melt_delim.R 52b6e06aba96e3147e43a4b70b380db5 *R/melt_delim_chunked.R 1095d2511bfff137fcb06a79b6fe0e42 *R/melt_fwf.R 9861f4a987f89d43580d1e6ea886c13c *R/melt_table.R 1958295f9a850379385eaa031d3eb042 *R/problems.R ee5ef2aabd0b436b16b325842da67eaa *R/rds.R 239f2ed89e20b9d6f72c915e997c6ef7 *R/read_builtin.R 4e422b27b695898302a3136369559bce *R/read_delim.R 92c90126ef1f8caf428405132437adf4 *R/read_delim_chunked.R 6fbb0eac0c28458d6cadc1c0154030f3 *R/read_fwf.R 9d26a1e7f3c63ac298ca16b636a0d561 *R/read_lines_chunked.R bf42eb6850593de7a20eba02e2c384f5 *R/read_log.R c4d467373274f4c888d4bf3d374afdf8 *R/read_table.R b8d1c307dd1007678179ebc90bda7538 *R/readr-package.R 39388dd058a4b400e13cda4f3729ca40 *R/source.R 7485ed94645922a8f68273c9d2532890 *R/sysdata.rda f550b04c631d41dacb7f995a63db3108 *R/tokenizer.R e69e4b718341e1fa4aa04f5509c5bd71 *R/type_convert.R 1f4946a84dc8b0b00de99ca813e2b74e *R/utils.R f641f5090e48ca89b54a34f79f399837 *R/write.R db37d10551d07f25040c9c82880c5e86 *R/zzz.R 7213c3fffde3c1e2de69ba1b16988696 *README.md a253321154cc9823fdab6c72cc2f0907 *build/vignette.rds a6eb050437c2432a413c35d8b4686c51 *inst/WORDLIST 5cc3e966211df74b79ab6673008b8c72 *inst/doc/column-types.R 7fa33a7676cfc63e3ebed6f998c105c3 *inst/doc/column-types.Rmd 9a1a448304831b04072293dbcf81157b *inst/doc/column-types.html aac6ac562216aa4f4f090d8a425c7e6f *inst/doc/locales.R 9f808ade76413bbe77863fbc24005e49 *inst/doc/locales.Rmd 60219dd172d9e0659862845a8f324733 *inst/doc/locales.html 45fb6b9a0a7c4778a9e252ae3eca6682 *inst/doc/readr.R 6314dcada7bf246a50adef898979ca0a *inst/doc/readr.Rmd a652c2f5373c74a17732b5ae9f693cc6 *inst/doc/readr.html b05a668d9a4de93b5dd397cfde7905b1 *inst/extdata/challenge.csv 0609bf64b5d30d1ad05edadde1b87b01 *inst/extdata/chickens.csv fa584bf0652806be23f04c9938ec0ec8 *inst/extdata/epa78.txt 9dc92f35a3293d75ce989e1e694a57c7 *inst/extdata/example.log 891bca40aeba031848809c3e587b20d7 *inst/extdata/fwf-sample.txt 0e5e0f32575cc33876c3db780f327708 *inst/extdata/massey-rating.txt d1865551a1e4363cebcfacf77917efc8 *inst/extdata/mini-gapminder-africa.csv e2608531de787cb9f0bbbdac3655898b *inst/extdata/mini-gapminder-americas.csv 9a0ce5032bdf9d0aca361c48d007aeb6 *inst/extdata/mini-gapminder-asia.csv 38bce5365b60510ef33191bfdcff20be *inst/extdata/mini-gapminder-europe.csv cee4b608d76039bd3826ec08319448d7 *inst/extdata/mini-gapminder-oceania.csv 5143f7b8ed70e91698d432d721c11a63 *inst/extdata/mtcars.csv 99100423693851c707ccdb228723ac83 *inst/extdata/mtcars.csv.bz2 d347f11bcaccca8806927c7a211a9640 *inst/extdata/mtcars.csv.zip 96dec15369e6c514702251b8c36e67c4 *inst/extdata/whitespace-sample.txt efcdbab736896f20d6022981dfe77c3c 
*man/Tokenizers.Rd 9712d25f783f4f214cf5ec9db940cbee *man/as.col_spec.Rd 07c6fa1c525b666aa02fa515bb37ba07 *man/callback.Rd d55eac5b542f5c85db2c2d7ca5b2d37f *man/clipboard.Rd bed465d43c09aff0c5771540ec522574 *man/col_skip.Rd 02e2985ec1664c29fbb8fc3919604dfd *man/cols.Rd a5d80bf335213f363a1f967f55157270 *man/count_fields.Rd 0e80e0e07af293dd9c0f596634c54a81 *man/datasource.Rd db2c4faf37ea766ce733224f6081fd62 *man/date_names.Rd 5274fb27331cf8696688556bd34d686a *man/edition_get.Rd 9c74d5657c2d4b25920297fce8cefc40 *man/encoding.Rd a1cbaf3f328e8d74e747faacf640c7fc *man/figures/lifecycle-archived.svg 6f521fb1819410630e279d1abf88685a *man/figures/lifecycle-defunct.svg 391f696f961e28914508628a7af31b74 *man/figures/lifecycle-deprecated.svg 691b1eb2aec9e1bec96b79d11ba5e631 *man/figures/lifecycle-experimental.svg 405e252e54a79b33522e9699e4e9051c *man/figures/lifecycle-maturing.svg f41ed996be135fb35afe00641621da61 *man/figures/lifecycle-questioning.svg 306bef67d1c636f209024cf2403846fd *man/figures/lifecycle-soft-deprecated.svg ed42e3fbd7cc30bc6ca8fa9b658e24a8 *man/figures/lifecycle-stable.svg 99f6e77d8cd1072e42e744a4950e143a *man/figures/lifecycle-superseded.svg a4859de3cd6dccad67b2c2568502f48a *man/figures/logo.png 273068431e8d15673138b0593c4e632a *man/format_delim.Rd 53dab8f77cf5599c61c4d4a0ca80738b *man/locale.Rd 58896571d04879abe41b296ed33269bd *man/melt_delim.Rd 64651fb09a564d1c73e617aa51f87b98 *man/melt_delim_chunked.Rd a88068f3b85e6c465782f384993a02d5 *man/melt_fwf.Rd f7405fa14cf76ad26c4539c61c9c641a *man/melt_table.Rd 9244495b29b82a02a5f3c78e2f0ae2c8 *man/output_column.Rd 350126b3e78984ee47ab117dff891b25 *man/parse_atomic.Rd 2b1997360681a6743ae259ec2bd08f83 *man/parse_datetime.Rd 2f53a785ae4d8b3fbff836ce462dab3a *man/parse_factor.Rd 7707bad21bb67e009406ce11515e967c *man/parse_guess.Rd 6ed11b3acbbc0d903d678fa0314e58fa *man/parse_number.Rd 5b9b1209af75b61b7c6505b221cc28ec *man/parse_vector.Rd 953c736f23d59d3c67e264a167793e93 *man/problems.Rd c61754e95b1d7970302370734559cf8b *man/read_builtin.Rd 1d1005136ad4ea98f068dec378116f12 *man/read_delim.Rd 14a6bfe8779ef79a3fa47061f3cac398 *man/read_delim_chunked.Rd 40e668f3acceccc306661bf7b9e46cfc *man/read_file.Rd 13ef1b13ec5d6e94652d484fc544aad0 *man/read_fwf.Rd 6234a033e604d5c0e0dec1e41fdcb7a2 *man/read_lines.Rd 0686805112cf10fe16bdd43c247c4b32 *man/read_lines_chunked.Rd c2b5534ec3c95ce151def215006bbe4a *man/read_log.Rd 6cbe9f86599a7c22a612bd922545162f *man/read_rds.Rd bea7116c3ab41af95d100d6a38d6d32b *man/read_table.Rd ffc10f074b4e6670014ac0024483a39c *man/read_table2.Rd 6a39fd4445f2dec77d8c316c7a97ec2b *man/readr-package.Rd 12c4ee7d59e87dd0ad45f71fbab2bff1 *man/readr_example.Rd cd84a7237f6d89813118176c300c1805 *man/readr_threads.Rd 3206ec42fa70585faa8154462d840eb9 *man/should_read_lazy.Rd d798480a8cbc5297eb33c93cfac600ad *man/should_show_types.Rd e0584a335243335a85da766d9ece841d *man/show_progress.Rd f4b762b6576fec3d5b31b41dba916382 *man/spec.Rd 113fa54a8cffc050facbe649b3157d1a *man/spec_delim.Rd c3b5bbc4a152d2f085d8d7b12ff99955 *man/tokenize.Rd a9a7f50b5373a5368234c63ae4b34bc1 *man/type_convert.Rd ac58292fc5d19e5f68bc5391e77cc8fa *man/with_edition.Rd 670e755a1aa95e57c10154682219c0f2 *man/write_delim.Rd 26dfccb471c985640d9160080aed7f33 *src/Collector.cpp 64a71882b65a478619918801c4af38bc *src/Collector.h 8c1ac53423f826c66f912b783299de2d *src/CollectorGuess.cpp 06f7969e70359bbf1a42daa13d6e728e *src/DateTime.h 56b54319923e88a2cbe8279b8bf0339c *src/DateTimeParser.h 55aeda90bd832edef30dc15f18980311 *src/Iconv.cpp f004d325dcf25ec6771e2ba892a041b8 *src/Iconv.h 
24d51dce9a28eadf29c4a84e9c2eaa4a *src/LocaleInfo.cpp 6d9b773ac972f0cabaeb2ebab8785af9 *src/LocaleInfo.h 483f4a0880e8a99ee277b5db12a7dc46 *src/Progress.h fda894c39b72e809d95acf808f06f81f *src/QiParsers.h 00fb93249294298bbb14c756b907a239 *src/Reader.cpp f96f1e65fda29a3f7df5311c02604224 *src/Reader.h 6681e87e375b0692b70dc2555b960f03 *src/Source.cpp 29a432d2eccdb129c5492729f1bd7d6d *src/Source.h 022fccc9fa4b5d626a2cf0b81339c729 *src/SourceFile.h b7f65e7ba80b88a331d6fd754d151e8c *src/SourceRaw.h 55fbb3e5bba978d130c43d5f61f1d1e3 *src/SourceString.h 4427d16f075f37b30c86f0d9f5f09636 *src/Token.h 3ec1d1a217a30aaa9ea0bab0001c4845 *src/Tokenizer.cpp 6e11af298a584f38f3a57bf8cfdec3dd *src/Tokenizer.h 11f13f2bc63160678b69841d31560142 *src/TokenizerDelim.cpp 7e60760f23d91ee68b3ef8b999330510 *src/TokenizerDelim.h e036ec57d62dcb56a67046d6a248cb14 *src/TokenizerFwf.cpp 608b56741aec48c088e6d2ef96713129 *src/TokenizerFwf.h 3bcae443bffe42e64341a3282c51d51c *src/TokenizerLine.h 3584a6f59cffde273ac856cdc38d8d14 *src/TokenizerLog.h 54cbc5437f5120682996851e867d50fc *src/TokenizerWs.cpp cc85d1824185254a42e051d675f27534 *src/TokenizerWs.h 28bb94725d3ee08ee38e120efce63349 *src/Warnings.h eea1893f813b05fbe0de1978472cb87c *src/connection.cpp 9d5a0829db4842475e18569f2917aa8b *src/connection.h a4b9cc1efd2bdd05f98c3b55c4e8f1b1 *src/cpp11.cpp 276c8bb5bb6493d3f6d7024b68af9291 *src/datetime.cpp 5fa30517d92787ad11e7c9814c7ea405 *src/grisu3.c d763bbf07076d54cf56b950534f343ab *src/grisu3.h 02cfcca205b4203604b7aeea14b5f840 *src/init.c e2a59af967b5fa630c6d8eda31162305 *src/mio.h cfe9e9f07040af5bc966e9714d3c68b8 *src/parse.cpp 28726d9b786f03777b7e556aeb765d30 *src/read.cpp d4f598925ccf73665c010357051e7979 *src/type_convert.cpp d261cb4daf619cef259029b993429e7e *src/tzfile.h f675245a4ae6615793764ec0aa900bd8 *src/unicode_fopen.h 192ca2fc04ecf41702866937c380c755 *src/utils.h 3ce9c02a81e7e6212f5dfb0ebf9adba3 *src/write.cpp 4fc5e077429295459f2b439dca452898 *src/write_delim.cpp e3165e94965cabeac005242eb872ba13 *tests/first_edition.R ac6e89c9cad51c62f8f6afeeda497df7 *tests/second_edition.R d4bbddb01053b52c56f14f47a306e0c1 *tests/spelling.R b567ea6fdf9b64daed6d159ba949a317 *tests/testthat/_snaps/col-spec.md f538edeaf4b26ad03eb76464787cd9c8 *tests/testthat/_snaps/edition-1/col-spec.md fb63420c0c9d31926bc4014b69a7f211 *tests/testthat/_snaps/edition-1/read-csv.md e028c99283782efccfcd745b0db83ede *tests/testthat/_snaps/edition-2/col-spec.md 1ec36c418f390759050ddfd7a0b069b8 *tests/testthat/_snaps/edition-2/read-csv.md 814662da769a6fa481a46fbcadcee20c *tests/testthat/_snaps/utils.md 2686557b47e277b9177c830ce874eb22 *tests/testthat/basic-df-singlequote.csv d3d05c4f078dc2bf4c3640dd6a36db7b *tests/testthat/basic-df.csv b82551970134f15ac554f56c12a3717b *tests/testthat/colour-test d41d8cd98f00b204e9800998ecf8427e *tests/testthat/empty-file 7c924d6682b55bb601d4e5b428123709 *tests/testthat/enc-iso-8859-1.txt a06a26f43f86d0d2badd0c1c8c43ebf4 *tests/testthat/eol-cr.csv e55dde023260053db920dacbb2648d68 *tests/testthat/eol-cr.txt 87ad70e2779bf2fe683df5922e4a76a9 *tests/testthat/eol-cr.txt.bz2 bdb17292feb64034e5eb2924d5862801 *tests/testthat/eol-cr.txt.gz 4681b3bd5b571d733e085743fd59397d *tests/testthat/eol-cr.txt.xz d5b4be352f40c106430d43c5e861152d *tests/testthat/eol-cr.txt.zip 403913f0469f686e762c722326f8859b *tests/testthat/eol-crlf.csv e55dde023260053db920dacbb2648d68 *tests/testthat/eol-crlf.txt 920aabc4d3eabf4f3709c8aefcddff55 *tests/testthat/eol-lf.csv e55dde023260053db920dacbb2648d68 *tests/testthat/eol-lf.txt 2b4d8a640b79cf108e795e2a81a9cb4b 
*tests/testthat/fwf-trailing.txt a03c201d4dd375ced7da830dfc3405d6 *tests/testthat/helper.R 5511d4e73899e23e91ac8439de2905eb *tests/testthat/non-tabular.csv 2e5a6ac9fca4e989ef83a3a090da9099 *tests/testthat/null-file ea427f49d3ef99f68cc3902c7d317a87 *tests/testthat/raw.csv 0add241c7230a0eec1d1d516b0c52264 *tests/testthat/sample_text.txt dfde01c0efdea5fedf9038b4fabf10fc *tests/testthat/setup.R e7f02a0756584926b37a9d56d6984d3e *tests/testthat/table-crash f773d2b1f06201c96d541ce025e4ab13 *tests/testthat/teardown.R a2ed514c4adc74efded5bd972bac8140 *tests/testthat/test-col-spec.R 00a43f8b2190bf54898e7b2b1a0cbc6b *tests/testthat/test-collectors.R a18d7f0c79d55fcbc1e9e099e865264f *tests/testthat/test-encoding.R 5345d9a259159e8f05c475f02856343e *tests/testthat/test-eol.R aa456a19bc1c4a4725713a6b39c88332 *tests/testthat/test-locale.R 19938ab6cf3b87fabd2dfbfaf3c7890c *tests/testthat/test-melt-chunked.R 663ae658d938f70925de64f536c64b26 *tests/testthat/test-melt-csv.R 1ae0c4489af8e5f749ea83105edda54b *tests/testthat/test-melt-fwf.R 7e949331d936f27dd9378f36ca34a607 *tests/testthat/test-melt-table.R 12a9213ed32270255330fd0e974b1a43 *tests/testthat/test-non-ascii-1152.rds 0fcedc242505ac179d110acb48b10645 *tests/testthat/test-parsing-character.R 4bee9005b39ba3ff4bf825cdbda204a4 *tests/testthat/test-parsing-count-fields.R 07ef434341b8f8a0144eca0361877967 *tests/testthat/test-parsing-datetime.R c4227c91115b584fa359f00186e9104f *tests/testthat/test-parsing-factors.R 4f95ea1f94ccb7cfcdc556fc79641845 *tests/testthat/test-parsing-logical.R e2b022f659af6defb1b3348c1fbf90f2 *tests/testthat/test-parsing-numeric.R 504d718f019bd871966bcda872b01be0 *tests/testthat/test-parsing-time.R 87af2525283c48d8221f4645b32a062a *tests/testthat/test-parsing.R 80ec25bd92b0b69d6166230a0d141104 *tests/testthat/test-problems.R 17071977b0541ba2d1ff3ea686971a66 *tests/testthat/test-read-builtin.R 75a10fcca7b69d1547059fc4cb6b441c *tests/testthat/test-read-chunked.R 2ff78b9e0c602ff670769ac2982aaed7 *tests/testthat/test-read-csv.R 126f0fa39095e9efc0faf6359f0c0eb8 *tests/testthat/test-read-file.R 98e0a2cfd3c739ee4ff8b2d7004c67eb *tests/testthat/test-read-fwf.R f70b6bb4f3c57915a448a84af089c0b2 *tests/testthat/test-read-lines.R 3d626088f57857d99ee9a1a75e18137c *tests/testthat/test-read-table.R a060c46dd5630f136e1b72e2550f310e *tests/testthat/test-read_log.R e4910c0dad594f9e3905506f6f113381 *tests/testthat/test-source.R eb189a2342aa2fe006b8174d0767cc61 *tests/testthat/test-tokenizer-delim.R 7f57e20cdfe351b6fba5a5a6f7ec9c73 *tests/testthat/test-type-convert.R 5903e2cdde5e9c09744c0a32e94ca285 *tests/testthat/test-utils.R 02f179d5e3f4e32895bbbc9dead6b764 *tests/testthat/test-write-lines.R 0ecb1ead4814e301caa4d9c6cb3ec1a4 *tests/testthat/test-write.R b85a2395369b3b3cf1882bf5fd3f7007 *tests/testthat/test_list_col_name.csv 7fa33a7676cfc63e3ebed6f998c105c3 *vignettes/column-types.Rmd 9f808ade76413bbe77863fbc24005e49 *vignettes/locales.Rmd 6314dcada7bf246a50adef898979ca0a *vignettes/readr.Rmd readr/inst/0000755000176200001440000000000014547603063012334 5ustar liggesusersreadr/inst/doc/0000755000176200001440000000000014547603063013101 5ustar liggesusersreadr/inst/doc/column-types.Rmd0000644000176200001440000000524514371264576016221 0ustar liggesusers--- title: "Column type" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Column type} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ``` This vignette provides an overview of column type 
specification with readr. Currently it focuses on how automatic guessing works, but over time we expect to cover more topics. ```{r setup} library(readr) ``` ## Automatic guessing If you don't explicitly specify column types with the `col_types` argument, readr will attempt to guess them using some simple heuristics. By default, it will inspect 1000 values, evenly spaced from the first to the last row. This is a heuristic designed to always be fast (no matter how large your file is) and, in our experience, does a good job in most cases. If needed, you can request that readr use more rows by supplying the `guess_max` argument. You can even supply `guess_max = Inf` to use every row to guess the column types. You might wonder why this isn't the default. That's because it's slow: it has to look at every column twice, once to determine the type and once to parse the value. In most cases, you're best off supplying `col_types` yourself. ### Legacy behavior Column type guessing was substantially worse in the first edition of readr (meaning, prior to v2.0.0), because it always looked at the first 1000 rows, and through some application of Murphy's Law, it appears that many real csv files have lots of empty values at the start, followed by more "excitement" later in the file. Let's demonstrate the problem with a slightly tricky file: the column `x` is mostly empty, but has some numeric data at the very end, in row 1001. ```{r} tricky_dat <- tibble::tibble( x = rep(c("", "2"), c(1000, 1)), y = "y" ) tfile <- tempfile("tricky-column-type-guessing-", fileext = ".csv") write_csv(tricky_dat, tfile) ``` The first edition parser doesn't guess the right type for `x` so the `2` becomes an `NA`: ```{r} df <- with_edition(1, read_csv(tfile)) tail(df) ``` For this specific case, we can fix the problem by marginally increasing `guess_max`: ```{r} df <- with_edition(1, read_csv(tfile, guess_max = 1001)) tail(df) ``` Unlike the second edition, we don't recommend using `guess_max = Inf` with the legacy parser, because the engine pre-allocates a large amount of memory in the face of this uncertainty. This means that reading with `guess_max = Inf` can be extremely slow and might even crash your R session. Instead, specify the `col_types`: ```{r} df <- with_edition(1, read_csv(tfile, col_types = list(x = col_double()))) tail(df) ``` ```{r} #| include: false file.remove(tfile) ``` readr/inst/doc/locales.R0000644000176200001440000000743014547603062014651 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- library(readr) knitr::opts_chunk$set(collapse = TRUE, comment = "#>") ## ----------------------------------------------------------------------------- locale() ## ----------------------------------------------------------------------------- locale("ko") # Korean locale("fr") # French ## ----------------------------------------------------------------------------- parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr")) parse_date("14 oct.
1979", "%d %b %Y", locale = locale("fr")) ## ----------------------------------------------------------------------------- parse_date("1 août 2015", "%d %B %Y", locale = locale("fr")) parse_date("1 aout 2015", "%d %B %Y", locale = locale("fr", asciify = TRUE)) ## ----------------------------------------------------------------------------- maori <- locale(date_names( day = c("Rātapu", "Rāhina", "Rātū", "Rāapa", "Rāpare", "Rāmere", "Rāhoroi"), mon = c("Kohi-tātea", "Hui-tanguru", "Poutū-te-rangi", "Paenga-whāwhā", "Haratua", "Pipiri", "Hōngongoi", "Here-turi-kōkā", "Mahuru", "Whiringa-ā-nuku", "Whiringa-ā-rangi", "Hakihea") )) ## ----------------------------------------------------------------------------- parse_datetime("2001-10-10 20:10") parse_datetime("2001-10-10 20:10", locale = locale(tz = "Pacific/Auckland")) parse_datetime("2001-10-10 20:10", locale = locale(tz = "Europe/Dublin")) ## ----eval = FALSE------------------------------------------------------------- # is_datetime <- sapply(df, inherits, "POSIXct") # df[is_datetime] <- lapply(df[is_datetime], function(x) { # attr(x, "tzone") <- "UTC" # x # }) ## ----------------------------------------------------------------------------- str(parse_guess("2010-10-10")) str(parse_guess("2010/10/10")) ## ----------------------------------------------------------------------------- str(parse_guess("01/31/2013")) str(parse_guess("01/31/2013", locale = locale(date_format = "%m/%d/%Y"))) ## ----------------------------------------------------------------------------- str(parse_guess("17:55:14")) str(parse_guess("5:55:14 PM")) # Example of a non-standard time str(parse_guess("h5m55s14 PM")) str(parse_guess("h5m55s14 PM", locale = locale(time_format = "h%Hm%Ms%S %p"))) ## ----------------------------------------------------------------------------- library(stringi) x <- "Émigré cause célèbre déjà vu.\n" y <- stri_conv(x, "UTF-8", "latin1") # These strings look like they're identical: x y identical(x, y) # But they have different encodings: Encoding(x) Encoding(y) # That means while they print the same, their raw (binary) # representation is actually quite different: charToRaw(x) charToRaw(y) # readr expects strings to be encoded as UTF-8. If they're # not, you'll get weird characters parse_character(x) parse_character(y) # If you know the encoding, supply it: parse_character(y, locale = locale(encoding = "latin1")) ## ----------------------------------------------------------------------------- guess_encoding(x) guess_encoding(y) # Note that the first guess produces a valid string, but isn't correct: parse_character(y, locale = locale(encoding = "ISO-8859-2")) # But ISO-8859-1 is another name for latin1 parse_character(y, locale = locale(encoding = "ISO-8859-1")) ## ----------------------------------------------------------------------------- parse_double("1,23", locale = locale(decimal_mark = ",")) ## ----------------------------------------------------------------------------- parse_number("$1,234.56") parse_number("$1.234,56", locale = locale(decimal_mark = ",", grouping_mark = ".") ) # readr is smart enough to guess that if you're using , for decimals then # you're probably using . for grouping: parse_number("$1.234,56", locale = locale(decimal_mark = ",")) readr/inst/doc/locales.html0000644000176200001440000011322314547603062015412 0ustar liggesusers Locales

Locales

The goal of readr’s locales is to encapsulate common options that vary between languages and localities. This includes:

  • The names of months and days, used when parsing dates.
  • The default time zone, used when parsing datetimes.
  • The character encoding, used when reading non-ASCII strings.
  • Default date format, used when guessing column types.
  • The decimal and grouping marks, used when reading numbers.

(Strictly speaking these are not locales in the usual technical sense of the word because they also contain information about time zones and encoding.)

To create a new locale, you use the locale() function:

locale()
#> <locale>
#> Numbers:  123,456.78
#> Formats:  %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
#>         (Thu), Friday (Fri), Saturday (Sat)
#> Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
#>         June (Jun), July (Jul), August (Aug), September (Sep), October
#>         (Oct), November (Nov), December (Dec)
#> AM/PM:  AM/PM

The rest of this vignette will explain what each of the options does.

All of the parsing functions in readr take a locale argument. You’ll most often use it with read_csv(), read_fwf() or read_table(). Readr is designed to work the same way across systems, so the default locale is English-centric, like R. If you’re not in an English-speaking country, this makes initial import a little harder, because you have to override the defaults. But the payoff is big: you can share your code and know that it will work on any other system. Base R takes a different philosophy. It uses system defaults, so typical data import is a little easier, but sharing code is harder.

Rather than demonstrating the use of locales with read_csv() and friends, in this vignette I’m going to use the parse_*() functions. These work with a character vector instead of a file on disk, so they’re easier to use in examples. They’re also useful in their own right if you need to do custom parsing. See type_convert() if you need to apply multiple parsers to a data frame.
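
As a small sketch of that last point (the tibble is invented for illustration), type_convert() re-parses the character columns of an existing data frame:

df <- tibble::tibble(x = c("1", "2"), when = c("2015-01-01", "2015-01-02"))
type_convert(df)  # x is re-parsed as a double, when as a Date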

Dates and times

Names of months and days

The first argument to locale() is date_names, and it controls what values are used for month and day names. The easiest way to specify it is with an ISO 639 language code:

locale("ko") # Korean
#> <locale>
#> Numbers:  123,456.78
#> Formats:  %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days:   일요일 (일), 월요일 (월), 화요일 (화), 수요일 (수), 목요일 (목), 금요일
#>         (금), 토요일 (토)
#> Months: 1월, 2월, 3월, 4월, 5월, 6월, 7월, 8월, 9월, 10월, 11월, 12월
#> AM/PM:  오전/오후
locale("fr") # French
#> <locale>
#> Numbers:  123,456.78
#> Formats:  %AD / %AT
#> Timezone: UTC
#> Encoding: UTF-8
#> <date_names>
#> Days:   dimanche (dim.), lundi (lun.), mardi (mar.), mercredi (mer.), jeudi
#>         (jeu.), vendredi (ven.), samedi (sam.)
#> Months: janvier (janv.), février (févr.), mars (mars), avril (avr.), mai (mai),
#>         juin (juin), juillet (juil.), août (août), septembre (sept.),
#>         octobre (oct.), novembre (nov.), décembre (déc.)
#> AM/PM:  AM/PM

If you don’t already know the code for your language, Wikipedia has a good list. Currently readr has 185 languages available. You can list them all with date_names_langs().
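
A quick sketch of that lookup:

length(date_names_langs())  # 185, per the count above
date_names_lang("fr")       # inspect the date names for one language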

Specifying a locale allows you to parse dates in other languages:

parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
#> [1] "2015-01-01"
parse_date("14 oct. 1979", "%d %b %Y", locale = locale("fr"))
#> [1] "1979-10-14"

For many languages, it’s common to find that diacritics have been stripped so they can be stored as ASCII. You can tell the locale that with the asciify option:

parse_date("1 août 2015", "%d %B %Y", locale = locale("fr"))
#> [1] "2015-08-01"
parse_date("1 aout 2015", "%d %B %Y", locale = locale("fr", asciify = TRUE))
#> [1] "2015-08-01"

Note that the quality of the translations is variable, especially for the rarer languages. If you discover that they’re not quite right for your data, you can create your own with date_names(). The following example creates a locale with Māori date names:

maori <- locale(date_names(
  day = c("Rātapu", "Rāhina", "Rātū", "Rāapa", "Rāpare", "Rāmere", "Rāhoroi"),
  mon = c("Kohi-tātea", "Hui-tanguru", "Poutū-te-rangi", "Paenga-whāwhā",
    "Haratua", "Pipiri", "Hōngongoi", "Here-turi-kōkā", "Mahuru",
    "Whiringa-ā-nuku", "Whiringa-ā-rangi", "Hakihea")
))
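
A usage sketch under this definition (assuming, from the order above, that “Kohi-tātea” is January):

parse_date("1 Kohi-tātea 2015", "%d %B %Y", locale = maori)
# should parse as 2015-01-01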

Timezones

Unless otherwise specified, readr assumes that times are in UTC, Coordinated Universal Time (this is a successor to GMT and for almost all intents and purposes is identical). UTC is most suitable for data because it doesn’t have daylight savings - this avoids a whole class of potential problems. If your data isn’t already in UTC, you’ll need to supply a tz in the locale:

parse_datetime("2001-10-10 20:10")
#> [1] "2001-10-10 20:10:00 UTC"
parse_datetime("2001-10-10 20:10", locale = locale(tz = "Pacific/Auckland"))
#> [1] "2001-10-10 20:10:00 NZDT"
parse_datetime("2001-10-10 20:10", locale = locale(tz = "Europe/Dublin"))
#> [1] "2001-10-10 20:10:00 IST"

You can see a complete list of time zones with OlsonNames().
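
For example:

head(OlsonNames())  # the full list of names varies slightly by system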

If you’re American, note that “EST” is a Canadian time zone that does not have DST. It’s not Eastern Standard Time! Instead use:

  • PST/PDT = “US/Pacific”
  • CST/CDT = “US/Central”
  • MST/MDT = “US/Mountain”
  • EST/EDT = “US/Eastern”

(Note that there are more specific time zones for smaller areas that don’t follow the same rules. For example, “US/Arizona”, which mostly follows mountain time but doesn’t have daylight savings. If you’re dealing with historical data, you might need an even more specific zone like “America/North_Dakota/New_Salem” - that will get you the most accurate time zones.)

Note that these are only used as defaults. If individual times have timezones and you’re using “%Z” (as name, e.g. “America/Chicago”) or “%z” (as offset from UTC, e.g. “+0800”), they’ll override the defaults. There’s currently no good way to parse times that use US abbreviations.
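
A sketch of that override (the timestamp is invented):

# the explicit +0800 offset wins over the default UTC assumption
parse_datetime("2001-10-10 20:10 +0800", "%Y-%m-%d %H:%M %z")
# represents 2001-10-10 12:10:00 UTC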

Note that once you have the date in R, changing the time zone just changes its printed representation - it still represents the same instants of time. If you’ve loaded non-UTC data, and want to display it as UTC, try this snippet of code:

is_datetime <- sapply(df, inherits, "POSIXct")
df[is_datetime] <- lapply(df[is_datetime], function(x) {
  attr(x, "tzone") <- "UTC"
  x
})

Default formats

Locales also provide default date and time formats. The date format is used when guessing column types. The default date format is %AD, a flexible YMD parser (see ?parse_date):

str(parse_guess("2010-10-10"))
#>  Date[1:1], format: "2010-10-10"
str(parse_guess("2010/10/10"))
#>  Date[1:1], format: "2010-10-10"

If you’re an American, you might want to use your illogical date system:

str(parse_guess("01/31/2013"))
#>  chr "01/31/2013"
str(parse_guess("01/31/2013", locale = locale(date_format = "%m/%d/%Y")))
#>  Date[1:1], format: "2013-01-31"

The time format is also used when guessing column types. The default time format is %AT, a flexible HMS parser (see ?parse_time):

str(parse_guess("17:55:14"))
#>  'hms' num 17:55:14
#>  - attr(*, "units")= chr "secs"
str(parse_guess("5:55:14 PM"))
#>  'hms' num 17:55:14
#>  - attr(*, "units")= chr "secs"
# Example of a non-standard time
str(parse_guess("h5m55s14 PM"))
#>  chr "h5m55s14 PM"
str(parse_guess("h5m55s14 PM", locale = locale(time_format = "h%Hm%Ms%S %p")))
#>  'hms' num 17:55:14
#>  - attr(*, "units")= chr "secs"

Character

All readr functions yield strings encoded in UTF-8. This encoding is the most likely to give good results in the widest variety of settings. By default, readr assumes that your input is also in UTF-8. This is less likely to be the case, especially when you’re working with older datasets.

The following code illustrates the problems with encodings:

library(stringi)
x <- "Émigré cause célèbre déjà vu.\n"
y <- stri_conv(x, "UTF-8", "latin1")

# These strings look like they're identical:
x
#> [1] "Émigré cause célèbre déjà vu.\n"
y
#> [1] "Émigré cause célèbre déjà vu.\n"
identical(x, y)
#> [1] TRUE

# But they have different encodings:
Encoding(x)
#> [1] "UTF-8"
Encoding(y)
#> [1] "latin1"

# That means while they print the same, their raw (binary)
# representation is actually quite different:
charToRaw(x)
#>  [1] c3 89 6d 69 67 72 c3 a9 20 63 61 75 73 65 20 63 c3 a9 6c c3 a8 62 72 65 20
#> [26] 64 c3 a9 6a c3 a0 20 76 75 2e 0a
charToRaw(y)
#>  [1] c9 6d 69 67 72 e9 20 63 61 75 73 65 20 63 e9 6c e8 62 72 65 20 64 e9 6a e0
#> [26] 20 76 75 2e 0a

# readr expects strings to be encoded as UTF-8. If they're
# not, you'll get weird characters
parse_character(x)
#> [1] "Émigré cause célèbre déjà vu.\n"
parse_character(y)
#> [1] "\xc9migr\xe9 cause c\xe9l\xe8bre d\xe9j\xe0 vu.\n"

# If you know the encoding, supply it:
parse_character(y, locale = locale(encoding = "latin1"))
#> [1] "Émigré cause célèbre déjà vu.\n"

If you don’t know what encoding the file uses, try guess_encoding(). It’s not 100% perfect (as it’s fundamentally a heuristic), but should at least get you pointed in the right direction:

guess_encoding(x)
#> # A tibble: 3 × 2
#>   encoding     confidence
#>   <chr>             <dbl>
#> 1 UTF-8              1   
#> 2 windows-1250       0.34
#> 3 windows-1252       0.26
guess_encoding(y)
#> # A tibble: 2 × 2
#>   encoding   confidence
#>   <chr>           <dbl>
#> 1 ISO-8859-2        0.4
#> 2 ISO-8859-1        0.3

# Note that the first guess produces a valid string, but isn't correct:
parse_character(y, locale = locale(encoding = "ISO-8859-2"))
#> [1] "Émigré cause célčbre déjŕ vu.\n"
# But ISO-8859-1 is another name for latin1
parse_character(y, locale = locale(encoding = "ISO-8859-1"))
#> [1] "Émigré cause célèbre déjà vu.\n"

Numbers

Some countries use the decimal point, while others use the decimal comma. The decimal_mark option controls which readr uses when parsing doubles:

parse_double("1,23", locale = locale(decimal_mark = ","))
#> [1] 1.23

Additionally, when writing out big numbers, you might have 1,000,000, 1.000.000, 1 000 000, or 1'000'000. The grouping mark is ignored by the more flexible number parser:

parse_number("$1,234.56")
#> [1] 1234.56
parse_number("$1.234,56", 
  locale = locale(decimal_mark = ",", grouping_mark = ".")
)
#> [1] 1234.56

# readr is smart enough to guess that if you're using , for decimals then
# you're probably using . for grouping:
parse_number("$1.234,56", locale = locale(decimal_mark = ","))
#> [1] 1234.56
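
The same option covers the space and apostrophe conventions mentioned earlier, for example:

parse_number("1 000 000", locale = locale(grouping_mark = " "))
parse_number("1'000'000", locale = locale(grouping_mark = "'"))
# both parse to 1000000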
readr/inst/doc/readr.Rmd0000644000176200001440000002415714314603711014643 0ustar liggesusers--- title: "Introduction to readr" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Introduction to readr} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} library(readr) knitr::opts_chunk$set(collapse = TRUE, comment = "#>") ``` The key problem that readr solves is __parsing__ a flat file into a tibble. Parsing is the process of taking a text file and turning it into a rectangular tibble where each column is the appropriate part. Parsing takes place in three basic stages: 1. The flat file is parsed into a rectangular matrix of strings. 1. The type of each column is determined. 1. Each column of strings is parsed into a vector of a more specific type. It's easiest to learn how this works in the opposite order. Below, you'll learn how the: 1. __Vector parsers__ turn a character vector into a more specific type. 1. __Column specification__ describes the type of each column and the strategy readr uses to guess types so you don't need to supply them all. 1. __Rectangular parsers__ turn a flat file into a matrix of rows and columns. Each `parse_*()` is coupled with a `col_*()` function, which will be used in the process of parsing a complete tibble. ## Vector parsers It's easiest to learn the vector parsers using `parse_` functions. These all take a character vector and some options. They return a new vector the same length as the old, along with an attribute describing any problems. ### Atomic vectors `parse_logical()`, `parse_integer()`, `parse_double()`, and `parse_character()` are straightforward parsers that produce the corresponding atomic vector. ```{r} parse_integer(c("1", "2", "3")) parse_double(c("1.56", "2.34", "3.56")) parse_logical(c("true", "false")) ``` By default, readr expects `.` as the decimal mark and `,` as the grouping mark. You can override this default using `locale()`, as described in `vignette("locales")`. ### Flexible numeric parser `parse_integer()` and `parse_double()` are strict: the input string must be a single number with no leading or trailing characters. `parse_number()` is more flexible: it ignores non-numeric prefixes and suffixes, and knows how to deal with grouping marks. This makes it suitable for reading currencies and percentages: ```{r} parse_number(c("0%", "10%", "150%")) parse_number(c("$1,234.5", "$12.45")) ``` ### Date/times readr supports three types of date/time data: * dates: number of days since 1970-01-01. * times: number of seconds since midnight. * datetimes: number of seconds since midnight 1970-01-01. ```{r} parse_datetime("2010-10-01 21:45") parse_date("2010-10-01") parse_time("1:00pm") ``` Each function takes a `format` argument which describes the format of the string. If not specified, it uses a default value: * `parse_datetime()` recognises [ISO8601](https://en.wikipedia.org/wiki/ISO_8601) datetimes. * `parse_date()` uses the `date_format` specified by the `locale()`. The default value is `%AD` which uses an automatic date parser that recognises dates of the format `Y-m-d` or `Y/m/d`. * `parse_time()` uses the `time_format` specified by the `locale()`. The default value is `%At` which uses an automatic time parser that recognises times of the form `H:M` optionally followed by seconds and am/pm. 
In most cases, you will need to supply a `format`, as documented in `parse_datetime()`: ```{r} parse_datetime("1 January, 2010", "%d %B, %Y") parse_datetime("02/02/15", "%m/%d/%y") ``` ### Factors When reading a column that has a known set of values, you can read directly into a factor. `parse_factor()` will generate a warning if a value is not in the supplied levels. ```{r} parse_factor(c("a", "b", "a"), levels = c("a", "b", "c")) parse_factor(c("a", "b", "d"), levels = c("a", "b", "c")) ``` ## Column specification It would be tedious if you had to specify the type of every column when reading a file. Instead, readr uses some heuristics to guess the type of each column. You can access these results yourself using `guess_parser()`: ```{r} guess_parser(c("a", "b", "c")) guess_parser(c("1", "2", "3")) guess_parser(c("1,000", "2,000", "3,000")) guess_parser(c("2001/10/10")) ``` The guessing policies are described in the documentation for the individual functions. Guesses are fairly strict. For example, we don't guess that currencies are numbers, even though we can parse them: ```{r} guess_parser("$1,234") parse_number("$1,234") ``` There are two parsers that will never be guessed: `col_skip()` and `col_factor()`. You will always need to supply these explicitly. You can see the specification that readr would generate for a column file by using `spec_csv()`, `spec_tsv()` and so on: ```{r} x <- spec_csv(readr_example("challenge.csv")) ``` For bigger files, you can often make the specification simpler by changing the default column type using `cols_condense()` ```{r} mtcars_spec <- spec_csv(readr_example("mtcars.csv")) mtcars_spec cols_condense(mtcars_spec) ``` By default readr only looks at the first 1000 rows. This keeps file parsing speedy, but can generate incorrect guesses. For example, in `challenge.csv` the column types change in row 1001, so readr guesses the wrong types. One way to resolve the problem is to increase the number of rows: ```{r} x <- spec_csv(readr_example("challenge.csv"), guess_max = 1001) ``` Another way is to manually specify the `col_type`, as described below. ## Rectangular parsers readr comes with five parsers for rectangular file formats: * `read_csv()` and `read_csv2()` for csv files * `read_tsv()` for tab-separated files * `read_fwf()` for fixed-width files * `read_log()` for web log files Each of these functions first calls `spec_xxx()` (as described above), and then parses the file according to that column specification: ```{r} df1 <- read_csv(readr_example("challenge.csv")) ``` The rectangular parsing functions almost always succeed; they'll only fail if the format is severely messed up. Instead, readr will generate a data frame of problems. The first few will be printed out, and you can access them all with `problems()`: ```{r} problems(df1) ``` You've already seen one way of handling bad guesses: increasing the number of rows used to guess the type of each column. ```{r} df2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001) ``` Another approach is to manually supply the column specification. ### Overriding the defaults In the previous examples, you may have noticed that readr printed the column specification that it used to parse the file: ```{r} #> Parsed with column specification: #> cols( #> x = col_integer(), #> y = col_character() #> ) ``` You can also access it after the fact using `spec()`: ```{r} spec(df1) spec(df2) ``` (This also allows you to access the full column specification if you're reading a very wide file. 
By default, readr will only print the specification of the first 20 columns.) If you want to manually specify the column types, you can start by copying and pasting this code, and then tweaking it to fix the parsing problems. ```{r} df3 <- read_csv( readr_example("challenge.csv"), col_types = list( x = col_double(), y = col_date(format = "") ) ) ``` In general, it's good practice to supply an explicit column specification. It is more work, but it ensures that you get warnings if the data changes in unexpected ways. To be really strict, you can use `stop_for_problems(df3)`. This will throw an error if there are any parsing problems, forcing you to fix those problems before proceeding with the analysis. ### Available column specifications The available specifications are: (with string abbreviations in brackets) * `col_logical()` [l], containing only `T`, `F`, `TRUE` or `FALSE`. * `col_integer()` [i], integers. * `col_double()` [d], doubles. * `col_character()` [c], everything else. * `col_factor(levels, ordered)` [f], a fixed set of values. * `col_date(format = "")` [D]: with the locale's `date_format`. * `col_time(format = "")` [t]: with the locale's `time_format`. * `col_datetime(format = "")` [T]: ISO8601 date times * `col_number()` [n], numbers containing the `grouping_mark` * `col_skip()` [_, -], don't import this column. * `col_guess()` [?], parse using the "best" type based on the input. Use the `col_types` argument to override the default choices. There are two ways to use it: * With a string: `"dc__d"`: read first column as double, second as character, skip the next two and read the last column as a double. (There's no way to use this form with types that take additional parameters.) * With a (named) list of col objects: ```r read_csv("iris.csv", col_types = list( Sepal.Length = col_double(), Sepal.Width = col_double(), Petal.Length = col_double(), Petal.Width = col_double(), Species = col_factor(c("setosa", "versicolor", "virginica")) )) ``` Or, with their abbreviations: ```r read_csv("iris.csv", col_types = list( Sepal.Length = "d", Sepal.Width = "d", Petal.Length = "d", Petal.Width = "d", Species = col_factor(c("setosa", "versicolor", "virginica")) )) ``` Any omitted columns will be parsed automatically, so the previous call will lead to the same result as: ```r read_csv("iris.csv", col_types = list( Species = col_factor(c("setosa", "versicolor", "virginica"))) ) ``` You can also set a default type that will be used instead of relying on the automatic detection for columns you don't specify: ```r read_csv("iris.csv", col_types = list( Species = col_factor(c("setosa", "versicolor", "virginica")), .default = col_double()) ) ``` If you only want to read specified columns, use `cols_only()`: ```r read_csv("iris.csv", col_types = cols_only( Species = col_factor(c("setosa", "versicolor", "virginica"))) ) ``` ### Output The output of all these functions is a tibble. Note that characters are never automatically converted to factors (i.e. no more `stringsAsFactors = FALSE`) and column names are left as is, not munged into valid R identifiers (i.e. there is no `check.names = TRUE`). Row names are never set. Attributes store the column specification (`spec()`) and any parsing problems (`problems()`). readr/inst/doc/readr.html0000644000176200001440000011622714547603063015073 0ustar liggesusers Introduction to readr

Introduction to readr

The key problem that readr solves is parsing a flat file into a tibble. Parsing is the process of taking a text file and turning it into a rectangular tibble where each column is the appropriate part. Parsing takes place in three basic stages:

  1. The flat file is parsed into a rectangular matrix of strings.

  2. The type of each column is determined.

  3. Each column of strings is parsed into a vector of a more specific type.

It’s easiest to learn how this works in the opposite order. Below, you’ll learn how the:

  1. Vector parsers turn a character vector into a more specific type.

  2. Column specification describes the type of each column and the strategy readr uses to guess types so you don’t need to supply them all.

  3. Rectangular parsers turn a flat file into a matrix of rows and columns.

Each parse_*() is coupled with a col_*() function, which will be used in the process of parsing a complete tibble.
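
A sketch of that coupling (the inline literal data, wrapped in I(), is illustrative):

parse_integer(c("1", "2", "3"))                                 # the vector parser
read_csv(I("x\n1\n2\n3"), col_types = cols(x = col_integer()))  # the same parser applied per column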

Vector parsers

It’s easiest to learn the vector parsers using parse_ functions. These all take a character vector and some options. They return a new vector the same length as the old, along with an attribute describing any problems.

Atomic vectors

parse_logical(), parse_integer(), parse_double(), and parse_character() are straightforward parsers that produce the corresponding atomic vector.

parse_integer(c("1", "2", "3"))
#> [1] 1 2 3
parse_double(c("1.56", "2.34", "3.56"))
#> [1] 1.56 2.34 3.56
parse_logical(c("true", "false"))
#> [1]  TRUE FALSE

By default, readr expects . as the decimal mark and , as the grouping mark. You can override this default using locale(), as described in vignette("locales").

Flexible numeric parser

parse_integer() and parse_double() are strict: the input string must be a single number with no leading or trailing characters. parse_number() is more flexible: it ignores non-numeric prefixes and suffixes, and knows how to deal with grouping marks. This makes it suitable for reading currencies and percentages:

parse_number(c("0%", "10%", "150%"))
#> [1]   0  10 150
parse_number(c("$1,234.5", "$12.45"))
#> [1] 1234.50   12.45

Date/times

readr supports three types of date/time data:

  • dates: number of days since 1970-01-01.
  • times: number of seconds since midnight.
  • datetimes: number of seconds since midnight 1970-01-01.
parse_datetime("2010-10-01 21:45")
#> [1] "2010-10-01 21:45:00 UTC"
parse_date("2010-10-01")
#> [1] "2010-10-01"
parse_time("1:00pm")
#> 13:00:00

Each function takes a format argument which describes the format of the string. If not specified, it uses a default value:

  • parse_datetime() recognises ISO8601 datetimes.

  • parse_date() uses the date_format specified by the locale(). The default value is %AD which uses an automatic date parser that recognises dates of the format Y-m-d or Y/m/d.

  • parse_time() uses the time_format specified by the locale(). The default value is %At which uses an automatic time parser that recognises times of the form H:M optionally followed by seconds and am/pm.

In most cases, you will need to supply a format, as documented in parse_datetime():

parse_datetime("1 January, 2010", "%d %B, %Y")
#> [1] "2010-01-01 UTC"
parse_datetime("02/02/15", "%m/%d/%y")
#> [1] "2015-02-02 UTC"

Factors

When reading a column that has a known set of values, you can read directly into a factor. parse_factor() will generate a warning if a value is not in the supplied levels.

parse_factor(c("a", "b", "a"), levels = c("a", "b", "c"))
#> [1] a b a
#> Levels: a b c
parse_factor(c("a", "b", "d"), levels = c("a", "b", "c"))
#> Warning: 1 parsing failure.
#> row col           expected actual
#>   3  -- value in level set      d
#> [1] a    b    <NA>
#> attr(,"problems")
#> # A tibble: 1 × 4
#>     row   col expected           actual
#>   <int> <int> <chr>              <chr> 
#> 1     3    NA value in level set d     
#> Levels: a b c

Column specification

It would be tedious if you had to specify the type of every column when reading a file. Instead, readr uses some heuristics to guess the type of each column. You can access these results yourself using guess_parser():

guess_parser(c("a", "b", "c"))
#> [1] "character"
guess_parser(c("1", "2", "3"))
#> [1] "double"
guess_parser(c("1,000", "2,000", "3,000"))
#> [1] "number"
guess_parser(c("2001/10/10"))
#> [1] "date"

The guessing policies are described in the documentation for the individual functions. Guesses are fairly strict. For example, we don’t guess that currencies are numbers, even though we can parse them:

guess_parser("$1,234")
#> [1] "character"
parse_number("$1,234")
#> [1] 1234

There are two parsers that will never be guessed: col_skip() and col_factor(). You will always need to supply these explicitly.
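
A sketch of supplying them explicitly (the level set and skipped column are illustrative):

df <- read_csv(readr_example("mtcars.csv"), col_types = cols(
  cyl = col_factor(levels = c("4", "6", "8")),  # fixed level set, never guessed
  vs = col_skip(),                              # dropped entirely
  .default = col_guess()
))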

You can see the specification that readr would generate for a column file by using spec_csv(), spec_tsv() and so on:

x <- spec_csv(readr_example("challenge.csv"))

For bigger files, you can often make the specification simpler by changing the default column type using cols_condense()

mtcars_spec <- spec_csv(readr_example("mtcars.csv"))
mtcars_spec
#> cols(
#>   mpg = col_double(),
#>   cyl = col_double(),
#>   disp = col_double(),
#>   hp = col_double(),
#>   drat = col_double(),
#>   wt = col_double(),
#>   qsec = col_double(),
#>   vs = col_double(),
#>   am = col_double(),
#>   gear = col_double(),
#>   carb = col_double()
#> )

cols_condense(mtcars_spec)
#> cols(
#>   .default = col_double()
#> )

By default readr only looks at the first 1000 rows. This keeps file parsing speedy, but can generate incorrect guesses. For example, in challenge.csv the column types change in row 1001, so readr guesses the wrong types. One way to resolve the problem is to increase the number of rows:

x <- spec_csv(readr_example("challenge.csv"), guess_max = 1001)

Another way is to manually specify the col_type, as described below.

Rectangular parsers

readr comes with five parsers for rectangular file formats:

  • read_csv() and read_csv2() for csv files
  • read_tsv() for tab-separated files
  • read_fwf() for fixed-width files
  • read_log() for web log files

Each of these functions first calls spec_xxx() (as described above), and then parses the file according to that column specification:

df1 <- read_csv(readr_example("challenge.csv"))
#> Rows: 2000 Columns: 2
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl  (1): x
#> date (1): y
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

The rectangular parsing functions almost always succeed; they’ll only fail if the format is severely messed up. Instead, readr will generate a data frame of problems. The first few will be printed out, and you can access them all with problems():

problems(df1)
#> # A tibble: 0 × 5
#> # ℹ 5 variables: row <int>, col <int>, expected <chr>, actual <chr>, file <chr>

You’ve already seen one way of handling bad guesses: increasing the number of rows used to guess the type of each column.

df2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001)
#> Rows: 2000 Columns: 2
#> ── Column specification ────────────────────────────────────────────────────────
#> Delimiter: ","
#> dbl  (1): x
#> date (1): y
#> 
#> ℹ Use `spec()` to retrieve the full column specification for this data.
#> ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Another approach is to manually supply the column specification.

Overriding the defaults

In the previous examples, you may have noticed that readr printed the column specification that it used to parse the file:

#> Parsed with column specification:
#> cols(
#>   x = col_integer(),
#>   y = col_character()
#> )

You can also access it after the fact using spec():

spec(df1)
#> cols(
#>   x = col_double(),
#>   y = col_date(format = "")
#> )
spec(df2)
#> cols(
#>   x = col_double(),
#>   y = col_date(format = "")
#> )

(This also allows you to access the full column specification if you’re reading a very wide file. By default, readr will only print the specification of the first 20 columns.)

If you want to manually specify the column types, you can start by copying and pasting this code, and then tweaking it to fix the parsing problems.

df3 <- read_csv(
  readr_example("challenge.csv"), 
  col_types = list(
    x = col_double(),
    y = col_date(format = "")
  )
)

In general, it’s good practice to supply an explicit column specification. It is more work, but it ensures that you get warnings if the data changes in unexpected ways. To be really strict, you can use stop_for_problems(df3). This will throw an error if there are any parsing problems, forcing you to fix those problems before proceeding with the analysis.
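
A sketch of that guard in a script (the compact "dD" spec, double then date, matches challenge.csv as parsed above):

df3 <- read_csv(readr_example("challenge.csv"), col_types = "dD")
stop_for_problems(df3)  # errors here if any cell failed to parse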

Available column specifications

The available specifications are: (with string abbreviations in brackets)

  • col_logical() [l], containing only T, F, TRUE or FALSE.
  • col_integer() [i], integers.
  • col_double() [d], doubles.
  • col_character() [c], everything else.
  • col_factor(levels, ordered) [f], a fixed set of values.
  • col_date(format = "") [D]: with the locale’s date_format.
  • col_time(format = "") [t]: with the locale’s time_format.
  • col_datetime(format = "") [T]: ISO8601 date times
  • col_number() [n], numbers containing the grouping_mark
  • col_skip() [_, -], don’t import this column.
  • col_guess() [?], parse using the “best” type based on the input.

Use the col_types argument to override the default choices. There are two ways to use it:

  • With a string: "dc__d": read first column as double, second as character, skip the next two and read the last column as a double. (There’s no way to use this form with types that take additional parameters.)

  • With a (named) list of col objects:

    read_csv("iris.csv", col_types = list(
      Sepal.Length = col_double(),
      Sepal.Width = col_double(),
      Petal.Length = col_double(),
      Petal.Width = col_double(),
      Species = col_factor(c("setosa", "versicolor", "virginica"))
    ))

    Or, with their abbreviations:

      read_csv("iris.csv", col_types = list(
      Sepal.Length = "d",
      Sepal.Width = "d",
      Petal.Length = "d",
      Petal.Width = "d",
      Species = col_factor(c("setosa", "versicolor", "virginica"))
    ))

Any omitted columns will be parsed automatically, so the previous call will lead to the same result as:

read_csv("iris.csv", col_types = list(
  Species = col_factor(c("setosa", "versicolor", "virginica")))
)

You can also set a default type that will be used instead of relying on the automatic detection for columns you don’t specify:

read_csv("iris.csv", col_types = list(
  Species = col_factor(c("setosa", "versicolor", "virginica")),
  .default = col_double())
)

If you only want to read specified columns, use cols_only():

read_csv("iris.csv", col_types = cols_only(
  Species = col_factor(c("setosa", "versicolor", "virginica")))
)

Output

The output of all these functions is a tibble. Note that characters are never automatically converted to factors (i.e. no more stringsAsFactors = FALSE) and column names are left as is, not munged into valid R identifiers (i.e. there is no check.names = TRUE). Row names are never set.

Attributes store the column specification (spec()) and any parsing problems (problems()).

readr/inst/doc/readr.R0000644000176200001440000000520214547603062014317 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- library(readr) knitr::opts_chunk$set(collapse = TRUE, comment = "#>") ## ----------------------------------------------------------------------------- parse_integer(c("1", "2", "3")) parse_double(c("1.56", "2.34", "3.56")) parse_logical(c("true", "false")) ## ----------------------------------------------------------------------------- parse_number(c("0%", "10%", "150%")) parse_number(c("$1,234.5", "$12.45")) ## ----------------------------------------------------------------------------- parse_datetime("2010-10-01 21:45") parse_date("2010-10-01") parse_time("1:00pm") ## ----------------------------------------------------------------------------- parse_datetime("1 January, 2010", "%d %B, %Y") parse_datetime("02/02/15", "%m/%d/%y") ## ----------------------------------------------------------------------------- parse_factor(c("a", "b", "a"), levels = c("a", "b", "c")) parse_factor(c("a", "b", "d"), levels = c("a", "b", "c")) ## ----------------------------------------------------------------------------- guess_parser(c("a", "b", "c")) guess_parser(c("1", "2", "3")) guess_parser(c("1,000", "2,000", "3,000")) guess_parser(c("2001/10/10")) ## ----------------------------------------------------------------------------- guess_parser("$1,234") parse_number("$1,234") ## ----------------------------------------------------------------------------- x <- spec_csv(readr_example("challenge.csv")) ## ----------------------------------------------------------------------------- mtcars_spec <- spec_csv(readr_example("mtcars.csv")) mtcars_spec cols_condense(mtcars_spec) ## ----------------------------------------------------------------------------- x <- spec_csv(readr_example("challenge.csv"), guess_max = 1001) ## ----------------------------------------------------------------------------- df1 <- read_csv(readr_example("challenge.csv")) ## ----------------------------------------------------------------------------- problems(df1) ## ----------------------------------------------------------------------------- df2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001) ## ----------------------------------------------------------------------------- #> Parsed with column specification: #> cols( #> x = col_integer(), #> y = col_character() #> ) ## ----------------------------------------------------------------------------- spec(df1) spec(df2) ## ----------------------------------------------------------------------------- df3 <- read_csv( readr_example("challenge.csv"), col_types = list( x = col_double(), y = col_date(format = "") ) ) readr/inst/doc/column-types.html0000644000176200001440000004353414547603062016436 0ustar liggesusers Column type

Column type

This vignette provides an overview of column type specification with readr. Currently it focuses on how automatic guessing works, but over time we expect to cover more topics.

library(readr)

Automatic guessing

If you don’t explicitly specify column types with the col_types argument, readr will attempt to guess them using some simple heuristics. By default, it will inspect 1000 values, evenly spaced from the first to the last row. This is a heuristic designed to always be fast (no matter how large your file is) and, in our experience, does a good job in most cases.

If needed, you can request that readr use more rows by supplying the guess_max argument. You can even supply guess_max = Inf to use every row to guess the column types. You might wonder why this isn’t the default. That’s because it’s slow: it has to look at every column twice, once to determine the type and once to parse the value. In most cases, you’re best off supplying col_types yourself.
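
As a quick sketch (the file name is a placeholder):

df <- read_csv("big-file.csv", guess_max = Inf)  # inspects every row: accurate, but slow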

Legacy behavior

Column type guessing was substantially worse in the first edition of readr (meaning, prior to v2.0.0), because it always looked at the first 1000 rows, and through some application of Murphy’s Law, it appears that many real csv files have lots of empty values at the start, followed by more “excitement” later in the file. Let’s demonstrate the problem with a slightly tricky file: the column x is mostly empty, but has some numeric data at the very end, in row 1001.

tricky_dat <- tibble::tibble(
  x = rep(c("", "2"), c(1000, 1)),
  y = "y"
)
tfile <- tempfile("tricky-column-type-guessing-", fileext = ".csv")
write_csv(tricky_dat, tfile)

The first edition parser doesn’t guess the right type for x so the 2 becomes an NA:

df <- with_edition(1, read_csv(tfile))
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   x = col_logical(),
#>   y = col_character()
#> )
#> Warning: 1 parsing failure.
#>  row col           expected actual                                                            file
#> 1001   x 1/0/T/F/TRUE/FALSE      2 '/tmp/Rtmp803EnH/tricky-column-type-guessing-14bec2753da6a.csv'
tail(df)
#> # A tibble: 6 × 2
#>   x     y    
#>   <lgl> <chr>
#> 1 NA    y    
#> 2 NA    y    
#> 3 NA    y    
#> 4 NA    y    
#> 5 NA    y    
#> 6 NA    y

For this specific case, we can fix the problem by marginally increasing guess_max:

df <- with_edition(1, read_csv(tfile, guess_max = 1001))
#> 
#> ── Column specification ────────────────────────────────────────────────────────
#> cols(
#>   x = col_double(),
#>   y = col_character()
#> )
tail(df)
#> # A tibble: 6 × 2
#>       x y    
#>   <dbl> <chr>
#> 1    NA y    
#> 2    NA y    
#> 3    NA y    
#> 4    NA y    
#> 5    NA y    
#> 6     2 y

Unlike the second edition, we don’t recommend using guess_max = Inf with the legacy parser, because the engine pre-allocates a large amount of memory in the face of this uncertainty. This means that reading with guess_max = Inf can be extremely slow and might even crash your R session. Instead, specify the col_types:

df <- with_edition(1, read_csv(tfile, col_types = list(x = col_double())))
tail(df)
#> # A tibble: 6 × 2
#>       x y    
#>   <dbl> <chr>
#> 1    NA y    
#> 2    NA y    
#> 3    NA y    
#> 4    NA y    
#> 5    NA y    
#> 6     2 y
readr/inst/doc/locales.Rmd0000644000176200001440000002063014510343737015167 0ustar liggesusers--- title: "Locales" output: rmarkdown::html_vignette vignette: > %\VignetteIndexEntry{Locales} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} --- ```{r, include = FALSE} library(readr) knitr::opts_chunk$set(collapse = TRUE, comment = "#>") ``` The goal of readr's locales is to encapsulate common options that vary between languages and localities. This includes: * The names of months and days, used when parsing dates. * The default time zone, used when parsing datetimes. * The character encoding, used when reading non-ASCII strings. * Default date format, used when guessing column types. * The decimal and grouping marks, used when reading numbers. (Strictly speaking these are not locales in the usual technical sense of the word because they also contain information about time zones and encoding.) To create a new locale, you use the `locale()` function: ```{r} locale() ``` The rest of this vignette will explain what each of the options does. All of the parsing functions in readr take a `locale` argument. You'll most often use it with `read_csv()`, `read_fwf()` or `read_table()`. Readr is designed to work the same way across systems, so the default locale is English-centric, like R. If you're not in an English-speaking country, this makes initial import a little harder, because you have to override the defaults. But the payoff is big: you can share your code and know that it will work on any other system. Base R takes a different philosophy. It uses system defaults, so typical data import is a little easier, but sharing code is harder. Rather than demonstrating the use of locales with `read_csv()` and friends, in this vignette I'm going to use the `parse_*()` functions. These work with a character vector instead of a file on disk, so they're easier to use in examples. They're also useful in their own right if you need to do custom parsing. See `type_convert()` if you need to apply multiple parsers to a data frame. ## Dates and times ### Names of months and days The first argument to `locale()` is `date_names`, and it controls what values are used for month and day names. The easiest way to specify it is with an ISO 639 language code: ```{r} locale("ko") # Korean locale("fr") # French ``` If you don't already know the code for your language, [Wikipedia](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) has a good list. Currently readr has `r length(date_names_langs())` languages available. You can list them all with `date_names_langs()`. Specifying a locale allows you to parse dates in other languages: ```{r} parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr")) parse_date("14 oct. 1979", "%d %b %Y", locale = locale("fr")) ``` For many languages, it's common to find that diacritics have been stripped so they can be stored as ASCII. You can tell the locale that with the `asciify` option: ```{r} parse_date("1 août 2015", "%d %B %Y", locale = locale("fr")) parse_date("1 aout 2015", "%d %B %Y", locale = locale("fr", asciify = TRUE)) ``` Note that the quality of the translations is variable, especially for the rarer languages. If you discover that they're not quite right for your data, you can create your own with `date_names()`. 
The following example creates a locale with Māori date names:

```{r}
maori <- locale(date_names(
  day = c("Rātapu", "Rāhina", "Rātū", "Rāapa", "Rāpare", "Rāmere", "Rāhoroi"),
  mon = c("Kohi-tātea", "Hui-tanguru", "Poutū-te-rangi", "Paenga-whāwhā",
    "Haratua", "Pipiri", "Hōngongoi", "Here-turi-kōkā", "Mahuru",
    "Whiringa-ā-nuku", "Whiringa-ā-rangi", "Hakihea")
))
```

### Timezones

Unless otherwise specified, readr assumes that times are in UTC, Coordinated Universal Time (this is a successor to GMT and for almost all intents and purposes identical). UTC is most suitable for data because it doesn't have daylight saving time - this avoids a whole class of potential problems. If your data isn't already in UTC, you'll need to supply a `tz` in the locale:

```{r}
parse_datetime("2001-10-10 20:10")
parse_datetime("2001-10-10 20:10", locale = locale(tz = "Pacific/Auckland"))
parse_datetime("2001-10-10 20:10", locale = locale(tz = "Europe/Dublin"))
```

You can see a complete list of time zones with `OlsonNames()`.

If you're American, note that "EST" is a Canadian time zone that does not have DST. It's not Eastern Standard Time! Instead use:

* PST/PDT = "US/Pacific"
* CST/CDT = "US/Central"
* MST/MDT = "US/Mountain"
* EST/EDT = "US/Eastern"

(Note that there are more specific time zones for smaller areas that don't follow the same rules. For example, "US/Arizona" mostly follows mountain time but doesn't have daylight saving time. If you're dealing with historical data, you might need an even more specific zone like "America/North_Dakota/New_Salem" - that will get you the most accurate time zones.)

Note that these are only used as defaults. If individual times have timezones and you're using "%Z" (as name, e.g. "America/Chicago") or "%z" (as offset from UTC, e.g. "+0800"), they'll override the defaults. There's currently no good way to parse times that use US abbreviations.

Note that once you have the date in R, changing the time zone just changes its printed representation - it still represents the same instant in time. If you've loaded non-UTC data and want to display it as UTC, try this snippet of code:

```{r, eval = FALSE}
is_datetime <- sapply(df, inherits, "POSIXct")
df[is_datetime] <- lapply(df[is_datetime], function(x) {
  attr(x, "tzone") <- "UTC"
  x
})
```

### Default formats

Locales also provide default date and time formats. The date format is used when guessing column types. The default date format is `%AD`, a flexible YMD parser (see `?parse_date`):

```{r}
str(parse_guess("2010-10-10"))
str(parse_guess("2010/10/10"))
```

If you're an American, you might want to use your illogical date system:

```{r}
str(parse_guess("01/31/2013"))
str(parse_guess("01/31/2013", locale = locale(date_format = "%m/%d/%Y")))
```

The time format is also used when guessing column types. The default time format is `%AT`, a flexible HMS parser (see `?parse_time`):

```{r}
str(parse_guess("17:55:14"))
str(parse_guess("5:55:14 PM"))

# Example of a non-standard time
str(parse_guess("h5m55s14 PM"))
str(parse_guess("h5m55s14 PM", locale = locale(time_format = "h%Hm%Ms%S %p")))
```

## Character

All readr functions yield strings encoded in UTF-8. This encoding is the most likely to give good results in the widest variety of settings. By default, readr assumes that your input is also in UTF-8. This is not always the case, especially when you're working with older datasets.
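
If you already know which encoding a file uses, the fix is to declare it in the locale when reading. A minimal sketch, assuming a hypothetical latin1-encoded file `legacy.csv`:

```{r, eval = FALSE}
# "legacy.csv" is a hypothetical file stored in latin1 encoding
df <- read_csv("legacy.csv", locale = locale(encoding = "latin1"))
```

The demonstration below shows what happens when the declared and actual encodings don't line up.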
The following code illustrates the problems with encodings:

```{r}
library(stringi)
x <- "Émigré cause célèbre déjà vu.\n"
y <- stri_conv(x, "UTF-8", "latin1")

# These strings look like they're identical:
x
y
identical(x, y)

# But they have different encodings:
Encoding(x)
Encoding(y)

# That means while they print the same, their raw (binary)
# representation is actually quite different:
charToRaw(x)
charToRaw(y)

# readr expects strings to be encoded as UTF-8. If they're
# not, you'll get weird characters:
parse_character(x)
parse_character(y)

# If you know the encoding, supply it:
parse_character(y, locale = locale(encoding = "latin1"))
```

If you don't know what encoding the file uses, try [`guess_encoding()`](https://readr.tidyverse.org/reference/encoding.html). It's not 100% perfect (as it's fundamentally a heuristic), but it should at least get you pointed in the right direction:

```{r}
guess_encoding(x)
guess_encoding(y)

# Note that the first guess produces a valid string, but isn't correct:
parse_character(y, locale = locale(encoding = "ISO-8859-2"))

# But ISO-8859-1 is another name for latin1:
parse_character(y, locale = locale(encoding = "ISO-8859-1"))
```

## Numbers

Some countries use the decimal point, while others use the decimal comma. The `decimal_mark` option controls which one readr uses when parsing doubles:

```{r}
parse_double("1,23", locale = locale(decimal_mark = ","))
```

Additionally, when writing out big numbers, you might have `1,000,000`, `1.000.000`, `1 000 000`, or `1'000'000`. The grouping mark is ignored by the more flexible number parser:

```{r}
parse_number("$1,234.56")
parse_number("$1.234,56",
  locale = locale(decimal_mark = ",", grouping_mark = ".")
)

# readr is smart enough to guess that if you're using , for decimals then
# you're probably using .
for grouping: parse_number("$1.234,56", locale = locale(decimal_mark = ",")) ``` readr/inst/doc/column-types.R0000644000176200001440000000202114547603061015654 0ustar liggesusers## ----include = FALSE---------------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) ## ----setup-------------------------------------------------------------------- library(readr) ## ----------------------------------------------------------------------------- tricky_dat <- tibble::tibble( x = rep(c("", "2"), c(1000, 1)), y = "y" ) tfile <- tempfile("tricky-column-type-guessing-", fileext = ".csv") write_csv(tricky_dat, tfile) ## ----------------------------------------------------------------------------- df <- with_edition(1, read_csv(tfile)) tail(df) ## ----------------------------------------------------------------------------- df <- with_edition(1, read_csv(tfile, guess_max = 1001)) tail(df) ## ----------------------------------------------------------------------------- df <- with_edition(1, read_csv(tfile, col_types = list(x = col_double()))) tail(df) ## ----------------------------------------------------------------------------- file.remove(tfile) readr/inst/extdata/0000755000176200001440000000000014315646511013764 5ustar liggesusersreadr/inst/extdata/mtcars.csv0000644000176200001440000000242714152512262015771 0ustar liggesusers"mpg","cyl","disp","hp","drat","wt","qsec","vs","am","gear","carb" 21,6,160,110,3.9,2.62,16.46,0,1,4,4 21,6,160,110,3.9,2.875,17.02,0,1,4,4 22.8,4,108,93,3.85,2.32,18.61,1,1,4,1 21.4,6,258,110,3.08,3.215,19.44,1,0,3,1 18.7,8,360,175,3.15,3.44,17.02,0,0,3,2 18.1,6,225,105,2.76,3.46,20.22,1,0,3,1 14.3,8,360,245,3.21,3.57,15.84,0,0,3,4 24.4,4,146.7,62,3.69,3.19,20,1,0,4,2 22.8,4,140.8,95,3.92,3.15,22.9,1,0,4,2 19.2,6,167.6,123,3.92,3.44,18.3,1,0,4,4 17.8,6,167.6,123,3.92,3.44,18.9,1,0,4,4 16.4,8,275.8,180,3.07,4.07,17.4,0,0,3,3 17.3,8,275.8,180,3.07,3.73,17.6,0,0,3,3 15.2,8,275.8,180,3.07,3.78,18,0,0,3,3 10.4,8,472,205,2.93,5.25,17.98,0,0,3,4 10.4,8,460,215,3,5.424,17.82,0,0,3,4 14.7,8,440,230,3.23,5.345,17.42,0,0,3,4 32.4,4,78.7,66,4.08,2.2,19.47,1,1,4,1 30.4,4,75.7,52,4.93,1.615,18.52,1,1,4,2 33.9,4,71.1,65,4.22,1.835,19.9,1,1,4,1 21.5,4,120.1,97,3.7,2.465,20.01,1,0,3,1 15.5,8,318,150,2.76,3.52,16.87,0,0,3,2 15.2,8,304,150,3.15,3.435,17.3,0,0,3,2 13.3,8,350,245,3.73,3.84,15.41,0,0,3,4 19.2,8,400,175,3.08,3.845,17.05,0,0,3,2 27.3,4,79,66,4.08,1.935,18.9,1,1,4,1 26,4,120.3,91,4.43,2.14,16.7,0,1,5,2 30.4,4,95.1,113,3.77,1.513,16.9,1,1,5,2 15.8,8,351,264,4.22,3.17,14.5,0,1,5,4 19.7,6,145,175,3.62,2.77,15.5,0,1,5,6 15,8,301,335,3.54,3.57,14.6,0,1,5,8 21.4,4,121,109,4.11,2.78,18.6,1,1,4,2 readr/inst/extdata/epa78.txt0000644000176200001440000000420414174357220015450 0ustar liggesusersALFA ROMEO ALFA ROMEO 78010003 ALFETTA 03 81 8 74 7 89 9 ALFETTA 78010053 SPIDER 2000 01 SPIDER 2000 78010103 AMC AMC 78020002 GREMLIN 03 79 9 79 9 GREMLIN 78020053 PACER 04 89 11 89 11 PACER 78020103 PACER WAGON 07 90 26 91 26 PACER WAGON 78020153 CONCORD 04 88 12 90 11 90 11 83 16 CONCORD 78020203 CONCORD WAGON 07 91 30 91 30 CONCORD WAGON 78020253 MATADOR COUPE 05 97 14 97 14 MATADOR COUPE 78020303 MATADOR SEDAN 06 110 20 110 20 MATADOR SEDAN 78020353 MATADOR WAGON 09 112 50 112 50 MATADOR WAGON 78020403 ASTON MARTIN ASTON MARTIN 78040002 ASTON MARTIN ASTON MARTIN 78040053 AUDI AUDI 78050002 FOX 03 84 11 84 11 84 11 FOX 78050053 FOX WAGON 07 83 40 83 40 FOX WAGON 78050103 5000 04 90 15 90 15 5000 78050153 AVANTI AVANTI 78065002 AVANTI II 02 75 8 75 8 
AVANTI II 78065053 readr/inst/extdata/massey-rating.txt0000644000176200001440000000136614173633774017327 0ustar liggesusersUCC PAY LAZ KPK RT COF BIH DII ENG ACU Rank Team Conf 1 1 1 1 1 1 1 1 1 1 1 Ohio St B10 2 2 2 2 2 2 2 2 4 2 2 Oregon P12 3 4 3 4 3 4 3 4 2 3 3 Alabama SEC 4 3 4 3 4 3 5 3 3 4 4 TCU B12 6 6 6 5 5 7 6 5 6 11 5 Michigan St B10 7 7 7 6 7 6 11 8 7 8 6 Georgia SEC 5 5 5 7 6 8 4 6 5 5 7 Florida St ACC 8 8 9 9 10 5 7 7 10 7 8 Baylor B12 9 11 8 13 11 11 12 9 14 9 9 Georgia Tech ACC 13 10 13 11 8 9 10 11 9 10 10 Mississippi SEC readr/inst/extdata/mtcars.csv.zip0000644000176200001440000000130714152512262016566 0ustar liggesusersPKIF mtcars.csvUX ܫUjUun1 E A|9n ^RQXstI^R^;ow<|/?/Oxz`ay}zaJ5Q-$#q5a3iϘ,Q˅7MPinR=WJA*+!\2FVp8>yPL8puܘqxVN[MjXKd롆!V ("S epe,:^PO?76n_ºH_Nr }Ma`~Ӹ(0ѫh* xh愣 'V/i"YvI(cPȌ0i_8%n=/J#@P ړd0I1JJ*9CYi;7BM˺ qitmYrڨul5HGSB^@ltw φWpSf= U(f;"3 {[vRZn8cz\Q`@A_3+9{PKX%PKIFX% @mtcars.csvUXܫUjUPKDmreadr/inst/extdata/mini-gapminder-oceania.csv0000644000176200001440000000016714315646511021002 0ustar liggesuserscountry,year,lifeExp,pop,gdpPercap Australia,1952,69.12,8691212,10039.59564 New Zealand,1952,69.39,1994794,10556.57566 readr/inst/extdata/fwf-sample.txt0000644000176200001440000000020114173632075016561 0ustar liggesusersJohn Smith WA 418-Y11-4111 Mary Hartford CA 319-Z19-4341 Evan Nolan IL 219-532-c301 readr/inst/extdata/example.log0000644000176200001440000000032014152512262016107 0ustar liggesusers172.21.13.45 - Microsoft\JohnDoe [08/Apr/2001:17:39:04 -0800] "GET /scripts/iisadmin/ism.dll?http/serv HTTP/1.0" 200 3401 127.0.0.1 - frank [10/Oct/2000:13:55:36 -0700] "GET /apache_pb.gif HTTP/1.0" 200 2326 readr/inst/extdata/mini-gapminder-asia.csv0000644000176200001440000000044014315646511020312 0ustar liggesuserscountry,year,lifeExp,pop,gdpPercap Afghanistan,1952,28.801,8425333,779.4453145 Bahrain,1952,50.939,120447,9867.084765 Bangladesh,1952,37.484,46886859,684.2441716 Cambodia,1952,39.417,4693836,368.4692856 China,1952,44,556263527,400.448611 "Hong Kong, China",1952,60.96,2125900,3054.421209 readr/inst/extdata/chickens.csv0000644000176200001440000000052014304131171016252 0ustar liggesuserschicken,sex,eggs_laid,motto Foghorn Leghorn,rooster,0,"That's a joke, ah say, that's a joke, son." Chicken Little,hen,3,"The sky is falling!" Ginger,hen,12,"Listen. We'll either die free chickens or we die trying." Camilla the Chicken,hen,7,"Bawk, buck, ba-gawk." Ernie The Giant Chicken,rooster,0,"Put Captain Solo in the cargo hold." readr/inst/extdata/mtcars.csv.bz20000644000176200001440000000105114152512262016455 0ustar liggesusersBZh91AY&SY"!ـ>}@ᦀԨɦ&4QAh`jM !  
@Fu2f qgtKNr,:Zb^6MBTዔi=XihUP4,4#m^ȱ 75,T 1O9|fGM,D%{1_ӧoFsCU]CUpdxVRmDfK٥AX/{ƺ^U*]WƔ2;ED]ر`ށd9 L<1 "(H Wreadr/inst/extdata/mini-gapminder-africa.csv0000644000176200001440000000042314315646511020623 0ustar liggesuserscountry,year,lifeExp,pop,gdpPercap Algeria,1952,43.077,9279525,2449.008185 Angola,1952,30.015,4232095,3520.610273 Benin,1952,38.223,1738315,1062.7522 Botswana,1952,47.622,442308,851.2411407 Burkina Faso,1952,31.975,4469979,543.2552413 Burundi,1952,39.031,2445618,339.2964587 readr/inst/extdata/challenge.csv0000644000176200001440000011223114314603703016416 0ustar liggesusersx,y 404,NA 4172,NA 3004,NA 787,NA 37,NA 2332,NA 2489,NA 1449,NA 3665,NA 3863,NA 4374,NA 875,NA 172,NA 1602,NA 2012,NA 979,NA 2018,NA 319,NA 1944,NA 4878,NA 1450,NA 3392,NA 3677,NA 980,NA 4903,NA 3708,NA 258,NA 2652,NA 3480,NA 3443,NA 157,NA 1128,NA 1505,NA 3183,NA 2396,NA 2161,NA 3533,NA 4743,NA 902,NA 1085,NA 3401,NA 2495,NA 3209,NA 3302,NA 481,NA 3829,NA 3849,NA 4954,NA 4853,NA 1946,NA 2306,NA 1577,NA 874,NA 2658,NA 2469,NA 3897,NA 1021,NA 3567,NA 327,NA 1772,NA 4126,NA 1370,NA 2851,NA 1679,NA 2982,NA 958,NA 4739,NA 2713,NA 2724,NA 1393,NA 2234,NA 1858,NA 141,NA 2330,NA 1951,NA 101,NA 1885,NA 2800,NA 4286,NA 1925,NA 2640,NA 3004,NA 1307,NA 1451,NA 2401,NA 4601,NA 2004,NA 1066,NA 3359,NA 294,NA 4986,NA 746,NA 2593,NA 4231,NA 3592,NA 1207,NA 2736,NA 4175,NA 140,NA 2347,NA 4029,NA 4071,NA 2020,NA 1093,NA 2092,NA 3345,NA 2539,NA 3302,NA 2559,NA 4178,NA 3544,NA 4372,NA 58,NA 4442,NA 4982,NA 2501,NA 1795,NA 3875,NA 2923,NA 3170,NA 4294,NA 2835,NA 1265,NA 4595,NA 4337,NA 1243,NA 2015,NA 3849,NA 598,NA 974,NA 823,NA 3317,NA 4283,NA 4633,NA 2762,NA 2886,NA 3438,NA 1224,NA 224,NA 4550,NA 354,NA 4985,NA 3060,NA 863,NA 4548,NA 188,NA 2968,NA 1185,NA 4532,NA 4095,NA 3500,NA 1101,NA 3640,NA 1086,NA 2282,NA 1664,NA 2842,NA 1262,NA 2321,NA 4589,NA 4865,NA 4096,NA 4515,NA 2907,NA 3866,NA 4976,NA 3555,NA 1075,NA 1459,NA 3609,NA 4334,NA 1193,NA 23,NA 4718,NA 2191,NA 3754,NA 3340,NA 2040,NA 1757,NA 3691,NA 3322,NA 427,NA 4281,NA 385,NA 4265,NA 532,NA 2425,NA 1237,NA 3433,NA 819,NA 4765,NA 1610,NA 1808,NA 4439,NA 4141,NA 504,NA 4531,NA 3864,NA 1917,NA 4999,NA 1747,NA 4737,NA 1081,NA 161,NA 727,NA 4272,NA 1066,NA 1052,NA 198,NA 4724,NA 1225,NA 3906,NA 1442,NA 4377,NA 1479,NA 4918,NA 2950,NA 3796,NA 4181,NA 3815,NA 2087,NA 691,NA 405,NA 3280,NA 3011,NA 3285,NA 1647,NA 4898,NA 3576,NA 4364,NA 4917,NA 1093,NA 3323,NA 1948,NA 231,NA 3085,NA 2993,NA 2035,NA 4292,NA 2589,NA 4897,NA 86,NA 3368,NA 1857,NA 4591,NA 3390,NA 3326,NA 3781,NA 2715,NA 1197,NA 2545,NA 2087,NA 3635,NA 3189,NA 1983,NA 4798,NA 1494,NA 252,NA 2881,NA 1090,NA 630,NA 4691,NA 4007,NA 3791,NA 2663,NA 2735,NA 480,NA 1942,NA 862,NA 3454,NA 3377,NA 4732,NA 982,NA 4844,NA 1936,NA 3252,NA 4073,NA 355,NA 2635,NA 3818,NA 2177,NA 2763,NA 1021,NA 156,NA 4849,NA 894,NA 3892,NA 4429,NA 4183,NA 3027,NA 4535,NA 180,NA 658,NA 471,NA 3483,NA 2029,NA 329,NA 633,NA 4687,NA 1082,NA 3331,NA 1020,NA 2743,NA 4138,NA 638,NA 1306,NA 1443,NA 74,NA 4269,NA 2071,NA 3756,NA 3596,NA 2577,NA 4875,NA 1804,NA 852,NA 3916,NA 155,NA 3948,NA 834,NA 144,NA 3930,NA 4127,NA 4827,NA 1894,NA 872,NA 3019,NA 4028,NA 184,NA 3665,NA 1077,NA 81,NA 644,NA 3431,NA 3210,NA 1637,NA 1938,NA 3538,NA 3489,NA 4629,NA 2296,NA 2980,NA 826,NA 2835,NA 4491,NA 2973,NA 4159,NA 2968,NA 3895,NA 1989,NA 4250,NA 3710,NA 1589,NA 559,NA 506,NA 4001,NA 1900,NA 264,NA 4933,NA 3021,NA 744,NA 2694,NA 629,NA 4816,NA 235,NA 808,NA 4683,NA 4854,NA 3552,NA 4426,NA 4885,NA 175,NA 2194,NA 3223,NA 4975,NA 1574,NA 4280,NA 2702,NA 4368,NA 
2578,NA 4348,NA 4283,NA 1729,NA 1,NA 1026,NA 4727,NA 1407,NA 4405,NA 4806,NA 21,NA 2765,NA 1097,NA 3234,NA 3639,NA 4363,NA 1910,NA 4464,NA 4220,NA 3649,NA 193,NA 1670,NA 3747,NA 4566,NA 1022,NA 2359,NA 1926,NA 1964,NA 1092,NA 37,NA 1819,NA 2631,NA 4221,NA 680,NA 1883,NA 1317,NA 2490,NA 98,NA 436,NA 4980,NA 4711,NA 622,NA 576,NA 1834,NA 2356,NA 3921,NA 2452,NA 510,NA 4718,NA 3531,NA 2512,NA 2650,NA 1293,NA 3559,NA 4843,NA 3306,NA 3982,NA 367,NA 4424,NA 4134,NA 3629,NA 1837,NA 2618,NA 2350,NA 493,NA 2581,NA 2249,NA 2748,NA 3248,NA 796,NA 1469,NA 4457,NA 2941,NA 3167,NA 1298,NA 1592,NA 1697,NA 3804,NA 55,NA 316,NA 1320,NA 2970,NA 1488,NA 474,NA 3807,NA 3863,NA 2010,NA 296,NA 3752,NA 2642,NA 1380,NA 1307,NA 2720,NA 996,NA 3226,NA 3752,NA 1355,NA 4379,NA 4259,NA 230,NA 1906,NA 917,NA 4609,NA 4531,NA 965,NA 4322,NA 67,NA 4429,NA 1958,NA 381,NA 3234,NA 4584,NA 4173,NA 2507,NA 3011,NA 2345,NA 4432,NA 3353,NA 1969,NA 2757,NA 1213,NA 1017,NA 342,NA 1537,NA 4966,NA 582,NA 3578,NA 1131,NA 667,NA 4637,NA 4471,NA 1019,NA 1285,NA 3071,NA 2208,NA 1578,NA 507,NA 1364,NA 3269,NA 4640,NA 134,NA 2798,NA 4271,NA 380,NA 1030,NA 2480,NA 1310,NA 2080,NA 2196,NA 912,NA 392,NA 89,NA 3641,NA 4855,NA 2677,NA 833,NA 291,NA 2296,NA 3114,NA 2975,NA 3716,NA 2622,NA 2485,NA 74,NA 4790,NA 2266,NA 4908,NA 2724,NA 1801,NA 2516,NA 374,NA 4849,NA 3243,NA 4923,NA 2681,NA 3806,NA 2822,NA 3893,NA 3196,NA 1895,NA 1798,NA 4222,NA 2284,NA 896,NA 4832,NA 3568,NA 125,NA 3133,NA 4140,NA 3216,NA 3543,NA 4354,NA 1410,NA 867,NA 549,NA 2932,NA 4254,NA 4608,NA 3379,NA 1753,NA 44,NA 2155,NA 3625,NA 2062,NA 2755,NA 782,NA 216,NA 3424,NA 2573,NA 4729,NA 4216,NA 325,NA 3811,NA 392,NA 668,NA 4049,NA 2797,NA 3669,NA 1749,NA 4914,NA 2045,NA 1805,NA 3263,NA 718,NA 3404,NA 4297,NA 4194,NA 4407,NA 1189,NA 2894,NA 4490,NA 1723,NA 3805,NA 3656,NA 4263,NA 4880,NA 566,NA 4852,NA 3241,NA 281,NA 2366,NA 1474,NA 3052,NA 606,NA 3148,NA 3560,NA 3061,NA 173,NA 3330,NA 3265,NA 2260,NA 2585,NA 3384,NA 4405,NA 3657,NA 1994,NA 2153,NA 728,NA 2256,NA 2894,NA 353,NA 3712,NA 2747,NA 3173,NA 684,NA 4652,NA 3256,NA 2644,NA 1126,NA 4917,NA 546,NA 350,NA 3889,NA 3292,NA 1297,NA 4592,NA 744,NA 3204,NA 1007,NA 3719,NA 4239,NA 4269,NA 4018,NA 120,NA 3977,NA 4433,NA 3001,NA 2164,NA 4602,NA 3081,NA 2179,NA 4487,NA 3846,NA 641,NA 2694,NA 646,NA 2555,NA 2719,NA 1209,NA 4016,NA 4740,NA 2037,NA 2574,NA 4908,NA 1771,NA 2280,NA 1101,NA 410,NA 1847,NA 634,NA 3700,NA 4780,NA 3344,NA 2341,NA 2691,NA 1655,NA 3144,NA 2263,NA 4441,NA 3922,NA 691,NA 1407,NA 3535,NA 2211,NA 3389,NA 3504,NA 343,NA 4793,NA 1223,NA 4632,NA 2514,NA 4678,NA 2194,NA 1753,NA 2383,NA 4831,NA 1486,NA 1454,NA 4299,NA 967,NA 4046,NA 1828,NA 1264,NA 4281,NA 651,NA 3960,NA 1780,NA 4822,NA 594,NA 2291,NA 2619,NA 4186,NA 168,NA 217,NA 3961,NA 1014,NA 344,NA 4323,NA 386,NA 2156,NA 4869,NA 2855,NA 3773,NA 1213,NA 3136,NA 843,NA 2224,NA 824,NA 592,NA 1838,NA 4733,NA 4378,NA 1301,NA 3287,NA 610,NA 1595,NA 3116,NA 2235,NA 3542,NA 4451,NA 522,NA 3153,NA 4208,NA 1822,NA 3115,NA 2304,NA 818,NA 2570,NA 717,NA 3252,NA 777,NA 2542,NA 430,NA 2516,NA 193,NA 4121,NA 1430,NA 1234,NA 1990,NA 3161,NA 4743,NA 1701,NA 3137,NA 4125,NA 726,NA 4836,NA 431,NA 1203,NA 3195,NA 2517,NA 1253,NA 4896,NA 3283,NA 450,NA 3153,NA 4384,NA 4652,NA 2098,NA 2478,NA 1764,NA 1244,NA 4794,NA 1800,NA 995,NA 3632,NA 841,NA 1133,NA 4228,NA 1730,NA 337,NA 136,NA 591,NA 69,NA 3679,NA 4620,NA 4911,NA 2027,NA 1349,NA 2442,NA 256,NA 13,NA 2480,NA 1219,NA 1279,NA 2762,NA 1258,NA 3143,NA 1581,NA 4623,NA 4533,NA 460,NA 3689,NA 2849,NA 3483,NA 3504,NA 200,NA 2158,NA 
4072,NA 2833,NA 2942,NA 4402,NA 3597,NA 4683,NA 2148,NA 1431,NA 3854,NA 3083,NA 797,NA 2008,NA 9,NA 2090,NA 3820,NA 3973,NA 1213,NA 3796,NA 146,NA 2187,NA 2653,NA 2150,NA 4047,NA 4613,NA 3376,NA 470,NA 988,NA 2378,NA 3572,NA 2691,NA 4377,NA 1468,NA 1124,NA 3455,NA 1562,NA 2417,NA 609,NA 3451,NA 1579,NA 4081,NA 2730,NA 4737,NA 193,NA 3239,NA 399,NA 2165,NA 3805,NA 1469,NA 537,NA 365,NA 1782,NA 2858,NA 3390,NA 3454,NA 1868,NA 490,NA 496,NA 3875,NA 758,NA 1974,NA 4675,NA 3698,NA 3179,NA 1692,NA 4813,NA 559,NA 3253,NA 4918,NA 896,NA 690,NA 283,NA 2732,NA 2333,NA 4482,NA 93,NA 4255,NA 2508,NA 831,NA 1806,NA 3261,NA 4371,NA 3642,NA 2063,NA 797,NA 4229,NA 4422,NA 1980,NA 191,NA 4757,NA 3919,NA 1098,NA 1655,NA 889,NA 1813,NA 1958,NA 4520,NA 1383,NA 697,NA 2257,NA 552,NA 4405,NA 2670,NA 3697,NA 3598,NA 1323,NA 3370,NA 1049,NA 3453,NA 974,NA 3911,NA 76,NA 4671,NA 423,NA 171,NA 1555,NA 3924,NA 1403,NA 827,NA 2168,NA 4071,NA 3433,NA 3887,NA 457,NA 3714,NA 1984,NA 1481,NA 3715,NA 2333,NA 3866,NA 111,NA 4076,NA 1520,NA 4659,NA 2703,NA 1275,NA 2388,NA 3523,NA 38,NA 3863,NA 1329,NA 4856,NA 953,NA 99,NA 3062,NA 2629,NA 3173,NA 1978,NA 875,NA 1637,NA 3074,NA 396,NA 2596,NA 1532,NA 3357,NA 1969,NA 3740,NA 695,NA 1887,NA 3207,NA 4971,NA 1843,NA 1687,NA 4569,NA 4548,NA 0.23837975086644292,2015-01-16 0.41167997173033655,2018-05-18 0.7460716762579978,2015-09-05 0.723450553836301,2012-11-28 0.614524137461558,2020-01-13 0.473980569280684,2016-04-17 0.5784610391128808,2011-05-14 0.2415937229525298,2020-07-18 0.11437866208143532,2011-04-30 0.2983446326106787,2010-05-11 0.48411949491128325,2014-11-02 0.5674063181504607,2014-06-23 0.7539531090296805,2017-05-31 0.5454252359922975,2017-11-18 0.759677961235866,2013-04-20 0.21296746260486543,2010-12-11 0.8392650238238275,2022-10-10 0.40669705532491207,2010-11-11 0.8737398001831025,2015-03-22 0.8821565378457308,2013-12-25 0.10768936760723591,2011-02-27 0.5745443711057305,2015-12-14 0.996033379342407,2021-02-10 0.8505534324795008,2012-11-28 0.4376550551969558,2020-04-04 0.32642992469482124,2021-04-08 0.3359688585624099,2022-03-13 0.23927381564863026,2010-03-28 0.42137112445198,2023-01-04 0.5121368307154626,2012-04-02 0.1854463662020862,2015-11-09 0.3264005221426487,2017-01-13 0.8170736429747194,2011-02-25 0.7487379980739206,2019-04-04 0.5317418694030493,2022-04-01 0.9615713683888316,2016-05-04 0.10404637176543474,2010-06-02 0.7371236665640026,2017-03-08 0.847479980904609,2014-11-17 0.6437387536279857,2011-07-08 0.23245719773694873,2010-03-26 0.9162295656278729,2015-11-20 0.4245975555386394,2018-08-28 0.19294350570999086,2017-09-17 0.7015503356233239,2010-10-17 0.9925807097461075,2023-08-21 0.7089125071652234,2015-10-13 0.5174851573538035,2017-09-09 0.7087320478167385,2021-03-24 0.3751404786016792,2016-03-01 0.2547737658023834,2021-10-09 0.9964129347354174,2020-05-17 0.5228953601326793,2022-09-10 0.7158094178885221,2011-02-13 0.0024924282915890217,2010-09-04 0.2929687723517418,2014-09-23 0.1808160012587905,2010-01-16 0.4075938919559121,2019-04-15 0.21699588908813894,2018-04-11 0.07134267035871744,2011-08-20 0.2533115807455033,2021-08-08 0.6524795212317258,2022-07-27 0.7344441062305123,2013-06-29 0.8175131441093981,2013-09-28 0.30599033809266984,2014-10-16 0.8852475683670491,2020-06-03 0.6065588523633778,2015-07-15 0.8810191683005542,2012-10-24 0.6799206326249987,2018-01-16 0.1429436623584479,2018-03-12 0.7654655806254596,2010-12-14 0.6269666294101626,2020-11-30 0.7303384605329484,2020-06-26 0.4237044109031558,2021-05-31 0.26833077566698194,2010-02-14 
0.7690480363089591,2018-03-20 0.587346678134054,2016-06-28 0.8985677554737777,2015-05-10 0.8703245387878269,2014-05-17 0.39930623723194003,2010-11-04 0.9651191120501608,2015-05-03 0.49599104514345527,2020-10-09 0.46659751585684717,2019-10-28 0.10774453124031425,2011-02-08 0.9142980496399105,2015-09-25 0.14380344981327653,2010-12-23 0.26518719107843935,2018-02-13 0.14846304850652814,2019-01-31 0.41424868325702846,2017-06-14 0.10894967359490693,2020-09-16 0.24707794794812799,2018-04-30 0.5906431097537279,2011-03-23 0.32826729747466743,2022-11-26 0.7329705220181495,2022-05-31 0.7279052240774035,2019-10-16 0.5457234212663025,2018-10-09 0.7337463176809251,2016-07-10 0.9333583756815642,2010-05-17 0.24339259508997202,2016-01-13 0.282959054922685,2015-01-17 0.2166259593795985,2018-04-14 0.12004142836667597,2021-07-23 0.17026365920901299,2019-08-09 0.521528884768486,2018-11-05 0.9634940281976014,2013-11-07 0.8878725194372237,2020-03-21 0.6364465965889394,2021-04-28 0.32938025146722794,2019-08-24 0.6022224121261388,2013-04-15 0.7721615340560675,2016-06-14 0.2101352927275002,2011-05-22 0.5843083660583943,2020-01-13 0.5420758109539747,2020-07-09 0.09239664277993143,2023-02-06 0.3693408251274377,2014-08-14 0.9524294231086969,2013-10-20 0.9496892413590103,2015-06-21 0.44977682176977396,2013-04-15 0.25981824356131256,2014-11-16 0.018906170036643744,2014-04-12 0.7214050476904958,2022-02-24 0.9528096492867917,2012-12-04 0.44022330385632813,2014-06-30 0.5040123793296516,2019-01-16 0.04355699848383665,2021-12-12 0.9224744557868689,2019-03-08 0.9237895561382174,2017-11-18 0.60564771364443,2013-07-14 0.004081981023773551,2011-03-07 0.656856436515227,2021-07-19 0.1509289499372244,2011-05-02 0.1991606669034809,2016-04-16 0.23963832925073802,2010-03-24 0.6615739674307406,2018-07-28 0.2417888215277344,2016-04-28 0.07154973852448165,2014-09-22 0.7046719279605895,2020-05-10 0.3153969100676477,2011-01-25 0.37587519478984177,2014-05-08 0.8150977415498346,2012-07-01 0.2005599664989859,2023-02-28 0.30193018657155335,2020-11-25 0.08772024232894182,2023-05-27 0.1833201281260699,2011-04-21 0.20680187526158988,2019-02-11 0.8549594988580793,2021-09-28 0.262909896671772,2013-11-29 0.9453342743217945,2017-07-05 0.7927354699932039,2012-10-08 0.8315818924456835,2022-11-27 0.618977224919945,2018-12-19 0.44373362860642374,2022-12-03 0.9646256130654365,2010-12-18 0.16219870373606682,2010-12-27 0.1856537905987352,2014-09-29 0.6131014195270836,2021-06-09 0.48148447810672224,2016-02-20 0.3961378976237029,2021-11-17 0.31875640782527626,2010-04-25 0.8383750088978559,2023-08-19 0.2992035255301744,2012-03-10 0.6160618360154331,2010-07-12 0.4621038355398923,2019-01-28 0.939205955946818,2010-01-05 0.006291386438533664,2016-07-03 0.3494274849072099,2018-07-01 0.5662713926285505,2016-05-30 0.5289570635650307,2015-11-26 0.6370153992902488,2016-09-02 0.35079587949439883,2020-03-12 0.9241711974609643,2013-03-27 0.4740412225946784,2016-10-19 0.7781341173686087,2011-10-24 0.7987365354783833,2019-02-01 0.878499910235405,2012-12-25 0.9098438550718129,2020-02-29 0.4455377559643239,2010-07-09 0.28955932846292853,2018-12-04 0.3033107363153249,2014-07-31 0.9190243480261415,2013-01-11 0.9357123947702348,2010-04-17 0.6716483265627176,2021-01-05 0.37126996577717364,2023-08-27 0.5671314361970872,2014-12-16 0.4785984419286251,2022-10-20 0.5677487845532596,2019-01-04 0.2865388400387019,2017-01-12 0.759599128505215,2017-11-03 0.22846577316522598,2020-07-20 0.42650539334863424,2010-06-02 0.45658472809009254,2011-05-26 0.709906758973375,2014-03-05 
0.28745697857812047,2015-12-17 0.7730602626688778,2015-12-28 0.5745622855611145,2013-09-16 0.02179576293565333,2022-02-27 0.7730264803394675,2022-04-02 0.05912893358618021,2017-01-29 0.02764830063097179,2018-03-15 0.7408465940970927,2018-12-18 0.5933208465576172,2010-08-10 0.6866767185274512,2022-05-06 0.7353102252818644,2012-09-08 0.46110520721413195,2021-01-18 0.661999277304858,2016-01-22 9.579434990882874e-4,2020-01-05 0.9759655296802521,2015-02-08 0.22016345639713109,2010-12-13 0.30632783845067024,2015-11-29 0.731566637288779,2017-08-03 0.13350622565485537,2011-06-27 0.0998360610101372,2022-05-10 0.171385153895244,2010-05-24 0.6171815898269415,2011-05-09 0.6832633081357926,2016-05-01 0.7905740689020604,2012-04-02 0.21324812644161284,2017-01-09 0.7920108856633306,2022-11-30 0.5926344085019082,2019-04-13 0.6062636019196361,2015-10-09 0.11076854448765516,2022-07-20 0.19560863845981658,2017-10-25 0.007070775609463453,2014-08-21 0.940870595164597,2021-04-29 0.06820935127325356,2020-04-19 0.13692918047308922,2013-03-20 0.332817024551332,2014-08-25 0.4896882916800678,2014-09-27 0.17295454000122845,2014-02-20 0.06493924162350595,2018-07-18 0.45166698982939124,2013-05-20 0.3794023538939655,2011-04-26 0.9092983675654978,2012-07-29 0.029946457827463746,2010-06-26 0.19479636382311583,2017-02-12 0.6794793712906539,2012-06-16 0.7474663024768233,2017-11-09 0.6495377144310623,2018-08-10 0.07758067711256444,2013-09-19 0.599653656128794,2017-12-03 0.8790108121465892,2013-03-08 0.7320371561218053,2011-09-25 0.08005308615975082,2023-08-25 0.8327498119324446,2016-06-30 0.8647056978661567,2011-03-29 0.712964971549809,2013-04-07 0.757407110882923,2013-07-20 0.12243391619995236,2010-05-10 0.1936978818848729,2012-05-28 0.31417828937992454,2018-06-28 0.13753298204392195,2013-01-10 0.8478028802201152,2010-11-25 0.4852219868917018,2018-10-31 0.47024272638373077,2014-04-13 0.7910453744698316,2018-10-01 0.31298327283002436,2023-05-03 0.3087645126506686,2012-03-24 0.34515533596277237,2017-04-30 0.8284433148801327,2018-01-05 0.2739954984281212,2014-05-14 0.430836085928604,2016-05-30 0.4168978400994092,2018-10-03 0.13431219942867756,2011-01-24 0.6863413986284286,2023-08-05 0.17207811656408012,2021-11-10 0.9234934435226023,2020-12-15 0.6137435929849744,2021-10-08 0.31562944664619863,2020-12-10 0.8097330906894058,2014-07-09 0.9023741011042148,2015-11-19 0.1252977994736284,2012-12-28 0.5655571934767067,2015-01-14 0.12764110649004579,2010-06-23 0.6275976162869483,2023-04-15 0.07532395399175584,2011-10-16 0.2854513239581138,2016-03-21 0.31060242909006774,2021-09-18 0.14672756171785295,2011-12-11 0.7997705133166164,2010-12-16 0.1719960793852806,2017-10-16 0.7856838123407215,2010-10-30 0.4700357641559094,2019-12-12 0.4329577290918678,2020-09-13 0.5745328599587083,2017-05-17 0.7299699452705681,2021-09-27 0.8941871484275907,2014-12-31 0.2203063692431897,2015-11-07 0.2915964382700622,2018-10-26 0.8128987492527813,2022-11-17 0.043602329678833485,2010-11-07 0.5052716645877808,2015-09-10 0.24165588174946606,2010-01-27 0.5891424184665084,2021-04-13 0.9711405686102808,2019-03-19 0.23272100347094238,2019-01-11 0.8674180153757334,2014-06-13 0.1107617428060621,2011-05-18 0.8014917799737304,2016-07-08 0.6579244541935623,2012-12-21 0.652054297272116,2013-10-21 0.2263226448558271,2010-02-28 0.8539796313270926,2018-03-15 0.435607039835304,2016-02-28 0.28928173682652414,2017-07-23 0.6375846704468131,2022-09-07 0.2645394585561007,2017-12-26 0.38626837776973844,2011-01-09 0.6191939699929208,2011-06-02 0.5416780826635659,2022-09-20 
0.4848310004454106,2013-06-22 0.7642949193250388,2013-02-18 0.9610269367694855,2014-07-02 0.2705845332238823,2015-03-19 0.7306002208497375,2013-10-21 0.13416554615832865,2018-08-28 0.04839822766371071,2014-03-08 0.7036070702597499,2013-09-21 0.14721379429101944,2015-02-19 0.769155333051458,2015-04-04 0.17635010997764766,2021-04-26 0.11075899936258793,2016-02-12 0.9383424082770944,2019-09-09 0.8464711401611567,2020-11-07 0.5711435815319419,2021-02-15 0.6790934063028544,2015-05-18 0.08932224358431995,2013-11-12 0.7853741250000894,2023-06-25 0.22695744805969298,2013-07-10 0.44817846501246095,2021-10-23 0.16122763720341027,2019-11-16 0.1761116897687316,2020-07-24 0.19822812359780073,2010-10-08 0.3576113139279187,2011-07-23 0.1813332038000226,2019-12-15 0.5611448597628623,2018-08-10 0.6590830096974969,2020-08-27 0.6602534090634435,2014-01-24 0.0024007875472307205,2020-10-14 0.9934460341464728,2022-05-01 0.6274892308283597,2016-05-05 0.014630335848778486,2017-01-02 0.20517821749672294,2012-05-28 0.6630766640882939,2014-09-28 0.4637440303340554,2018-06-23 0.36034815781749785,2013-05-03 0.6893663913942873,2022-12-04 0.258860788308084,2010-12-07 0.8512050320859998,2021-03-23 0.4800046910531819,2022-09-04 0.5372663901653141,2014-05-05 0.6616398973856121,2014-12-28 0.3008545101620257,2010-11-11 0.2635015156120062,2016-08-17 0.305046129738912,2010-07-24 0.8749582655727863,2022-03-28 0.7184372169431299,2017-11-15 0.40691969403997064,2011-03-23 0.019359473139047623,2022-02-21 0.050180358812212944,2013-11-17 0.7022510319948196,2015-09-28 0.06637026951648295,2013-04-16 0.03613236825913191,2010-10-27 0.020153695717453957,2010-06-09 0.37278109695762396,2018-10-31 0.22566540399566293,2015-07-07 0.4919181000441313,2019-04-10 0.4466451567132026,2015-01-19 0.6902048990596086,2012-08-29 0.8427399797365069,2015-07-02 0.37583366711623967,2020-04-06 0.9848896882031113,2019-04-10 0.9524263297207654,2013-10-15 0.989898509113118,2014-06-03 0.4431283543817699,2016-02-11 0.1537638516165316,2017-03-03 0.9447273225523531,2016-08-05 0.5194664136506617,2013-02-19 0.45563460420817137,2018-09-24 0.22009019972756505,2010-03-19 0.139182384358719,2022-12-23 0.216157881077379,2015-08-03 0.4056786729488522,2012-12-04 0.23373459139838815,2021-01-29 0.025066359667107463,2015-08-08 0.7523419591598213,2012-04-06 0.7320725433528423,2011-01-09 0.40910677472129464,2014-03-02 0.6308578054886311,2014-02-10 0.0654449830763042,2012-06-09 0.14869215176440775,2022-04-15 0.279701764928177,2010-12-05 0.8506568092852831,2013-03-31 0.021540780318900943,2015-12-12 0.2528298799879849,2017-08-22 0.6567115010693669,2022-12-01 0.7891494096256793,2021-05-22 0.7088456475175917,2021-07-27 0.06459299195557833,2013-06-05 0.511311343871057,2011-09-22 0.20092834974639118,2021-11-28 0.8141155925113708,2012-06-27 0.6537800759542733,2023-08-08 0.35325198201462626,2023-09-06 0.5965948959346861,2023-02-18 0.7277043734211475,2011-12-15 0.9955685392487794,2010-11-12 0.49805527180433273,2013-04-25 0.23029476939700544,2022-08-15 0.30125431274063885,2015-10-09 0.7204666768666357,2014-09-26 0.2614054181613028,2020-07-25 0.1076963481027633,2019-06-03 0.2624227120541036,2016-07-28 0.18670618324540555,2020-06-17 0.5424416796304286,2022-05-26 0.06551847420632839,2010-07-05 0.8803851366974413,2017-04-26 0.2792125369887799,2022-02-19 0.6727036715019494,2013-08-15 0.060130874160677195,2010-10-19 0.9884855502750725,2020-03-08 0.28227543109096587,2021-11-05 0.5541123666334897,2016-12-14 0.8900840044952929,2023-02-13 0.02280205488204956,2020-05-13 0.6776884538121521,2016-03-20 
0.33705979655496776,2023-05-22 0.14119609468616545,2016-06-13 0.3525639877188951,2016-07-15 0.4014448565430939,2018-06-22 0.8889143522828817,2018-02-23 0.31261418759822845,2017-08-24 0.06165470811538398,2016-09-23 0.6794862640090287,2022-10-15 0.3781625689007342,2012-10-13 0.5883703480940312,2017-04-03 0.4919785351958126,2022-08-05 0.28577694413252175,2019-01-01 0.008419594960287213,2013-03-19 0.5507742969784886,2017-07-26 0.8132309077773243,2022-06-09 0.6435745854396373,2020-10-10 0.06906200293451548,2012-11-09 0.7287162716966122,2018-10-09 0.6721111985389143,2023-03-13 0.8198009913321584,2020-06-12 0.9146215580403805,2020-04-28 0.6899706807453185,2017-07-02 0.49197780271060765,2017-05-03 0.6469286507926881,2013-03-16 0.45651495666243136,2018-08-04 0.12544662156142294,2015-05-05 0.6043649739585817,2017-03-28 0.8268267358653247,2013-11-16 0.6193782512564212,2018-02-09 0.8410537105519325,2019-07-27 0.42724660760723054,2023-05-16 0.7692867037840188,2011-06-25 0.712140791118145,2012-05-04 0.0633672084659338,2013-02-15 0.4260052361059934,2013-12-13 0.08827764308080077,2022-12-02 0.20831681927666068,2018-03-25 0.22826087311841547,2014-12-11 0.05044214380905032,2015-10-10 0.5618212523404509,2010-11-23 0.6942461444996297,2019-02-13 0.22794265439733863,2022-10-21 0.9603136049117893,2017-03-27 0.4928371913265437,2020-12-18 0.7232086854055524,2010-11-08 0.49136308254674077,2018-08-31 0.8453550811391324,2019-03-31 0.025109663605690002,2019-06-20 0.5484002430457622,2021-01-29 0.9554548165760934,2021-02-03 0.14050186681561172,2011-05-02 0.7102736248634756,2012-10-14 0.12640188890509307,2011-12-04 0.8356562776025385,2019-02-18 0.2981795039959252,2011-10-05 0.381328749936074,2011-06-13 0.24784933566115797,2016-12-21 0.3449816491920501,2020-04-22 0.41978342621587217,2016-09-24 0.6591099870856851,2012-11-27 0.9539570489432663,2015-05-07 0.4698936538770795,2018-12-25 0.15062109171412885,2016-09-18 0.9025228463578969,2016-09-11 0.4380257027223706,2020-11-21 0.8067555839661509,2011-04-22 0.48149546841159463,2013-02-07 0.25803821301087737,2013-04-14 0.17013581050559878,2019-08-23 0.1606106914114207,2010-12-23 0.66139040957205,2010-10-18 0.46855212026275694,2016-05-22 0.5453928408678621,2012-03-21 0.725098253460601,2017-03-01 0.5254825404845178,2023-05-15 0.618936445331201,2011-01-30 0.1789609114639461,2010-06-11 0.7677212303970009,2015-08-10 0.8162949671968818,2012-06-27 0.19458237988874316,2020-03-18 0.21251409477554262,2010-05-09 0.24883011914789677,2012-04-04 0.7480038029607385,2012-11-25 0.40724376146681607,2013-05-02 0.5616738076787442,2010-06-04 0.7505097503308207,2011-05-16 0.5511977674905211,2013-02-07 0.473349581239745,2011-04-24 0.06262986944057047,2013-10-19 0.0939460473600775,2010-11-26 0.5801826189272106,2013-02-20 0.38567587174475193,2017-11-19 0.2365208996925503,2020-05-09 0.5195376325864345,2022-12-20 0.6412099292501807,2016-07-10 0.829900240059942,2015-10-24 0.9190941501874477,2010-06-06 0.9294001522939652,2016-08-21 0.6453137448988855,2011-05-23 0.783117612125352,2019-10-06 0.05871596094220877,2010-07-07 0.35836152103729546,2020-02-18 0.479386042105034,2014-03-09 0.7230917664710432,2018-06-27 0.6964026989880949,2010-07-27 0.870710554998368,2014-09-25 0.708805855596438,2017-01-25 0.3548054692801088,2018-10-31 0.8072527183685452,2016-03-03 0.35278886649757624,2021-05-23 0.7531260862015188,2013-09-13 0.9081510829273611,2020-06-04 0.4353852095082402,2017-09-21 0.22024713945575058,2011-09-24 0.03718220675364137,2013-06-24 0.6980540752410889,2012-03-27 0.14983401424251497,2015-01-20 
0.5424376127775759,2012-06-12 0.7967723628971726,2020-01-16 0.7196246690582484,2016-06-26 0.7281896565109491,2021-08-12 0.04781616129912436,2011-04-14 0.45137571380473673,2011-10-05 0.794269957812503,2013-10-29 0.8246223253663629,2021-03-27 0.20905156270600855,2015-05-28 0.16820653341710567,2022-02-26 0.9802742237225175,2018-02-20 0.600670009618625,2013-05-17 0.15167629974894226,2015-02-23 0.29273867909796536,2011-11-17 0.4480606229044497,2011-05-10 0.8239078253973275,2018-10-18 0.9694043302442878,2021-03-12 0.29540916392579675,2012-02-24 0.23268974153324962,2017-01-04 0.3254810383077711,2019-09-24 0.10037211910821497,2014-12-24 0.3302253605797887,2019-02-27 0.19259870192036033,2019-11-13 0.26887363637797534,2017-06-03 0.8835293431766331,2015-05-22 0.6706231615971774,2015-06-24 0.07432106742635369,2013-10-07 0.6853092038072646,2012-04-03 0.6008155907038599,2021-11-15 0.7061404753476381,2018-12-12 0.4681660116184503,2015-05-16 0.01098793395794928,2021-07-27 0.7832956942729652,2022-02-13 0.6218532985076308,2020-01-30 0.8660587386693805,2018-10-23 0.919852337334305,2014-04-23 0.3253989245276898,2013-02-12 0.9157620661426336,2010-11-15 0.08054490759968758,2011-01-29 0.8555176814552397,2020-01-06 0.30502897896803916,2012-05-30 0.7363630407489836,2022-03-12 0.940962569322437,2022-08-28 0.8610232374630868,2017-09-14 0.3641701233573258,2022-09-04 0.9125234829261899,2022-08-07 0.752922203624621,2012-02-01 0.6414824086241424,2018-12-17 0.7954503307119012,2018-05-07 0.9849717258475721,2018-08-07 0.6223692377097905,2014-09-25 0.5553264871705323,2010-12-29 0.9610665023792535,2018-03-21 0.9156397173646837,2012-11-27 0.6032756008207798,2011-06-04 0.311554106650874,2019-04-29 0.5551521240267903,2016-03-22 0.9375977437011898,2011-09-19 0.36503715231083333,2022-07-30 0.33863229816779494,2015-05-20 0.7696835622191429,2010-12-17 0.301342302467674,2019-04-06 0.6296409552451223,2016-05-21 0.8024997254833579,2016-10-03 0.5422355639748275,2019-09-10 0.6309975676704198,2014-01-11 0.18978887028060853,2012-10-27 0.20345269003883004,2021-01-31 0.9283512588590384,2019-08-10 0.41344345500692725,2020-07-21 0.13096988503821194,2015-08-18 0.061163004487752914,2015-12-14 0.8860738726798445,2017-09-16 0.5922085058409721,2013-02-06 0.7224121852777898,2010-07-19 0.5123929986730218,2011-06-11 0.29606865253299475,2014-10-14 0.6397780675906688,2012-01-22 0.8825434281025082,2020-08-10 0.9461507303640246,2016-09-08 0.709416676312685,2020-02-03 0.9236205760389566,2015-05-14 0.016221591737121344,2018-10-01 0.17147828871384263,2019-05-26 0.21398976421914995,2021-01-18 0.8451151894405484,2021-03-27 0.24332171166315675,2018-04-24 0.5150503544136882,2012-03-23 0.8631874904967844,2020-02-02 0.040558676002547145,2010-12-07 0.4600282253231853,2020-09-25 0.16600484843365848,2020-11-13 0.9153843396343291,2011-02-14 0.4094238232355565,2021-07-25 0.6405321785714477,2016-03-05 0.16481841239146888,2021-09-10 0.18098014616407454,2023-01-09 0.996452712919563,2016-06-16 0.6451109414920211,2013-10-23 0.9180984173435718,2021-05-11 0.7950654453597963,2020-06-26 0.9130970847327262,2014-10-20 0.3905595827382058,2012-01-11 0.3574004932306707,2014-08-19 0.6143616286572069,2023-01-08 0.1924407461192459,2014-05-16 0.07183849718421698,2011-11-15 0.3062329371459782,2010-08-17 0.17457634513266385,2014-02-24 0.8877611239440739,2012-05-12 0.4978482248261571,2015-05-29 0.693908091634512,2015-01-04 0.874216026859358,2020-02-01 0.01808677427470684,2018-10-07 0.3819870548322797,2015-11-26 0.5135930245742202,2017-02-10 0.04722265945747495,2014-10-05 
0.8030951099935919,2021-12-03 0.6340869336854666,2015-01-20 0.7713282140903175,2014-02-04 0.5017637426499277,2018-04-18 0.7112887632101774,2019-05-16 0.09189838543534279,2019-08-08 0.10590877430513501,2022-08-16 0.22491388185881078,2020-04-28 0.4176635534968227,2016-05-30 0.3440130678936839,2020-12-01 0.6642059565056115,2014-10-01 0.44336367142386734,2019-04-05 0.30618356238119304,2019-08-04 0.26934600668027997,2018-03-07 0.27042659488506615,2012-12-02 0.0976896530482918,2018-08-12 0.9920599514152855,2018-03-05 0.1045265388675034,2018-06-14 0.43448846065439284,2012-12-26 0.684687570668757,2014-04-01 0.329821523046121,2019-12-01 0.39954269072040915,2016-02-12 0.6991565418429673,2021-11-11 0.2425231086090207,2016-11-23 0.027535082073882222,2012-05-15 0.07009002240374684,2020-11-12 0.023703276878222823,2020-05-19 0.20660110423341393,2012-07-12 0.6988863211590797,2014-08-06 0.9813835630193353,2013-03-17 0.7816515797749162,2011-09-26 0.6054745719302446,2019-03-02 0.20836171018891037,2018-02-10 0.7636784943751991,2012-10-04 0.8187859968747944,2015-10-27 0.7611123095266521,2012-11-22 0.8286271207034588,2010-08-30 0.008509289706125855,2016-06-28 0.08882123627699912,2023-04-25 0.91988012520596385,2011-04-07 0.6383964512497187,2012-11-01 0.4169857541564852,2013-05-04 0.7020355253480375,2018-11-22 0.16102612484246492,2013-07-26 0.3779098354279995,2014-06-06 0.9751168165821582,2019-01-21 0.4035551785491407,2013-04-01 0.723759297747165,2021-05-05 0.38518987968564034,2019-02-15 0.38908845093101263,2017-05-29 0.12964176665991545,2011-08-30 0.2847507023252547,2013-12-29 0.13074389309622347,2022-06-02 0.4740489721298218,2012-06-11 0.9800091898068786,2022-07-08 0.16834043501876295,2017-12-26 0.18153826682828367,2015-07-02 0.8789390495512635,2017-06-27 0.6766599684488028,2014-08-06 0.5074436131399125,2022-06-22 0.4814086586702615,2021-10-30 0.9683199205901474,2011-06-18 0.24795010755769908,2020-04-03 0.13281461247242987,2017-12-24 0.06826614774763584,2017-01-06 0.40022375574335456,2014-01-08 0.34733960195444524,2023-07-28 0.08134637214243412,2022-07-29 0.04008660069666803,2023-08-17 0.26790826581418514,2021-09-06 0.4845776897855103,2022-02-24 0.6038457204122096,2016-04-07 0.2126810213085264,2011-06-13 0.08015722362324595,2013-10-31 0.17985428147949278,2018-12-23 0.7949596226681024,2015-02-25 0.5292033553123474,2021-04-19 0.4661243304144591,2019-05-10 0.8921737256459892,2014-05-12 0.7419538695830852,2011-11-06 0.7637358212377876,2022-12-01 0.8440997828729451,2018-03-08 0.7052174902055413,2019-12-05 0.9484422185923904,2014-04-20 0.14429178135469556,2019-04-22 0.8038033130578697,2013-09-01 0.2943441355600953,2013-02-16 0.38137339940294623,2015-10-28 0.23724128119647503,2021-05-20 0.34614214673638344,2012-06-30 0.42900022584944963,2017-09-05 0.5998602632898837,2018-09-17 0.565516313072294,2013-01-10 0.4661923705134541,2019-02-25 0.23348797275684774,2022-07-29 0.740812616655603,2011-04-18 0.29534474899992347,2021-03-29 0.08237326238304377,2014-10-04 0.27436478761956096,2021-07-29 0.9310599053278565,2021-01-23 0.8814145622309297,2019-08-20 0.4742308217100799,2011-11-21 0.5750370132736862,2012-03-20 0.21033262298442423,2013-10-07 0.5982999296393245,2022-01-31 0.02650217106565833,2021-02-16 0.8523679610807449,2014-02-21 0.5338073449674994,2011-02-25 0.09864674136042595,2015-06-13 0.6973787155002356,2010-09-05 0.6462127384729683,2015-02-02 0.9212825754657388,2013-12-09 0.28879159269854426,2017-04-01 0.65436782524921,2010-03-17 0.6188365686684847,2015-07-05 0.644347591791302,2011-12-24 0.5879467707127333,2011-03-01 
0.9590033662971109,2021-08-27 0.16798287397250533,2016-08-17 0.24776496808044612,2021-09-24 0.5207485517021269,2019-01-31 0.13296581688337028,2020-05-10 0.8159506213851273,2017-12-10 0.30784280761145055,2017-08-09 0.3464580220170319,2010-08-14 0.32439053687267005,2015-10-04 0.8299951327499002,2020-02-17 0.16978011513128877,2017-01-12 0.27819421770982444,2012-03-11 0.3639769915025681,2018-10-17 0.06989352311939001,2021-05-26 0.5890974695794284,2017-07-12 0.08413626649416983,2010-09-03 0.2758814513217658,2013-11-30 0.0892041027545929,2021-09-30 0.9139310284517705,2017-08-14 0.23067126562818885,2016-04-02 0.9596100023481995,2018-09-28 0.31913768011145294,2023-04-20 0.43079651868902147,2020-06-18 0.9163004402071238,2011-10-02 0.8421652615070343,2022-01-12 0.9530339573975652,2015-05-05 0.3136253524571657,2012-08-28 0.8803836130537093,2012-07-18 0.29948478611186147,2021-10-23 0.4408169274684042,2017-08-04 0.44570411927998066,2019-03-12 0.42943084822036326,2013-02-09 0.8614283904898912,2010-01-27 0.7890478519257158,2019-07-23 0.3662304144818336,2023-07-17 0.33877988043241203,2015-10-21 0.9619562041480094,2017-03-21 0.8873374862596393,2017-01-02 0.4318412118591368,2018-11-03 0.8925788707565516,2018-10-07 0.1908249231055379,2017-07-06 0.753541242564097,2016-09-09 0.18671885086223483,2019-11-10 0.4893101565539837,2021-12-21 0.1323064104653895,2021-10-13 0.6215600143186748,2018-11-04 0.3441609856672585,2020-06-19 0.5986538652796298,2018-08-01 0.5948208479676396,2017-04-27 0.08747628959827125,2012-10-14 0.7449057816993445,2022-07-29 0.7255401618313044,2022-07-30 0.802798884222284,2015-04-04 0.5034499294124544,2015-10-23 0.26467121997848153,2014-03-05 0.5361411133781075,2018-04-26 0.2134377434849739,2018-10-31 0.2555720009841025,2011-12-01 0.3432095227763057,2023-09-05 0.3149803134147078,2010-02-20 0.903441054513678,2012-09-27 0.5070839948020875,2013-12-28 0.8868092112243176,2013-01-28 0.19502249849028885,2016-06-14 0.9889192474074662,2018-01-26 0.9127213363535702,2021-08-17 0.7590857506729662,2020-11-15 0.8878286243416369,2018-01-05 0.2729664109647274,2019-03-11 0.9270147723145783,2014-04-03 0.8476126017048955,2012-12-01 0.4657681928947568,2022-10-19 0.6940696041565388,2014-01-04 0.26842982484959066,2013-11-07 0.19049296411685646,2019-09-04 0.41361317480914295,2023-01-19 0.23820438305847347,2010-08-31 0.09241898846812546,2012-03-04 0.2726121188607067,2019-08-02 0.009083196753636003,2022-03-06 0.629982847487554,2022-06-09 0.07927433913573623,2021-10-07 0.3504166591446847,2022-12-17 0.6004056162200868,2013-09-14 0.9690369053278118,2017-03-29 0.6163354294840246,2019-07-03 0.5224107033573091,2013-04-09 0.5226436799857765,2019-04-11 0.00876278686337173,2012-08-17 0.4591184495948255,2022-10-01 0.4761861457955092,2015-03-13 0.974526327336207,2021-02-08 0.6641715527512133,2010-07-26 0.8101816652342677,2014-01-19 0.9218756454065442,2021-08-26 0.09567142208106816,2022-05-02 0.9353634966537356,2023-07-12 0.5359931767452508,2019-11-19 0.8296154425479472,2023-02-19 0.8165666493587196,2011-04-23 0.30543361068703234,2017-06-16 0.7086418280377984,2018-07-20 0.17579243425279856,2020-09-14 0.19219414866529405,2022-02-06 0.18565151165239513,2022-05-09 0.4843596222344786,2021-03-21 0.7586447366047651,2015-04-13 0.3020715794991702,2012-12-15 0.38378978963010013,2019-08-14 0.18092394573614,2014-08-31 0.6372511743102223,2017-02-25 0.5597414104267955,2019-05-10 0.8500275288242847,2023-06-13 0.6701601550448686,2017-11-09 0.6118010880891234,2020-09-14 0.9065461044665426,2011-04-10 0.3120599687099457,2016-12-10 
0.5973760541528463,2022-01-16 0.6979898712597787,2021-03-17 0.8268592851236463,2018-02-06 0.9671381479129195,2017-02-22 0.36611850443296134,2019-05-22 0.8452709591947496,2023-02-05 0.391217652708292,2014-01-22 0.6951273591257632,2020-03-19 0.6493835819419473,2022-07-17 0.023566172923892736,2013-09-16 0.926038958132267,2011-04-28 0.9850510796532035,2016-04-15 0.9585321145132184,2020-02-05 0.26632869709283113,2013-01-09 0.6759593775495887,2021-07-09 0.8263764544390142,2011-06-13 0.7603731814306229,2015-01-14 0.3346221512183547,2019-10-18 0.9804811442736536,2016-05-18 0.9473683452233672,2013-09-15 0.509538036538288,2010-08-04 0.3867357175331563,2018-12-18 0.5971393240615726,2012-06-02 0.13570102746598423,2021-02-28 0.6072117269504815,2020-11-05 0.6190444205421954,2016-11-16 0.1604869430884719,2019-08-06 0.22741486108861864,2012-09-16 0.4889993858523667,2021-07-24 0.26625592773780227,2022-03-23 0.986886880826205,2020-11-30 0.6590274758636951,2018-10-18 0.5617879598867148,2019-01-19 0.6039721027482301,2013-01-01 0.19239175505936146,2013-06-26 0.3716695522889495,2014-09-30 0.12009952031075954,2019-09-14 0.3957092612981796,2010-01-04 0.03923126310110092,2014-01-15 0.6294073443859816,2012-06-19 0.5232696952298284,2015-12-01 0.3931053976994008,2022-04-14 0.8778933002613485,2013-01-14 0.2882499238476157,2017-05-17 0.6321781876031309,2013-09-23 0.313025128794834,2010-11-25 0.14582274248823524,2023-01-05 0.8205009659286588,2017-03-13 0.7456198027357459,2020-12-14 0.6778734670951962,2010-03-23 0.20513771777041256,2018-08-29 0.9919730878900737,2018-09-19 0.6689565279521048,2014-09-11 0.7538818956818432,2022-12-29 0.6451980541460216,2021-03-04 0.10516616073437035,2023-05-21 0.04980481299571693,2022-02-03 0.5507950552273542,2018-01-16 0.027205367805436254,2016-08-03 0.18725806567817926,2013-01-15 0.6483364240266383,2020-09-27 0.8247189852409065,2011-10-22 0.9155435566790402,2022-01-10 0.8255569902248681,2021-08-03 0.7955550437327474,2015-05-20 0.6881147245876491,2021-02-07 0.3386629270389676,2015-03-01 0.46830290742218494,2010-09-07 0.8369869156740606,2015-04-22 0.7704877557698637,2018-02-24 0.5956799318082631,2012-11-21 0.5965282435063273,2010-03-27 0.17414100118912756,2016-05-01 0.47566762403585017,2017-06-19 0.9339482507202774,2016-06-11 0.05953748035244644,2018-03-30 0.14324546162970364,2020-06-10 0.42678032303228974,2013-11-08 0.5644535899627954,2017-07-12 0.18729942245408893,2016-08-12 0.6027495227754116,2022-05-12 0.7348782932385802,2020-08-11 0.06834881310351193,2011-10-26 0.7829179642722011,2015-12-09 0.921492709312588,2012-09-27 0.04428216675296426,2013-02-04 0.7131148546468467,2010-11-29 0.9038860204163939,2013-10-16 0.7395815039053559,2015-04-26 0.1721756304614246,2011-04-12 0.18658997677266598,2017-01-30 0.38248836481943727,2014-06-08 0.45361327519640326,2016-10-19 0.4551314772106707,2023-09-01 0.17310278164222836,2010-01-09 0.3054172566626221,2020-01-11 0.867752101039514,2016-12-16 0.2602322499733418,2010-01-03 0.6808707599993795,2016-04-23 0.8535765560809523,2016-08-10 0.805274312151596,2019-11-21 0.1635163405444473,2018-03-29 0.47193897631950676,2014-08-04 0.7183186465408653,2015-08-16 0.26987858884967864,2020-02-04 0.608237189007923,2019-01-06 readr/inst/extdata/mini-gapminder-americas.csv0000644000176200001440000000042514315646511021164 0ustar liggesuserscountry,year,lifeExp,pop,gdpPercap Argentina,1952,62.485,17876956,5911.315053 Bolivia,1952,40.414,2883315,2677.326347 Brazil,1952,50.917,56602560,2108.944355 Canada,1952,68.75,14785584,11367.16112 Chile,1952,54.745,6377619,3939.978789 
Colombia,1952,50.643,12350771,2144.115096 readr/inst/extdata/whitespace-sample.txt0000644000176200001440000000015314174704674020147 0ustar liggesusersfirst last state phone John Smith WA 418-Y11-4111 Mary Hartford CA 319-Z19-4341 Evan Nolan IL 219-532-c301 readr/inst/extdata/mini-gapminder-europe.csv0000644000176200001440000000042714315646511020701 0ustar liggesuserscountry,year,lifeExp,pop,gdpPercap Albania,1952,55.23,1282697,1601.056136 Austria,1952,66.8,6927772,6137.076492 Belgium,1952,68,8730405,8343.105127 Bosnia and Herzegovina,1952,53.82,2791000,973.5331948 Bulgaria,1952,59.6,7274900,2444.286648 Croatia,1952,61.21,3882229,3119.23652 readr/inst/WORDLIST0000644000176200001440000000140114315656753013531 0ustar liggesusersAcknowledgements Allaire BCP BDR BH BOM Bugfixes CMD Cheatsheet Cheng Codecov Colouring Eddelbuettel Florian Garmonsway Gb Internationalisation JJ Loitsch Māori NaN ORCID PLDI POSIXt Preprocess Rcpp Rds SSD TSV Timezones Tokenize Tokenizers YMD automata behaviour bz bzip centric cli coloured cpp csv datetime datetimes deduplicated delim dplyr durations funder fwf generalises github grisu gz gzip https linux localtime lzma macOS mandreyel meltr mio noamross null's nycflights parsers pkgdown pre programmatically purrr readr's recognise recognised recognises recognising reinitialization serialise shortcode specialisations stringi stringsAsFactors subetting testthat tibble tibbles tidyverse timezones tokenization tokenize tokenizer tsv tz tzdb valgrind vroom xz