csv-core-0.1.6/COPYING010064400017500000144000000001761313170244000125060ustar0000000000000000
This project is dual-licensed under the Unlicense and MIT licenses.

You may use this code under the terms of either license.
csv-core-0.1.6/Cargo.toml.orig010064400017500000144000000015231350500254600143440ustar0000000000000000
[package]
name = "csv-core"
version = "0.1.6"  #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "Bare bones CSV parsing with no_std support."
documentation = "https://docs.rs/csv-core"
homepage = "https://github.com/BurntSushi/rust-csv"
repository = "https://github.com/BurntSushi/rust-csv"
readme = "README.md"
keywords = ["csv", "comma", "parser", "delimited", "no_std"]
license = "Unlicense/MIT"
categories = ["encoding", "no-std", "parser-implementations"]
workspace = ".."
edition = "2018"

[badges]
travis-ci = { repository = "BurntSushi/rust-csv" }
appveyor = { repository = "BurntSushi/rust-csv" }

[lib]
bench = false

[features]
default = ["libc"]
libc = ["memchr/libc"]

[dependencies]
memchr = { version = "2", default-features = false }

[dev-dependencies]
arrayvec = { version = "0.4", default-features = false }
csv-core-0.1.6/Cargo.toml0000644000000024600000000000000106200ustar00
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g. crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)

[package]
edition = "2018"
name = "csv-core"
version = "0.1.6"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "Bare bones CSV parsing with no_std support."
homepage = "https://github.com/BurntSushi/rust-csv"
documentation = "https://docs.rs/csv-core"
readme = "README.md"
keywords = ["csv", "comma", "parser", "delimited", "no_std"]
categories = ["encoding", "no-std", "parser-implementations"]
license = "Unlicense/MIT"
repository = "https://github.com/BurntSushi/rust-csv"

[lib]
bench = false

[dependencies.memchr]
version = "2"
default-features = false

[dev-dependencies.arrayvec]
version = "0.4"
default-features = false

[features]
default = ["libc"]
libc = ["memchr/libc"]

[badges.appveyor]
repository = "BurntSushi/rust-csv"

[badges.travis-ci]
repository = "BurntSushi/rust-csv"
csv-core-0.1.6/LICENSE-MIT010064400017500000144000000020711313170244000131030ustar0000000000000000
The MIT License (MIT)

Copyright (c) 2015 Andrew Gallant

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
csv-core-0.1.6/README.md010064400017500000144000000060161350500255200127330ustar0000000000000000
csv-core
========
A fast CSV reader and writer for use in a `no_std` context. This crate will
never use the Rust standard library.

[![Linux build status](https://api.travis-ci.org/BurntSushi/rust-csv.png)](https://travis-ci.org/BurntSushi/rust-csv)
[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/rust-csv?svg=true)](https://ci.appveyor.com/project/BurntSushi/rust-csv)
[![](http://meritbadge.herokuapp.com/csv-core)](https://crates.io/crates/csv-core)

Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).

### Documentation

https://docs.rs/csv-core

### Usage

Add this to your `Cargo.toml`:

```toml
[dependencies]
csv-core = "0.1.6"
```

### Build features

This crate by default links with `libc`, which is done via the `libc`
feature. Disabling this feature will drop `csv-core`'s dependency on `libc`.

### Example: reading CSV

This example shows how to count the number of fields and records in CSV data.

```rust
use csv_core::{Reader, ReadFieldResult};

let data = "
foo,bar,baz
a,b,c
xxx,yyy,zzz
";

let mut rdr = Reader::new();
let mut bytes = data.as_bytes();
let mut count_fields = 0;
let mut count_records = 0;
loop {
    // We skip handling the output since we don't need it for counting.
    let (result, nin, _) = rdr.read_field(bytes, &mut [0; 1024]);
    bytes = &bytes[nin..];
    match result {
        ReadFieldResult::InputEmpty => {},
        ReadFieldResult::OutputFull => panic!("field too large"),
        ReadFieldResult::Field { record_end } => {
            count_fields += 1;
            if record_end {
                count_records += 1;
            }
        }
        ReadFieldResult::End => break,
    }
}
assert_eq!(3, count_records);
assert_eq!(9, count_fields);
```

### Example: writing CSV

This example shows how to use the `Writer` API to write valid CSV data. Proper
quoting is handled automatically.

```rust
use csv_core::Writer;

// This is where we'll write out CSV data.
let mut out = &mut [0; 1024];
// The number of bytes we've written to `out`.
let mut nout = 0;

// Create a CSV writer with a default configuration.
let mut wtr = Writer::new();

// Write a single field. Note that we ignore the `WriteResult` and the number
// of input bytes consumed since we're doing this by hand.
let (_, _, n) = wtr.field(&b"foo"[..], &mut out[nout..]);
nout += n;

// Write a delimiter and then another field that requires quotes.
let (_, n) = wtr.delimiter(&mut out[nout..]);
nout += n;
let (_, _, n) = wtr.field(&b"bar,baz"[..], &mut out[nout..]);
nout += n;
let (_, n) = wtr.terminator(&mut out[nout..]);
nout += n;

// Now write another record.
let (_, _, n) = wtr.field(&b"a \"b\" c"[..], &mut out[nout..]);
nout += n;
let (_, n) = wtr.delimiter(&mut out[nout..]);
nout += n;
let (_, _, n) = wtr.field(&b"quux"[..], &mut out[nout..]);
nout += n;

// We must always call finish once done writing.
// This ensures that any closing quotes are written.
let (_, n) = wtr.finish(&mut out[nout..]);
nout += n;

assert_eq!(&out[..nout], &b"\
foo,\"bar,baz\"
\"a \"\"b\"\" c\",quux"[..]);
```
csv-core-0.1.6/UNLICENSE010064400017500000144000000022731313170244000127230ustar0000000000000000
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or
distribute this software, either in source code form or as a compiled
binary, for any purpose, commercial or non-commercial, and by any
means.

In jurisdictions that recognize copyright laws, the author or authors
of this software dedicate any and all copyright interest in the
software to the public domain. We make this dedication for the benefit
of the public at large and to the detriment of our heirs and
successors. We intend this dedication to be an overt act of
relinquishment in perpetuity of all present and future rights to this
software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>
csv-core-0.1.6/benches/bench.rs010064400017500000144000000062021350500243100145010ustar0000000000000000
#![feature(test)]

extern crate test;

use test::Bencher;

use csv_core::{Reader, ReaderBuilder};

static NFL: &'static str = include_str!("../../examples/data/bench/nfl.csv");
static GAME: &'static str =
    include_str!("../../examples/data/bench/game.csv");
static POP: &'static str =
    include_str!("../../examples/data/bench/worldcitiespop.csv");
static MBTA: &'static str =
    include_str!("../../examples/data/bench/gtfs-mbta-stop-times.csv");

macro_rules! bench {
    ($name:ident, $data:ident, $counter:ident, $result:expr) => {
        bench!($name, $data, $counter, $result, false);
    };
    ($name:ident, $data:ident, $counter:ident, $result:expr, NFA) => {
        bench!($name, $data, $counter, $result, true);
    };
    ($name:ident, $data:ident, $counter:ident, $result:expr, $nfa:expr) => {
        #[bench]
        fn $name(b: &mut Bencher) {
            let data = $data.as_bytes();
            b.bytes = data.len() as u64;
            let mut rdr = ReaderBuilder::new().nfa($nfa).build();
            b.iter(|| {
                rdr.reset();
                assert_eq!($counter(&mut rdr, data), $result);
            })
        }
    };
}

bench!(count_nfl_field_copy_dfa, NFL, count_fields, 130000);
bench!(count_nfl_field_copy_nfa, NFL, count_fields, 130000, NFA);
bench!(count_nfl_record_copy_dfa, NFL, count_records, 10000);
bench!(count_nfl_record_copy_nfa, NFL, count_records, 10000, NFA);
bench!(count_game_field_copy_dfa, GAME, count_fields, 600000);
bench!(count_game_field_copy_nfa, GAME, count_fields, 600000, NFA);
bench!(count_game_record_copy_dfa, GAME, count_records, 100000);
bench!(count_game_record_copy_nfa, GAME, count_records, 100000, NFA);
bench!(count_pop_field_copy_dfa, POP, count_fields, 140007);
bench!(count_pop_field_copy_nfa, POP, count_fields, 140007, NFA);
bench!(count_pop_record_copy_dfa, POP, count_records, 20001);
bench!(count_pop_record_copy_nfa, POP, count_records, 20001, NFA);
bench!(count_mbta_field_copy_dfa, MBTA, count_fields, 90000);
bench!(count_mbta_field_copy_nfa, MBTA, count_fields, 90000, NFA);
bench!(count_mbta_record_copy_dfa, MBTA, count_records, 10000);
bench!(count_mbta_record_copy_nfa, MBTA, count_records, 10000, NFA);

fn count_fields(rdr: &mut Reader, mut data: &[u8]) -> u64 {
    use csv_core::ReadFieldResult::*;

    let mut count = 0;
    let mut field = [0u8; 1024];
    loop {
        let (res, nin, _) = rdr.read_field(data, &mut field);
        data = &data[nin..];
        match res {
            InputEmpty => {}
            OutputFull => panic!("field too large"),
            Field { .. } => {
                count += 1;
            }
            End => break,
        }
    }
    count
}

fn count_records(rdr: &mut Reader, mut data: &[u8]) -> u64 {
    use csv_core::ReadRecordResult::*;

    let mut count = 0;
    let mut record = [0; 8192];
    let mut ends = [0; 32];
    loop {
        let (res, nin, _, _) = rdr.read_record(data, &mut record, &mut ends);
        data = &data[nin..];
        match res {
            InputEmpty => {}
            OutputFull | OutputEndsFull => panic!("field too large"),
            Record => count += 1,
            End => break,
        }
    }
    count
}
csv-core-0.1.6/src/lib.rs010064400017500000144000000124671350500243100133620ustar0000000000000000
/*!
`csv-core` provides a fast CSV reader and writer for use in a `no_std`
context.

This crate will never use the standard library. `no_std` support is therefore
enabled by default.

If you're looking for more ergonomic CSV parsing routines, please use the
[`csv`](https://docs.rs/csv) crate.

# Overview

This crate has two primary APIs. The `Reader` API provides a CSV parser, and
the `Writer` API provides a CSV writer.

# Example: reading CSV

This example shows how to count the number of fields and records in CSV data.

```
use csv_core::{Reader, ReadFieldResult};

let data = "
foo,bar,baz
a,b,c
xxx,yyy,zzz
";

let mut rdr = Reader::new();
let mut bytes = data.as_bytes();
let mut count_fields = 0;
let mut count_records = 0;
loop {
    // We skip handling the output since we don't need it for counting.
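    // When `bytes` is exhausted, the empty slice passed on the next
    // iteration signals end-of-input, and the loop then exits via
    // `ReadFieldResult::End`.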
    let (result, nin, _) = rdr.read_field(bytes, &mut [0; 1024]);
    bytes = &bytes[nin..];
    match result {
        ReadFieldResult::InputEmpty => {},
        ReadFieldResult::OutputFull => panic!("field too large"),
        ReadFieldResult::Field { record_end } => {
            count_fields += 1;
            if record_end {
                count_records += 1;
            }
        }
        ReadFieldResult::End => break,
    }
}
assert_eq!(3, count_records);
assert_eq!(9, count_fields);
```

# Example: writing CSV

This example shows how to use the `Writer` API to write valid CSV data. Proper
quoting is handled automatically.

```
use csv_core::Writer;

// This is where we'll write out CSV data.
let mut out = &mut [0; 1024];
// The number of bytes we've written to `out`.
let mut nout = 0;

// Create a CSV writer with a default configuration.
let mut wtr = Writer::new();

// Write a single field. Note that we ignore the `WriteResult` and the number
// of input bytes consumed since we're doing this by hand.
let (_, _, n) = wtr.field(&b"foo"[..], &mut out[nout..]);
nout += n;

// Write a delimiter and then another field that requires quotes.
let (_, n) = wtr.delimiter(&mut out[nout..]);
nout += n;
let (_, _, n) = wtr.field(&b"bar,baz"[..], &mut out[nout..]);
nout += n;
let (_, n) = wtr.terminator(&mut out[nout..]);
nout += n;

// Now write another record.
let (_, _, n) = wtr.field(&b"a \"b\" c"[..], &mut out[nout..]);
nout += n;
let (_, n) = wtr.delimiter(&mut out[nout..]);
nout += n;
let (_, _, n) = wtr.field(&b"quux"[..], &mut out[nout..]);
nout += n;

// We must always call finish once done writing.
// This ensures that any closing quotes are written.
let (_, n) = wtr.finish(&mut out[nout..]);
nout += n;

assert_eq!(&out[..nout], &b"\
foo,\"bar,baz\"
\"a \"\"b\"\" c\",quux"[..]);
```
*/

#![deny(missing_docs)]
#![no_std]

pub use crate::reader::{
    ReadFieldNoCopyResult, ReadFieldResult, ReadRecordNoCopyResult,
    ReadRecordResult, Reader, ReaderBuilder,
};
pub use crate::writer::{
    is_non_numeric, quote, WriteResult, Writer, WriterBuilder,
};

mod reader;
mod writer;

/// A record terminator.
///
/// Use this to specify the record terminator while parsing CSV. The default
/// is CRLF, which treats `\r`, `\n` or `\r\n` as a single record terminator.
#[derive(Clone, Copy, Debug)]
pub enum Terminator {
    /// Parses `\r`, `\n` or `\r\n` as a single record terminator.
    CRLF,
    /// Parses the byte given as a record terminator.
    Any(u8),
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    #[doc(hidden)]
    __Nonexhaustive,
}

impl Terminator {
    /// Checks whether the terminator is set to CRLF.
    fn is_crlf(&self) -> bool {
        match *self {
            Terminator::CRLF => true,
            Terminator::Any(_) => false,
            _ => unreachable!(),
        }
    }

    fn equals(&self, other: u8) -> bool {
        match *self {
            Terminator::CRLF => other == b'\r' || other == b'\n',
            Terminator::Any(b) => other == b,
            _ => unreachable!(),
        }
    }
}

impl Default for Terminator {
    fn default() -> Terminator {
        Terminator::CRLF
    }
}

/// The quoting style to use when writing CSV data.
#[derive(Clone, Copy, Debug)]
pub enum QuoteStyle {
    /// This puts quotes around every field. Always.
    Always,
    /// This puts quotes around fields only when necessary.
    ///
    /// They are necessary when fields contain a quote, delimiter or record
    /// terminator. Quotes are also necessary when writing an empty record
    /// (which is indistinguishable from a record with one empty field).
    ///
    /// This is the default.
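    ///
    /// For example, with this style the field `bar,baz` is written as
    /// `"bar,baz"` since it contains the delimiter, while a field like
    /// `foo` is written without quotes.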
    Necessary,
    /// This puts quotes around all fields that are non-numeric. Namely, when
    /// writing a field that does not parse as a valid float or integer, then
    /// quotes will be used even if they aren't strictly necessary.
    NonNumeric,
    /// This *never* writes quotes, even if it would produce invalid CSV
    /// data.
    Never,
    /// Hints that destructuring should not be exhaustive.
    ///
    /// This enum may grow additional variants, so this makes sure clients
    /// don't count on exhaustive matching. (Otherwise, adding a new variant
    /// could break existing code.)
    #[doc(hidden)]
    __Nonexhaustive,
}

impl Default for QuoteStyle {
    fn default() -> QuoteStyle {
        QuoteStyle::Necessary
    }
}
csv-core-0.1.6/src/reader.rs010064400017500000144000002065551350500243100140550ustar0000000000000000
use core::fmt;

use crate::Terminator;

// BE ADVISED
//
// This may just be one of the more complicated CSV parsers you'll come
// across. The implementation never allocates and consists of both a
// functional NFA parser and a DFA parser. The DFA parser is the work horse
// and we could elide much of the work involved in making the NFA parser
// work, but the NFA parser is much easier to debug. The NFA parser is tested
// alongside the DFA parser, so they should never be out of sync.
//
// The basic structure of the implementation is to encode the NFA parser as
// an explicit state machine in code. The DFA is then generated by populating
// a transition table on the stack by exhaustively enumerating all possible
// states on all possible inputs (this is possible because the number of
// states and the number of inputs is very small).
//
// Note that some pieces of the NFA parser (such as the NFA state machine)
// are required. In particular, the translation from the NFA to the DFA
// depends on the configuration of the CSV parser as given by the caller,
// and indeed, this is one of the key performance benefits of the DFA: it
// doesn't have any overhead (other than a bigger transition table)
// associated with the number of configuration options.
//
// ADVICE FOR HACKERS
//
// This code is too clever for its own good. As such, changes to some parts
// of the code may have a non-obvious impact on other parts. This is mostly
// motivated by trying to keep the DFA transition table as small as possible,
// since it is stored on the stack. Here are some tips that may save you some
// time:
//
// * If you add a new NFA state, then you also need to consider how it
//   impacts the DFA. If all of the incoming transitions into an NFA state
//   are epsilon transitions, then it probably isn't materialized in the DFA.
//   If the NFA state indicates that a field or a record has been parsed,
//   then it should be considered final. Let the comments in `NfaState` be
//   your guide.
// * If you add a new configuration knob to the parser, then you may need to
//   modify the `TRANS_CLASSES` constant below. The `TRANS_CLASSES` constant
//   indicates the total number of discriminating bytes in the DFA. And if
//   you modify `TRANS_CLASSES`, you probably also need to modify `build_dfa`
//   to add a new class. For example, in order to add parsing support for
//   comments, I bumped `TRANS_CLASSES` from `6` to `7` and added the comment
//   byte (if one exists) to the list of classes in `build_dfa`.
// * The special DFA start state doubles as the final state once all input
//   from the caller has been exhausted. We must be careful to guard this
//   case analysis on whether the input is actually exhausted, since the
//   start state is an otherwise valid state.
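// As a concrete instance of the sizing trade-off (assuming the default
// configuration): `build_dfa` registers the delimiter, the quote, CR and
// LF, which together with the catch-all class gives 5 equivalence classes,
// so the DFA only occupies 10 * 5 = 50 slots of its 70-slot table.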
/// A pull based CSV reader.
///
/// This reader parses CSV data using a finite state machine. Callers can
/// extract parsed data incrementally using one of the `read` methods.
///
/// Note that this CSV reader is somewhat encoding agnostic. The source data
/// needs to be at least ASCII compatible. There is no support for specifying
/// the full gamut of Unicode delimiters/terminators/quotes/escapes. Instead,
/// any byte can be used, although callers probably want to stick to the
/// ASCII subset (`<= 0x7F`).
///
/// # Usage
///
/// A reader has two different ways to read CSV data, each with their own
/// trade offs.
///
/// * `read_field` - Copies a single CSV field into an output buffer while
///   unescaping quotes. This is simple to use and doesn't require storing an
///   entire record contiguously in memory, but it is slower.
/// * `read_record` - Copies an entire CSV record into an output buffer while
///   unescaping quotes. The ending positions of each field are copied into
///   an additional buffer. This is harder to use and requires larger output
///   buffers, but it is faster than `read_field` since it amortizes more
///   costs.
///
/// # RFC 4180
///
/// [RFC 4180](https://tools.ietf.org/html/rfc4180)
/// is the closest thing to a specification for CSV data. Unfortunately,
/// CSV data that is seen in the wild can vary significantly. Often, the CSV
/// data is outright invalid. Instead of fixing the producers of bad CSV
/// data, we have seen fit to make consumers much more flexible in what they
/// accept. This reader continues that tradition, and therefore, isn't
/// technically compliant with RFC 4180. In particular, this reader will
/// never return an error and will always find *a* parse.
///
/// Here are some detailed differences from RFC 4180:
///
/// * CRLF, LF and CR are each treated as a single record terminator by
///   default.
/// * Records are permitted to be of varying length.
/// * Empty lines (that do not include other whitespace) are ignored.
#[derive(Clone, Debug)]
pub struct Reader {
    /// A table-based DFA for parsing CSV.
    dfa: Dfa,
    /// The current DFA state, if the DFA is used.
    dfa_state: DfaState,
    /// The current NFA state, if the NFA is used.
    nfa_state: NfaState,
    /// The delimiter that separates fields.
    delimiter: u8,
    /// The terminator that separates records.
    term: Terminator,
    /// The quotation byte.
    quote: u8,
    /// Whether to recognize escaped quotes.
    escape: Option<u8>,
    /// Whether to recognize doubled quotes.
    double_quote: bool,
    /// If enabled, lines beginning with this byte are ignored.
    comment: Option<u8>,
    /// If enabled (the default), then quotes are respected. When disabled,
    /// quotes are not treated specially.
    quoting: bool,
    /// Whether to use the NFA for parsing.
    ///
    /// Generally this is for debugging. There's otherwise no good reason
    /// to avoid the DFA.
    use_nfa: bool,
    /// The current line number.
    line: u64,
    /// Whether this parser has ever read anything.
    has_read: bool,
    /// The current position in the output buffer when reading a record.
    output_pos: usize,
}

impl Default for Reader {
    fn default() -> Reader {
        Reader {
            dfa: Dfa::new(),
            dfa_state: DfaState::start(),
            nfa_state: NfaState::StartRecord,
            delimiter: b',',
            term: Terminator::default(),
            quote: b'"',
            escape: None,
            double_quote: true,
            comment: None,
            quoting: true,
            use_nfa: false,
            line: 1,
            has_read: false,
            output_pos: 0,
        }
    }
}

/// Builds a CSV reader with various configuration knobs.
///
/// This builder can be used to tweak the field delimiter, record terminator
/// and more for parsing CSV. Once a CSV `Reader` is built, its configuration
/// cannot be changed.
#[derive(Debug, Default)]
pub struct ReaderBuilder {
    rdr: Reader,
}

impl ReaderBuilder {
    /// Create a new builder.
    pub fn new() -> ReaderBuilder {
        ReaderBuilder::default()
    }

    /// Build a CSV parser from this configuration.
    pub fn build(&self) -> Reader {
        let mut rdr = self.rdr.clone();
        rdr.build_dfa();
        rdr
    }

    /// The field delimiter to use when parsing CSV.
    ///
    /// The default is `b','`.
    pub fn delimiter(&mut self, delimiter: u8) -> &mut ReaderBuilder {
        self.rdr.delimiter = delimiter;
        self
    }

    /// The record terminator to use when parsing CSV.
    ///
    /// A record terminator can be any single byte. The default is a special
    /// value, `Terminator::CRLF`, which treats any occurrence of `\r`, `\n`
    /// or `\r\n` as a single record terminator.
    pub fn terminator(&mut self, term: Terminator) -> &mut ReaderBuilder {
        self.rdr.term = term;
        self
    }

    /// The quote character to use when parsing CSV.
    ///
    /// The default is `b'"'`.
    pub fn quote(&mut self, quote: u8) -> &mut ReaderBuilder {
        self.rdr.quote = quote;
        self
    }

    /// The escape character to use when parsing CSV.
    ///
    /// In some variants of CSV, quotes are escaped using a special escape
    /// character like `\` (instead of escaping quotes by doubling them).
    ///
    /// By default, recognizing these idiosyncratic escapes is disabled.
    pub fn escape(&mut self, escape: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.escape = escape;
        self
    }

    /// Enable double quote escapes.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// doubled quotes are not interpreted as escapes.
    pub fn double_quote(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.double_quote = yes;
        self
    }

    /// Enable or disable quoting.
    ///
    /// This is enabled by default, but it may be disabled. When disabled,
    /// quotes are not treated specially.
    pub fn quoting(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.quoting = yes;
        self
    }

    /// The comment character to use when parsing CSV.
    ///
    /// If the start of a record begins with the byte given here, then that
    /// line is ignored by the CSV parser.
    ///
    /// This is disabled by default.
    pub fn comment(&mut self, comment: Option<u8>) -> &mut ReaderBuilder {
        self.rdr.comment = comment;
        self
    }

    /// A convenience method for specifying a configuration to read ASCII
    /// delimited text.
    ///
    /// This sets the delimiter and record terminator to the ASCII unit
    /// separator (`\x1F`) and record separator (`\x1E`), respectively.
    pub fn ascii(&mut self) -> &mut ReaderBuilder {
        self.delimiter(b'\x1F').terminator(Terminator::Any(b'\x1E'))
    }

    /// Enable or disable the NFA for parsing CSV.
    ///
    /// This is intended to be an option useful for debugging. The NFA is
    /// always slower than the DFA.
    #[doc(hidden)]
    pub fn nfa(&mut self, yes: bool) -> &mut ReaderBuilder {
        self.rdr.use_nfa = yes;
        self
    }
}

/// The result of parsing at most one field from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is
    /// provided by the caller.
    End,
}

impl ReadFieldResult {
    fn from_nfa(
        state: NfaState,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        match state {
            NfaState::End => ReadFieldResult::End,
            NfaState::EndRecord | NfaState::CRLF => {
                ReadFieldResult::Field { record_end: true }
            }
            NfaState::EndFieldDelim => {
                ReadFieldResult::Field { record_end: false }
            }
            _ => {
                assert!(!state.is_field_final());
                if !inpdone && outdone {
                    ReadFieldResult::OutputFull
                } else {
                    ReadFieldResult::InputEmpty
                }
            }
        }
    }
}

/// The result of parsing at most one field from CSV data while ignoring the
/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadFieldNoCopyResult {
    /// The caller provided input was exhausted before the end of a field or
    /// record was found.
    InputEmpty,
    /// The end of a field was found.
    ///
    /// Note that when `record_end` is true, then the end of this field also
    /// corresponds to the end of a record.
    Field {
        /// Whether this was the last field in a record or not.
        record_end: bool,
    },
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is
    /// provided by the caller.
    End,
}

/// The result of parsing at most one record from CSV data.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordResult {
    /// The caller provided input was exhausted before the end of a record
    /// was found.
    InputEmpty,
    /// The caller provided output buffer was filled before an entire field
    /// could be written to it.
    OutputFull,
    /// The caller provided output buffer of field end positions was filled
    /// before the next field could be parsed.
    OutputEndsFull,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is
    /// provided by the caller.
    End,
}

impl ReadRecordResult {
    fn is_record(&self) -> bool {
        *self == ReadRecordResult::Record
    }

    fn from_nfa(
        state: NfaState,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        match state {
            NfaState::End => ReadRecordResult::End,
            NfaState::EndRecord | NfaState::CRLF => ReadRecordResult::Record,
            _ => {
                assert!(!state.is_record_final());
                if !inpdone && outdone {
                    ReadRecordResult::OutputFull
                } else if !inpdone && endsdone {
                    ReadRecordResult::OutputEndsFull
                } else {
                    ReadRecordResult::InputEmpty
                }
            }
        }
    }
}

/// The result of parsing at most one record from CSV data while ignoring
/// output.
#[derive(Clone, Debug, Eq, PartialEq)]
pub enum ReadRecordNoCopyResult {
    /// The caller provided input was exhausted before the end of a record
    /// was found.
    InputEmpty,
    /// The end of a record was found.
    Record,
    /// All CSV data has been read.
    ///
    /// This state can only be returned when an empty input buffer is
    /// provided by the caller.
    End,
}

/// What should be done with input bytes during an NFA transition.
#[derive(Clone, Debug, Eq, PartialEq)]
enum NfaInputAction {
    // Do not consume an input byte.
    Epsilon,
    // Copy input byte to a caller-provided output buffer.
    CopyToOutput,
    // Consume but do not copy input byte (for example, seeing a field
    // delimiter will consume an input byte but should not copy it to the
    // output buffer).
    Discard,
}

/// An NFA state is a state that can be visited in the NFA parser.
///
/// Given the simplicity of the machine, a subset of NFA states double as
/// DFA states. NFA states that only have incoming epsilon transitions are
/// optimized out when converting the machine to a DFA.
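// For example, `EndFieldTerm`, `InRecordTerm` and `End` below are only ever
// entered through epsilon transitions, which is why they are absent from
// `NFA_STATES` and are assigned discriminants outside the DFA's range.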
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
enum NfaState {
    // These states aren't used in the DFA, so we
    // assign them meaningless numbers.
    EndFieldTerm = 200,
    InRecordTerm = 201,
    End = 202,

    // All states below are DFA states.
    StartRecord = 0,
    StartField = 1,
    InField = 2,
    InQuotedField = 3,
    InEscapedQuote = 4,
    InDoubleEscapedQuote = 5,
    InComment = 6,
    // All states below are "final field" states.
    // Namely, they indicate that a field has been parsed.
    EndFieldDelim = 7,
    // All states below are "final record" states.
    // Namely, they indicate that a record has been parsed.
    EndRecord = 8,
    CRLF = 9,
}

/// A list of NFA states that have an explicit representation in the DFA.
const NFA_STATES: &'static [NfaState] = &[
    NfaState::StartRecord,
    NfaState::StartField,
    NfaState::EndFieldDelim,
    NfaState::InField,
    NfaState::InQuotedField,
    NfaState::InEscapedQuote,
    NfaState::InDoubleEscapedQuote,
    NfaState::InComment,
    NfaState::EndRecord,
    NfaState::CRLF,
];

impl NfaState {
    /// Returns true if this state indicates that a field has been parsed.
    fn is_field_final(&self) -> bool {
        match *self {
            NfaState::End
            | NfaState::EndRecord
            | NfaState::CRLF
            | NfaState::EndFieldDelim => true,
            _ => false,
        }
    }

    /// Returns true if this state indicates that a record has been parsed.
    fn is_record_final(&self) -> bool {
        match *self {
            NfaState::End | NfaState::EndRecord | NfaState::CRLF => true,
            _ => false,
        }
    }
}

impl Reader {
    /// Create a new CSV reader with a default parser configuration.
    pub fn new() -> Reader {
        ReaderBuilder::new().build()
    }

    /// Reset the parser such that it behaves as if it had never been used.
    ///
    /// This may be useful when reading CSV data in a random access pattern.
    pub fn reset(&mut self) {
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.nfa_state = NfaState::StartRecord;
        self.line = 1;
        self.has_read = false;
    }

    /// Return the current line number as measured by the number of
    /// occurrences of `\n`.
    ///
    /// Line numbers start at `1` and are reset when `reset` is called.
    pub fn line(&self) -> u64 {
        self.line
    }

    /// Set the line number.
    ///
    /// This is useful after a call to `reset` where the caller knows the
    /// line number from some additional context.
    pub fn set_line(&mut self, line: u64) {
        self.line = line;
    }

    /// Parse a single CSV field in `input` and copy field data to `output`.
    ///
    /// This routine requires a caller provided buffer of CSV data as the
    /// `input` and a caller provided buffer, `output`, in which to store
    /// field data extracted from `input`. The field data copied to `output`
    /// will have its quotes unescaped.
    ///
    /// Calling this routine parses at most a single field and returns
    /// three values indicating the state of the parser. The first value, a
    /// `ReadFieldResult`, tells the caller what to do next. For example, if
    /// the entire input was read or if the output buffer was filled before
    /// a full field had been read, then `ReadFieldResult::InputEmpty` or
    /// `ReadFieldResult::OutputFull` is returned, respectively. See the
    /// documentation for `ReadFieldResult` for more details.
    ///
    /// The other two values returned correspond to the number of bytes
    /// read from `input` and written to `output`, respectively.
    ///
    /// # Termination
    ///
    /// This reader interprets an empty `input` buffer as an indication that
    /// there is no CSV data left to read. Namely, when the caller has
    /// exhausted all CSV data, the caller should continue to call `read`
    /// with an empty input buffer until `ReadFieldResult::End` is returned.
    ///
    /// # Errors
    ///
    /// This CSV reader can never return an error. Instead, it prefers *a*
    /// parse over *no* parse.
    pub fn read_field(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        let (input, bom_nin) = self.strip_utf8_bom(input);
        let (res, nin, nout) = if self.use_nfa {
            self.read_field_nfa(input, output)
        } else {
            self.read_field_dfa(input, output)
        };
        self.has_read = true;
        (res, nin + bom_nin, nout)
    }

    /// Parse a single CSV record in `input` and copy each field
    /// contiguously to `output`, with the end position of each field
    /// written to `ends`.
    ///
    /// **NOTE**: This method is more cumbersome to use than `read_field`,
    /// but it can be faster since it amortizes more work.
    ///
    /// This routine requires a caller provided buffer of CSV data as the
    /// `input` and two caller provided buffers to store the unescaped
    /// field data (`output`) and the end position of each field in the
    /// record (`ends`).
    ///
    /// Calling this routine parses at most a single record and returns
    /// four values indicating the state of the parser. The first value, a
    /// `ReadRecordResult`, tells the caller what to do next. For example,
    /// if the entire input was read or if the output buffer was filled
    /// before a full field had been read, then
    /// `ReadRecordResult::InputEmpty` or `ReadRecordResult::OutputFull` is
    /// returned, respectively. Similarly, if the `ends` buffer is full,
    /// then `ReadRecordResult::OutputEndsFull` is returned. See the
    /// documentation for `ReadRecordResult` for more details.
    ///
    /// The other three values correspond to the number of bytes read from
    /// `input`, the number of bytes written to `output` and the number of
    /// end positions written to `ends`, respectively.
    ///
    /// The end positions written to `ends` are constructed as if there was
    /// a single contiguous buffer in memory containing the entire row, even
    /// if `ReadRecordResult::OutputFull` was returned in the middle of
    /// reading a row.
    ///
    /// # Termination
    ///
    /// This reader interprets an empty `input` buffer as an indication that
    /// there is no CSV data left to read. Namely, when the caller has
    /// exhausted all CSV data, the caller should continue to call `read`
    /// with an empty input buffer until `ReadRecordResult::End` is
    /// returned.
    ///
    /// # Errors
    ///
    /// This CSV reader can never return an error. Instead, it prefers *a*
    /// parse over *no* parse.
    pub fn read_record(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        let (input, bom_nin) = self.strip_utf8_bom(input);
        let (res, nin, nout, nend) = if self.use_nfa {
            self.read_record_nfa(input, output, ends)
        } else {
            self.read_record_dfa(input, output, ends)
        };
        self.has_read = true;
        (res, nin + bom_nin, nout, nend)
    }

    /// Strip off a possible UTF-8 BOM at the start of a file. Quick note
    /// that this method will fail to strip off the BOM if only part of the
    /// BOM is buffered. Hopefully that won't happen very often.
    fn strip_utf8_bom<'a>(&self, input: &'a [u8]) -> (&'a [u8], usize) {
        let (input, nin) = if !self.has_read
            && input.len() >= 3
            && &input[0..3] == b"\xef\xbb\xbf"
        {
            (&input[3..], 3)
        } else {
            (input, 0)
        };
        (input, nin)
    }

    #[inline(always)]
    fn read_record_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            let s = self.transition_final_dfa(self.dfa_state);
            let res =
                self.dfa.new_read_record_result(s, true, false, false, false);
            // This part is a little tricky. When reading the final record,
            // the last result the caller will get is an InputEmpty, and
            // while they'll have everything they need in `output`, they'll
            // be missing the final end position of the final field in
            // `ends`. We insert that here, but we must take care to handle
            // the case where `ends` doesn't have enough space. If it
            // doesn't have enough space, then we also can't transition to
            // the next state.
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.dfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.dfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        let (mut nin, mut nout, mut nend) = (0, 0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, has_out) = self.dfa.get_output(state, input[nin]);
            self.line += (input[nin] == b'\n') as u64;
            state = s;
            if has_out {
                output[nout] = input[nin];
                nout += 1;
            }
            nin += 1;
            if state >= self.dfa.final_field {
                ends[nend] = self.output_pos + nout;
                nend += 1;
                if state > self.dfa.final_field {
                    break;
                }
            }
            if state == self.dfa.in_field || state == self.dfa.in_quoted {
                self.dfa
                    .classes
                    .scan_and_copy(input, &mut nin, output, &mut nout);
            }
        }
        let res = self.dfa.new_read_record_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.dfa_state = state;
        if res.is_record() {
            self.output_pos = 0;
        } else {
            self.output_pos += nout;
        }
        (res, nin, nout, nend)
    }

    #[inline(always)]
    fn read_field_dfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        if input.is_empty() {
            self.dfa_state = self.transition_final_dfa(self.dfa_state);
            let res = self.dfa.new_read_field_result(
                self.dfa_state,
                true,
                false,
                false,
            );
            return (res, 0, 0);
        }
        if output.is_empty() {
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.dfa_state;
        while nin < input.len() && nout < output.len() {
            let b = input[nin];
            self.line += (b == b'\n') as u64;
            let (s, has_out) = self.dfa.get_output(state, b);
            state = s;
            if has_out {
                output[nout] = b;
                nout += 1;
            }
            nin += 1;
            if state >= self.dfa.final_field {
                break;
            }
        }
        let res = self.dfa.new_read_field_result(
            state,
            false,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.dfa_state = state;
        (res, nin, nout)
    }

    /// Perform the final state transition, i.e., when the caller indicates
    /// that the input has been exhausted.
    fn transition_final_dfa(&self, state: DfaState) -> DfaState {
        // If we've already emitted a record or think we're ready to start
        // parsing a new record, then we should sink into the final state
        // and never move from there. (pro-tip: the start state doubles as
        // the final state!)
        if state >= self.dfa.final_record || state.is_start() {
            self.dfa.new_state_final_end()
        } else {
            self.dfa.new_state_final_record()
        }
    }

    /// Write the transition tables for the DFA based on this parser's
    /// configuration.
    fn build_dfa(&mut self) {
        // A naive DFA transition table has
        // `cells = (# number of states) * (# size of alphabet)`. While we
        // could get away with that, the table would have `10 * 256 = 2560`
        // entries. Even worse, in order to avoid a multiplication
        // instruction when computing the next transition, we store the
        // starting index of each state's row, which would not be
        // representable in a single byte. So we'd need a `u16`, which
        // doubles our transition table size to ~5KB. This is a lot to put
        // on the stack, even though it probably fits in the L1 cache of
        // most modern CPUs.
        //
        // To avoid this, we note that while our "true" alphabet
        // has 256 distinct possibilities, the DFA itself is only
        // discriminatory on a very small subset of that alphabet. For
        // example, assuming neither `a` nor `b` are set as special
        // quote/comment/escape/delimiter/terminator bytes, they are
        // otherwise indistinguishable to the DFA, so it would be OK to
        // treat them as if they were equivalent. That is, they are in the
        // same equivalence class.
        //
        // As it turns out, using this logic, we can shrink our effective
        // alphabet down to 7 equivalence classes:
        //
        //   1. The field delimiter.
        //   2. The record terminator.
        //   3. If the record terminator is CRLF, then CR and LF are
        //      distinct equivalence classes.
        //   4. The quote byte.
        //   5. The escape byte.
        //   6. The comment byte.
        //   7. Everything else.
        //
        // We add those equivalence classes here. If more configuration
        // knobs are added to the parser with more discriminating bytes,
        // then this logic will need to be adjusted further.
        //
        // Even though this requires an extra bit of indirection when
        // computing the next transition, microbenchmarks say that it
        // doesn't make much of a difference. Perhaps because everything
        // fits into the L1 cache.
        self.dfa.classes.add(self.delimiter);
        if self.quoting {
            self.dfa.classes.add(self.quote);
            if let Some(escape) = self.escape {
                self.dfa.classes.add(escape);
            }
        }
        if let Some(comment) = self.comment {
            self.dfa.classes.add(comment);
        }
        match self.term {
            Terminator::Any(b) => self.dfa.classes.add(b),
            Terminator::CRLF => {
                self.dfa.classes.add(b'\r');
                self.dfa.classes.add(b'\n');
            }
            _ => unreachable!(),
        }
        // Build the DFA transition table by computing the DFA state for
        // all possible combinations of state and input byte.
        for &state in NFA_STATES {
            for c in (0..256).map(|c| c as u8) {
                let mut nfa_result = (state, NfaInputAction::Epsilon);
                // Consume NFA states until we hit a non-epsilon transition.
                while nfa_result.0 != NfaState::End
                    && nfa_result.1 == NfaInputAction::Epsilon
                {
                    nfa_result = self.transition_nfa(nfa_result.0, c);
                }
                let from = self.dfa.new_state(state);
                let to = self.dfa.new_state(nfa_result.0);
                self.dfa.set(
                    from,
                    c,
                    to,
                    nfa_result.1 == NfaInputAction::CopyToOutput,
                );
            }
        }
        self.dfa_state = self.dfa.new_state(NfaState::StartRecord);
        self.dfa.finish();
    }

    // The NFA implementation follows. The transition_final_nfa and
    // transition_nfa methods are required for the DFA to operate. The
    // rest are included for completeness (and debugging). Note that this
    // NFA implementation is included in most of the CSV parser tests below.
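    //
    // For example, feeding the byte `b'a'` (under the default
    // configuration) to `StartRecord` first follows an epsilon transition
    // to `StartField` without consuming anything, and only then consumes
    // the byte via `(InField, NfaInputAction::CopyToOutput)`.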
    #[inline(always)]
    fn read_record_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
        ends: &mut [usize],
    ) -> (ReadRecordResult, usize, usize, usize) {
        if input.is_empty() {
            let s = self.transition_final_nfa(self.nfa_state);
            let res = ReadRecordResult::from_nfa(s, false, false, false);
            return match res {
                ReadRecordResult::Record => {
                    if ends.is_empty() {
                        return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
                    }
                    self.nfa_state = s;
                    ends[0] = self.output_pos;
                    self.output_pos = 0;
                    (res, 0, 0, 1)
                }
                _ => {
                    self.nfa_state = s;
                    (res, 0, 0, 0)
                }
            };
        }
        if output.is_empty() {
            return (ReadRecordResult::OutputFull, 0, 0, 0);
        }
        if ends.is_empty() {
            return (ReadRecordResult::OutputEndsFull, 0, 0, 0);
        }
        let (mut nin, mut nout, mut nend) = (0, self.output_pos, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() && nend < ends.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                NfaInputAction::Epsilon => {}
            }
            state = s;
            if state.is_field_final() {
                ends[nend] = nout;
                nend += 1;
                if state != NfaState::EndFieldDelim {
                    break;
                }
            }
        }
        let res = ReadRecordResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
            nend >= ends.len(),
        );
        self.nfa_state = state;
        self.output_pos = if res.is_record() { 0 } else { nout };
        (res, nin, nout, nend)
    }

    #[inline(always)]
    fn read_field_nfa(
        &mut self,
        input: &[u8],
        output: &mut [u8],
    ) -> (ReadFieldResult, usize, usize) {
        if input.is_empty() {
            self.nfa_state = self.transition_final_nfa(self.nfa_state);
            let res = ReadFieldResult::from_nfa(self.nfa_state, false, false);
            return (res, 0, 0);
        }
        if output.is_empty() {
            // If the output buffer is empty, then we can never make
            // progress, so just quit now.
            return (ReadFieldResult::OutputFull, 0, 0);
        }
        let (mut nin, mut nout) = (0, 0);
        let mut state = self.nfa_state;
        while nin < input.len() && nout < output.len() {
            let (s, io) = self.transition_nfa(state, input[nin]);
            match io {
                NfaInputAction::CopyToOutput => {
                    output[nout] = input[nin];
                    nout += 1;
                    nin += 1;
                }
                NfaInputAction::Discard => {
                    nin += 1;
                }
                NfaInputAction::Epsilon => (),
            }
            state = s;
            if state.is_field_final() {
                break;
            }
        }
        let res = ReadFieldResult::from_nfa(
            state,
            nin >= input.len(),
            nout >= output.len(),
        );
        self.nfa_state = state;
        (res, nin, nout)
    }

    /// Compute the final NFA transition after all caller-provided input
    /// has been exhausted.
    #[inline(always)]
    fn transition_final_nfa(&self, state: NfaState) -> NfaState {
        use self::NfaState::*;
        match state {
            End | StartRecord | EndRecord | InComment | CRLF => End,
            StartField | EndFieldDelim | EndFieldTerm | InField
            | InQuotedField | InEscapedQuote | InDoubleEscapedQuote
            | InRecordTerm => EndRecord,
        }
    }

    /// Compute the next NFA state given the current NFA state and the
    /// current input byte.
    ///
    /// This returns the next NFA state along with an NfaInputAction that
    /// indicates what should be done with the input byte (nothing for an
    /// epsilon transition, copied to a caller provided output buffer, or
    /// discarded).
    #[inline(always)]
    fn transition_nfa(
        &self,
        state: NfaState,
        c: u8,
    ) -> (NfaState, NfaInputAction) {
        use self::NfaState::*;
        match state {
            End => (End, NfaInputAction::Epsilon),
            StartRecord => {
                if self.term.equals(c) {
                    (StartRecord, NfaInputAction::Discard)
                } else if self.comment == Some(c) {
                    (InComment, NfaInputAction::Discard)
                } else {
                    (StartField, NfaInputAction::Epsilon)
                }
            }
            EndRecord => (StartRecord, NfaInputAction::Epsilon),
            StartField => {
                if self.quoting && self.quote == c {
                    (InQuotedField, NfaInputAction::Discard)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            EndFieldDelim => (StartField, NfaInputAction::Epsilon),
            EndFieldTerm => (InRecordTerm, NfaInputAction::Epsilon),
            InField => {
                if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InQuotedField => {
                if self.quoting && self.quote == c {
                    (InDoubleEscapedQuote, NfaInputAction::Discard)
                } else if self.quoting && self.escape == Some(c) {
                    (InEscapedQuote, NfaInputAction::Discard)
                } else {
                    (InQuotedField, NfaInputAction::CopyToOutput)
                }
            }
            InEscapedQuote => (InQuotedField, NfaInputAction::CopyToOutput),
            InDoubleEscapedQuote => {
                if self.quoting && self.double_quote && self.quote == c {
                    (InQuotedField, NfaInputAction::CopyToOutput)
                } else if self.delimiter == c {
                    (EndFieldDelim, NfaInputAction::Discard)
                } else if self.term.equals(c) {
                    (EndFieldTerm, NfaInputAction::Epsilon)
                } else {
                    (InField, NfaInputAction::CopyToOutput)
                }
            }
            InComment => {
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (InComment, NfaInputAction::Discard)
                }
            }
            InRecordTerm => {
                if self.term.is_crlf() && b'\r' == c {
                    (CRLF, NfaInputAction::Discard)
                } else {
                    (EndRecord, NfaInputAction::Discard)
                }
            }
            CRLF => {
                if b'\n' == c {
                    (StartRecord, NfaInputAction::Discard)
                } else {
                    (StartRecord, NfaInputAction::Epsilon)
                }
            }
        }
    }
}

/// The number of slots in the DFA transition table.
///
/// This number is computed by multiplying the maximum number of transition
/// classes (7) by the total number of NFA states that are used in the DFA
/// (10).
///
/// The number of transition classes is determined by an equivalence class
/// of bytes, where every byte in the same equivalence class is
/// indistinguishable from any other byte with respect to the DFA. For
/// example, if neither `a` nor `b` are specified as a
/// delimiter/quote/terminator/escape, then the DFA will never discriminate
/// between `a` or `b`, so they can effectively be treated as identical.
/// This reduces storage space substantially.
///
/// The total number of NFA states (13) is greater than the total number of
/// NFA states that are in the DFA. In particular, any NFA state that can
/// only be reached by epsilon transitions will never have explicit usage
/// in the DFA.
const TRANS_CLASSES: usize = 7;
const DFA_STATES: usize = 10;
const TRANS_SIZE: usize = TRANS_CLASSES * DFA_STATES;

/// The number of possible transition classes. (See the comment on
/// `TRANS_SIZE` for more details.)
const CLASS_SIZE: usize = 256;

/// A representation of a DFA.
///
/// For the most part, this is a transition table, but various optimizations
/// have been applied to reduce its memory footprint.
struct Dfa {
    /// The core transition table. Each row corresponds to the transitions
    /// for each input equivalence class. (Input bytes are mapped to their
    /// corresponding equivalence class with the `classes` map.)
    ///
    /// DFA states are represented as an index corresponding to the start
    /// of its row in this table.
    trans: [DfaState; TRANS_SIZE],
    /// A table with the same layout as `trans`, except its values indicate
    /// whether a particular `(state, equivalence class)` pair should emit
    /// an output byte.
    has_output: [bool; TRANS_SIZE],
    /// A map from input byte to equivalence class.
    ///
    /// This is responsible for reducing the effective alphabet size from
    /// 256 to `TRANS_CLASSES`.
    classes: DfaClasses,
    /// The DFA state corresponding to being inside an unquoted field.
    in_field: DfaState,
    /// The DFA state corresponding to being inside a quoted field.
    in_quoted: DfaState,
    /// The minimum DFA state that indicates a field has been parsed. All
    /// DFA states greater than this are also final-field states.
    final_field: DfaState,
    /// The minimum DFA state that indicates a record has been parsed. All
    /// DFA states greater than this are also final-record states.
    final_record: DfaState,
}

impl Dfa {
    fn new() -> Dfa {
        Dfa {
            trans: [DfaState(0); TRANS_SIZE],
            has_output: [false; TRANS_SIZE],
            classes: DfaClasses::new(),
            in_field: DfaState(0),
            in_quoted: DfaState(0),
            final_field: DfaState(0),
            final_record: DfaState(0),
        }
    }

    fn new_state(&self, nfa_state: NfaState) -> DfaState {
        let nclasses = self.classes.num_classes() as u8;
        let idx = (nfa_state as u8).checked_mul(nclasses).unwrap();
        DfaState(idx)
    }

    fn new_state_final_end(&self) -> DfaState {
        self.new_state(NfaState::StartRecord)
    }

    fn new_state_final_record(&self) -> DfaState {
        self.new_state(NfaState::EndRecord)
    }

    fn get_output(&self, state: DfaState, c: u8) -> (DfaState, bool) {
        let cls = self.classes.classes[c as usize];
        let idx = state.0 as usize + cls as usize;
        (self.trans[idx], self.has_output[idx])
    }

    fn set(&mut self, from: DfaState, c: u8, to: DfaState, output: bool) {
        let cls = self.classes.classes[c as usize];
        let idx = from.0 as usize + cls as usize;
        self.trans[idx] = to;
        self.has_output[idx] = output;
    }

    fn finish(&mut self) {
        self.in_field = self.new_state(NfaState::InField);
        self.in_quoted = self.new_state(NfaState::InQuotedField);
        self.final_field = self.new_state(NfaState::EndFieldDelim);
        self.final_record = self.new_state(NfaState::EndRecord);
    }

    fn new_read_field_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
    ) -> ReadFieldResult {
        if state >= self.final_record {
            ReadFieldResult::Field { record_end: true }
        } else if state == self.final_field {
            ReadFieldResult::Field { record_end: false }
        } else if is_final_trans && state.is_start() {
            ReadFieldResult::End
        } else {
            debug_assert!(state < self.final_field);
            if !inpdone && outdone {
                ReadFieldResult::OutputFull
            } else {
                ReadFieldResult::InputEmpty
            }
        }
    }

    fn new_read_record_result(
        &self,
        state: DfaState,
        is_final_trans: bool,
        inpdone: bool,
        outdone: bool,
        endsdone: bool,
    ) -> ReadRecordResult {
        if state >= self.final_record {
            ReadRecordResult::Record
        } else if is_final_trans && state.is_start() {
            ReadRecordResult::End
        } else {
            debug_assert!(state < self.final_record);
            if !inpdone && outdone {
                ReadRecordResult::OutputFull
            } else if !inpdone && endsdone {
                ReadRecordResult::OutputEndsFull
            } else {
                ReadRecordResult::InputEmpty
            }
        }
    }
}

/// A map from input byte to equivalence class.
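///
/// With the default configuration, for example, `,` maps to class 1, `"`
/// to class 2, `\r` to class 3 and `\n` to class 4, while every other byte
/// stays in the catch-all class 0.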
struct DfaClasses {
    classes: [u8; CLASS_SIZE],
    next_class: usize,
}

impl DfaClasses {
    fn new() -> DfaClasses {
        DfaClasses { classes: [0; CLASS_SIZE], next_class: 1 }
    }

    fn add(&mut self, b: u8) {
        if self.next_class > CLASS_SIZE {
            panic!("added too many classes")
        }
        self.classes[b as usize] = self.next_class as u8;
        self.next_class = self.next_class + 1;
    }

    fn num_classes(&self) -> usize {
        self.next_class as usize
    }

    /// Scan and copy the input bytes to the output buffer quickly.
    ///
    /// This assumes that the current state of the DFA is either `InField`
    /// or `InQuotedField`. In this case, all bytes corresponding to the
    /// first equivalence class (i.e., not a delimiter/quote/escape/etc.)
    /// are guaranteed to never result in a state transition out of the
    /// current state. This function takes advantage of that and copies
    /// every byte from `input` in the first equivalence class to `output`.
    /// Once a byte is seen outside the first equivalence class, we quit
    /// and should fall back to the main DFA loop.
    #[inline(always)]
    fn scan_and_copy(
        &self,
        input: &[u8],
        nin: &mut usize,
        output: &mut [u8],
        nout: &mut usize,
    ) {
        while *nin < input.len()
            && *nout < output.len()
            && self.classes[input[*nin] as usize] == 0
        {
            output[*nout] = input[*nin];
            *nin += 1;
            *nout += 1;
        }
    }
}

/// A single DFA state.
///
/// A DFA state is represented by the starting index of its corresponding
/// row in the DFA transition table. This representation allows us to elide
/// a single multiplication instruction when computing the next transition
/// for a particular input byte.
#[derive(Clone, Copy, Debug, Eq, Ord, PartialEq, PartialOrd)]
struct DfaState(u8);

impl DfaState {
    fn start() -> DfaState {
        DfaState(0)
    }

    fn is_start(&self) -> bool {
        self.0 == 0
    }
}

impl fmt::Debug for Dfa {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Dfa(N/A)")
    }
}

impl fmt::Debug for DfaClasses {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "DfaClasses {{ classes: N/A, next_class: {:?} }}",
            self.next_class
        )
    }
}

impl Clone for Dfa {
    fn clone(&self) -> Dfa {
        // `Reader` derives `Clone`, so a cloned DFA must carry over all of
        // its state, not just the transition table.
        let mut dfa = Dfa::new();
        dfa.trans.copy_from_slice(&self.trans);
        dfa.has_output.copy_from_slice(&self.has_output);
        dfa.classes = self.classes.clone();
        dfa.in_field = self.in_field;
        dfa.in_quoted = self.in_quoted;
        dfa.final_field = self.final_field;
        dfa.final_record = self.final_record;
        dfa
    }
}

impl Clone for DfaClasses {
    fn clone(&self) -> DfaClasses {
        let mut x = DfaClasses::new();
        x.classes.copy_from_slice(&self.classes);
        x.next_class = self.next_class;
        x
    }
}

#[cfg(test)]
mod tests {
    use core::str;

    use arrayvec::{ArrayString, ArrayVec};

    use super::{ReadFieldResult, Reader, ReaderBuilder, Terminator};

    type Csv = ArrayVec<[Row; 10]>;
    type Row = ArrayVec<[Field; 10]>;
    type Field = ArrayString<[u8; 10]>;

    // OMG I HATE BYTE STRING LITERALS SO MUCH.
    fn b(s: &str) -> &[u8] {
        s.as_bytes()
    }

    macro_rules! csv {
        ($([$($field:expr),*]),*) => {{
            #[allow(unused_mut)]
            fn x() -> Csv {
                let mut csv = Csv::new();
                $(
                    let mut row = Row::new();
                    $(
                        row.push(Field::from($field).unwrap());
                    )*
                    csv.push(row);
                )*
                csv
            }
            x()
        }}
    }

    macro_rules! parses_to {
        ($name:ident, $data:expr, $expected:expr) => {
            parses_to!($name, $data, $expected, |builder| builder);
        };
        ($name:ident, $data:expr, $expected:expr, $config:expr) => {
            #[test]
            fn $name() {
                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by field");

                let mut builder = ReaderBuilder::new();
                builder.nfa(true);
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "nfa by record");

                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_field(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by field");

                let mut builder = ReaderBuilder::new();
                $config(&mut builder);
                let mut rdr = builder.build();
                let got = parse_by_record(&mut rdr, $data);
                let expected = $expected;
                assert_eq!(expected, got, "dfa by record");
            }
        };
    }

    fn parse_by_field(rdr: &mut Reader, data: &str) -> Csv {
        let mut data = data.as_bytes();
        let mut field = [0u8; 10];
        let mut csv = Csv::new();
        let mut row = Row::new();
        let mut outpos = 0;
        loop {
            let (res, nin, nout) =
                rdr.read_field(data, &mut field[outpos..]);
            data = &data[nin..];
            outpos += nout;
            match res {
                ReadFieldResult::InputEmpty => {
                    if !data.is_empty() {
                        panic!("missing input data")
                    }
                }
                ReadFieldResult::OutputFull => panic!("field too large"),
                ReadFieldResult::Field { record_end } => {
                    let s = str::from_utf8(&field[..outpos]).unwrap();
                    row.push(Field::from(s).unwrap());
                    outpos = 0;
                    if record_end {
                        csv.push(row);
                        row = Row::new();
                    }
                }
                ReadFieldResult::End => {
                    return csv;
                }
            }
        }
    }

    fn parse_by_record(rdr: &mut Reader, data: &str) -> Csv {
        use crate::ReadRecordResult::*;

        let mut data = data.as_bytes();
        let mut record = [0; 1024];
        let mut ends = [0; 10];
        let mut csv = Csv::new();
        let (mut outpos, mut endpos) = (0, 0);
        loop {
            let (res, nin, nout, nend) = rdr.read_record(
                data,
                &mut record[outpos..],
                &mut ends[endpos..],
            );
            data = &data[nin..];
            outpos += nout;
            endpos += nend;
            match res {
                InputEmpty => {
                    if !data.is_empty() {
                        panic!("missing input data")
                    }
                }
                OutputFull => panic!("record too large (out buffer)"),
                OutputEndsFull => panic!("record too large (end buffer)"),
                Record => {
                    let s = str::from_utf8(&record[..outpos]).unwrap();
                    let mut start = 0;
                    let mut row = Row::new();
                    for &end in &ends[..endpos] {
                        row.push(Field::from(&s[start..end]).unwrap());
                        start = end;
                    }
                    csv.push(row);
                    outpos = 0;
                    endpos = 0;
                }
                End => return csv,
            }
        }
    }

    parses_to!(one_row_one_field, "a", csv![["a"]]);
    parses_to!(one_row_many_fields, "a,b,c", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma, "a,b,", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_lf, "a\n", csv![["a"]]);
    parses_to!(one_row_many_fields_lf, "a,b,c\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_lf, "a,b,\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_crlf, "a\r\n", csv![["a"]]);
    parses_to!(one_row_many_fields_crlf, "a,b,c\r\n", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_crlf, "a,b,\r\n", csv![["a", "b", ""]]);
    parses_to!(one_row_one_field_cr, "a\r", csv![["a"]]);
    parses_to!(one_row_many_fields_cr, "a,b,c\r", csv![["a", "b", "c"]]);
    parses_to!(one_row_trailing_comma_cr, "a,b,\r", csv![["a", "b", ""]]);
    parses_to!(many_rows_one_field, "a\nb", csv![["a"], ["b"]]);
    parses_to!(
        many_rows_many_fields,
        "a,b,c\nx,y,z",
        csv![["a", "b", "c"], ["x", "y",
"z"]] ); parses_to!( many_rows_trailing_comma, "a,b,\nx,y,", csv![["a", "b", ""], ["x", "y", ""]] ); parses_to!(many_rows_one_field_lf, "a\nb\n", csv![["a"], ["b"]]); parses_to!( many_rows_many_fields_lf, "a,b,c\nx,y,z\n", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( many_rows_trailing_comma_lf, "a,b,\nx,y,\n", csv![["a", "b", ""], ["x", "y", ""]] ); parses_to!(many_rows_one_field_crlf, "a\r\nb\r\n", csv![["a"], ["b"]]); parses_to!( many_rows_many_fields_crlf, "a,b,c\r\nx,y,z\r\n", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( many_rows_trailing_comma_crlf, "a,b,\r\nx,y,\r\n", csv![["a", "b", ""], ["x", "y", ""]] ); parses_to!(many_rows_one_field_cr, "a\rb\r", csv![["a"], ["b"]]); parses_to!( many_rows_many_fields_cr, "a,b,c\rx,y,z\r", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( many_rows_trailing_comma_cr, "a,b,\rx,y,\r", csv![["a", "b", ""], ["x", "y", ""]] ); parses_to!( trailing_lines_no_record, "\n\n\na,b,c\nx,y,z\n\n\n", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( trailing_lines_no_record_cr, "\r\r\ra,b,c\rx,y,z\r\r\r", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!( trailing_lines_no_record_crlf, "\r\n\r\n\r\na,b,c\r\nx,y,z\r\n\r\n\r\n", csv![["a", "b", "c"], ["x", "y", "z"]] ); parses_to!(empty, "", csv![]); parses_to!(empty_lines, "\n\n\n\n", csv![]); parses_to!( empty_lines_interspersed, "\n\na,b\n\n\nx,y\n\n\nm,n\n", csv![["a", "b"], ["x", "y"], ["m", "n"]] ); parses_to!(empty_lines_crlf, "\r\n\r\n\r\n\r\n", csv![]); parses_to!( empty_lines_interspersed_crlf, "\r\n\r\na,b\r\n\r\n\r\nx,y\r\n\r\n\r\nm,n\r\n", csv![["a", "b"], ["x", "y"], ["m", "n"]] ); parses_to!(empty_lines_mixed, "\r\n\n\r\n\n", csv![]); parses_to!( empty_lines_interspersed_mixed, "\n\r\na,b\r\n\n\r\nx,y\r\n\n\r\nm,n\r\n", csv![["a", "b"], ["x", "y"], ["m", "n"]] ); parses_to!(empty_lines_cr, "\r\r\r\r", csv![]); parses_to!( empty_lines_interspersed_cr, "\r\ra,b\r\r\rx,y\r\r\rm,n\r", csv![["a", "b"], ["x", "y"], ["m", "n"]] ); parses_to!( term_weird, "zza,bzc,dzz", csv![["a", "b"], ["c", "d"]], |b: &mut ReaderBuilder| { b.terminator(Terminator::Any(b'z')); } ); parses_to!( ascii_delimited, "a\x1fb\x1ec\x1fd", csv![["a", "b"], ["c", "d"]], |b: &mut ReaderBuilder| { b.ascii(); } ); parses_to!(bom_at_start, "\u{feff}a", csv![["a"]]); parses_to!(bom_in_field, "a\u{feff}", csv![["a\u{feff}"]]); parses_to!(bom_at_field_start, "a,\u{feff}b", csv![["a", "\u{feff}b"]]); parses_to!(quote_empty, "\"\"", csv![[""]]); parses_to!(quote_lf, "\"\"\n", csv![[""]]); parses_to!(quote_space, "\" \"", csv![[" "]]); parses_to!(quote_inner_space, "\" a \"", csv![[" a "]]); parses_to!(quote_outer_space, " \"a\" ", csv![[" \"a\" "]]); parses_to!(quote_change, "zaz", csv![["a"]], |b: &mut ReaderBuilder| { b.quote(b'z'); }); // This one is pretty hokey. // I don't really know what the "right" behavior is. 
parses_to!( quote_delimiter, ",a,,b", csv![["a,b"]], |b: &mut ReaderBuilder| { b.quote(b','); } ); parses_to!(quote_no_escapes, r#""a\"b""#, csv![[r#"a\b""#]]); parses_to!( quote_escapes_no_double, r#""a""b""#, csv![[r#"a"b""#]], |b: &mut ReaderBuilder| { b.double_quote(false); } ); parses_to!( quote_escapes, r#""a\"b""#, csv![[r#"a"b"#]], |b: &mut ReaderBuilder| { b.escape(Some(b'\\')); } ); parses_to!( quote_escapes_change, r#""az"b""#, csv![[r#"a"b"#]], |b: &mut ReaderBuilder| { b.escape(Some(b'z')); } ); parses_to!( quote_escapes_with_comma, r#""\"A,B\"""#, csv![[r#""A,B""#]], |b: &mut ReaderBuilder| { b.escape(Some(b'\\')).double_quote(false); } ); parses_to!( quoting_disabled, r#""abc,foo""#, csv![[r#""abc"#, r#"foo""#]], |b: &mut ReaderBuilder| { b.quoting(false); } ); parses_to!( delimiter_tabs, "a\tb", csv![["a", "b"]], |b: &mut ReaderBuilder| { b.delimiter(b'\t'); } ); parses_to!( delimiter_weird, "azb", csv![["a", "b"]], |b: &mut ReaderBuilder| { b.delimiter(b'z'); } ); parses_to!(extra_record_crlf_1, "foo\n1\n", csv![["foo"], ["1"]]); parses_to!(extra_record_crlf_2, "foo\r\n1\r\n", csv![["foo"], ["1"]]); parses_to!( comment_1, "foo\n# hi\nbar\n", csv![["foo"], ["bar"]], |b: &mut ReaderBuilder| { b.comment(Some(b'#')); } ); parses_to!( comment_2, "foo\n # hi\nbar\n", csv![["foo"], [" # hi"], ["bar"]], |b: &mut ReaderBuilder| { b.comment(Some(b'#')); } ); parses_to!( comment_3, "foo\n# hi\nbar\n", csv![["foo"], ["# hi"], ["bar"]], |b: &mut ReaderBuilder| { b.comment(Some(b'\n')); } ); parses_to!( comment_4, "foo,b#ar,baz", csv![["foo", "b#ar", "baz"]], |b: &mut ReaderBuilder| { b.comment(Some(b'#')); } ); parses_to!( comment_5, "foo,#bar,baz", csv![["foo", "#bar", "baz"]], |b: &mut ReaderBuilder| { b.comment(Some(b'#')); } ); macro_rules! assert_read { ( $rdr:expr, $input:expr, $output:expr, $expect_in:expr, $expect_out:expr, $expect_res:expr ) => {{ let (res, nin, nout) = $rdr.read_field($input, $output); assert_eq!($expect_in, nin); assert_eq!($expect_out, nout); assert_eq!($expect_res, res); }}; } // This tests that feeding a new reader with an empty buffer sends us // straight to End. #[test] fn stream_empty() { use crate::ReadFieldResult::*; let mut rdr = Reader::new(); assert_read!(rdr, &[], &mut [], 0, 0, End); } // Test that a single space is treated as a single field. #[test] fn stream_space() { use crate::ReadFieldResult::*; let mut rdr = Reader::new(); assert_read!(rdr, b(" "), &mut [0], 1, 1, InputEmpty); assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true }); assert_read!(rdr, &[], &mut [0], 0, 0, End); } // Test that a single comma ... #[test] fn stream_comma() { use crate::ReadFieldResult::*; let mut rdr = Reader::new(); assert_read!(rdr, b(","), &mut [0], 1, 0, Field { record_end: false }); assert_read!(rdr, &[], &mut [0], 0, 0, Field { record_end: true }); assert_read!(rdr, &[], &mut [0], 0, 0, End); } // Test that we can read a single large field in multiple output // buffers. 
#[test] fn stream_output_chunks() { use crate::ReadFieldResult::*; let mut inp = b("fooquux"); let out = &mut [0; 2]; let mut rdr = Reader::new(); assert_read!(rdr, inp, out, 2, 2, OutputFull); assert_eq!(out, b("fo")); inp = &inp[2..]; assert_read!(rdr, inp, out, 2, 2, OutputFull); assert_eq!(out, b("oq")); inp = &inp[2..]; assert_read!(rdr, inp, out, 2, 2, OutputFull); assert_eq!(out, b("uu")); inp = &inp[2..]; assert_read!(rdr, inp, out, 1, 1, InputEmpty); assert_eq!(&out[..1], b("x")); inp = &inp[1..]; assert!(inp.is_empty()); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, inp, out, 0, 0, End); } // Test that we can read a single large field across multiple input // buffers. #[test] fn stream_input_chunks() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_read!(rdr, b("fo"), out, 2, 2, InputEmpty); assert_eq!(&out[..2], b("fo")); assert_read!(rdr, b("oq"), &mut out[2..], 2, 2, InputEmpty); assert_eq!(&out[..4], b("fooq")); assert_read!(rdr, b("uu"), &mut out[4..], 2, 2, InputEmpty); assert_eq!(&out[..6], b("fooquu")); assert_read!(rdr, b("x"), &mut out[6..], 1, 1, InputEmpty); assert_eq!(&out[..7], b("fooquux")); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, &[], out, 0, 0, End); } // Test that we can read doubled quotes correctly in a stream. #[test] fn stream_doubled_quotes() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_read!(rdr, b("\"fo\""), out, 4, 2, InputEmpty); assert_eq!(&out[..2], b("fo")); assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty); assert_eq!(&out[..4], b("fo\"o")); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, &[], out, 0, 0, End); } // Test that we can read escaped quotes correctly in a stream. #[test] fn stream_escaped_quotes() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut builder = ReaderBuilder::new(); let mut rdr = builder.escape(Some(b'\\')).build(); assert_read!(rdr, b("\"fo\\"), out, 4, 2, InputEmpty); assert_eq!(&out[..2], b("fo")); assert_read!(rdr, b("\"o"), &mut out[2..], 2, 2, InputEmpty); assert_eq!(&out[..4], b("fo\"o")); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, &[], out, 0, 0, End); } // Test that empty output buffers don't wreak havoc. #[test] fn stream_empty_output() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_read!( rdr, b("foo,bar"), out, 4, 3, Field { record_end: false } ); assert_eq!(&out[..3], b("foo")); assert_read!(rdr, b("bar"), &mut [], 0, 0, OutputFull); assert_read!(rdr, b("bar"), out, 3, 3, InputEmpty); assert_eq!(&out[..3], b("bar")); assert_read!(rdr, &[], out, 0, 0, Field { record_end: true }); assert_read!(rdr, &[], out, 0, 0, End); } // Test that we can reset the parser mid-stream and count on it to do // the right thing. #[test] fn reset_works() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_read!(rdr, b("\"foo"), out, 4, 3, InputEmpty); assert_eq!(&out[..3], b("foo")); // Without resetting the parser state, the reader will remember that // we're in a quoted field, and therefore interpret the leading double // quotes below as a single quote and the trailing quote as a matching // terminator. With the reset, however, the parser forgets the quoted // field and treats the leading double quotes as a syntax quirk and // drops them, in addition to hanging on to the trailing unmatched // quote.
(Matches Python's behavior.) rdr.reset(); assert_read!(rdr, b("\"\"bar\""), out, 6, 4, InputEmpty); assert_eq!(&out[..4], b("bar\"")); } // Test that line number reporting is correct. #[test] fn line_numbers() { use crate::ReadFieldResult::*; let out = &mut [0; 10]; let mut rdr = Reader::new(); assert_eq!(1, rdr.line()); assert_read!(rdr, b("\n\n\n\n"), out, 4, 0, InputEmpty); assert_eq!(5, rdr.line()); assert_read!(rdr, b("foo,"), out, 4, 3, Field { record_end: false }); assert_eq!(5, rdr.line()); assert_read!(rdr, b("bar\n"), out, 4, 3, Field { record_end: true }); assert_eq!(6, rdr.line()); assert_read!(rdr, &[], &mut [0], 0, 0, End); assert_eq!(6, rdr.line()); } macro_rules! assert_read_record { ( $rdr:expr, $input:expr, $output:expr, $ends:expr, $expect_in:expr, $expect_out:expr, $expect_end:expr, $expect_res:expr ) => {{ let (res, nin, nout, nend) = $rdr.read_record($input, $output, $ends); assert_eq!($expect_res, res, "result"); assert_eq!($expect_in, nin, "input"); assert_eq!($expect_out, nout, "output"); assert_eq!($expect_end, nend, "ends"); }}; } // Test that we can incrementally read a record. #[test] fn stream_record() { use crate::ReadRecordResult::*; let mut inp = b("foo,bar\nbaz"); let out = &mut [0; 1024]; let ends = &mut [0; 10]; let mut rdr = Reader::new(); assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record); assert_eq!(ends[0], 3); assert_eq!(ends[1], 6); inp = &inp[8..]; assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty); inp = &inp[3..]; assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record); assert_eq!(ends[0], 3); assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End); } // Test that if our output ends are full during the last read, then // we get an appropriate state returned. #[test] fn stream_record_last_end_output_full() { use crate::ReadRecordResult::*; let mut inp = b("foo,bar\nbaz"); let out = &mut [0; 1024]; let ends = &mut [0; 10]; let mut rdr = Reader::new(); assert_read_record!(rdr, &inp, out, ends, 8, 6, 2, Record); assert_eq!(ends[0], 3); assert_eq!(ends[1], 6); inp = &inp[8..]; assert_read_record!(rdr, &inp, out, ends, 3, 3, 0, InputEmpty); inp = &inp[3..]; assert_read_record!(rdr, &inp, out, &mut [], 0, 0, 0, OutputEndsFull); assert_read_record!(rdr, &inp, out, ends, 0, 0, 1, Record); assert_eq!(ends[0], 3); assert_read_record!(rdr, &inp, out, ends, 0, 0, 0, End); } }
csv-core-0.1.6/src/writer.rs
use core::fmt; use core::str; use memchr::memchr; use crate::{QuoteStyle, Terminator}; /// A builder for configuring a CSV writer. /// /// This builder permits specifying the CSV delimiter, terminator, quoting /// style and more. #[derive(Debug)] pub struct WriterBuilder { wtr: Writer, } impl WriterBuilder { /// Create a new builder for configuring a CSV writer. pub fn new() -> WriterBuilder { let wtr = Writer { state: WriterState::default(), requires_quotes: [false; 256], delimiter: b',', term: Terminator::Any(b'\n'), style: QuoteStyle::default(), quote: b'"', escape: b'\\', double_quote: true, }; WriterBuilder { wtr: wtr } } /// Build a CSV writer from this configuration. pub fn build(&self) -> Writer { use crate::Terminator::*; let mut wtr = self.wtr.clone(); wtr.requires_quotes[self.wtr.delimiter as usize] = true; wtr.requires_quotes[self.wtr.quote as usize] = true; if !self.wtr.double_quote { // We only need to quote the escape character if the escape // character is used for escaping quotes.
wtr.requires_quotes[self.wtr.escape as usize] = true; } match self.wtr.term { CRLF | Any(b'\n') | Any(b'\r') => { // This is a bit hokey. By default, the record terminator // is '\n', but we still need to quote '\r' (even if our // terminator is only `\n`) because the reader interprets '\r' // as a record terminator by default. wtr.requires_quotes[b'\r' as usize] = true; wtr.requires_quotes[b'\n' as usize] = true; } Any(b) => { wtr.requires_quotes[b as usize] = true; } _ => unreachable!(), } wtr } /// The field delimiter to use when writing CSV. /// /// The default is `b','`. pub fn delimiter(&mut self, delimiter: u8) -> &mut WriterBuilder { self.wtr.delimiter = delimiter; self } /// The record terminator to use when writing CSV. /// /// A record terminator can be any single byte. The default is `\n`. /// /// Note that RFC 4180 specifies that record terminators should be `\r\n`. /// To use `\r\n`, use the special `Terminator::CRLF` value. pub fn terminator(&mut self, term: Terminator) -> &mut WriterBuilder { self.wtr.term = term; self } /// The quoting style to use when writing CSV. /// /// By default, this is set to `QuoteStyle::Necessary`, which will only /// use quotes when they are necessary to preserve the integrity of data. /// /// Note that unless the quote style is set to `Never`, an empty field is /// quoted if it is the only field in a record. pub fn quote_style(&mut self, style: QuoteStyle) -> &mut WriterBuilder { self.wtr.style = style; self } /// The quote character to use when writing CSV. /// /// The default value is `b'"'`. pub fn quote(&mut self, quote: u8) -> &mut WriterBuilder { self.wtr.quote = quote; self } /// The escape character to use when writing CSV. /// /// This is only used when `double_quote` is set to `false`. /// /// The default value is `b'\\'`. pub fn escape(&mut self, escape: u8) -> &mut WriterBuilder { self.wtr.escape = escape; self } /// The quoting escape mechanism to use when writing CSV. /// /// When enabled (which is the default), quotes are escaped by doubling /// them. e.g., `"` escapes to `""`. /// /// When disabled, quotes are escaped with the escape character (which /// is `\\` by default). pub fn double_quote(&mut self, yes: bool) -> &mut WriterBuilder { self.wtr.double_quote = yes; self } } impl Default for WriterBuilder { fn default() -> WriterBuilder { WriterBuilder::new() } } /// The result of writing CSV data. /// /// A value of this type is returned from every interaction with `Writer`. It /// informs the caller how to proceed, namely, by indicating whether more /// input should be given (`InputEmpty`) or if a bigger output buffer is needed /// (`OutputFull`). #[derive(Clone, Debug, Eq, PartialEq)] pub enum WriteResult { /// This result occurs when all of the bytes from the given input have /// been processed. InputEmpty, /// This result occurs when the output buffer was too small to process /// all of the input bytes. Generally, this means the caller must call /// the corresponding method again with the rest of the input and more /// room in the output buffer. OutputFull, } /// A writer for CSV data. /// /// # RFC 4180 /// /// This writer conforms to RFC 4180 with one exception: it doesn't guarantee /// that all records written are of the same length. Instead, the onus is on /// the caller to ensure that all records written are of the same length. /// /// Note that the default configuration of a `Writer` uses `\n` for record /// terminators instead of `\r\n` as specified by RFC 4180. 
Use the /// `terminator` method on `WriterBuilder` to set the terminator to `\r\n` if /// it's desired. pub struct Writer { state: WriterState, requires_quotes: [bool; 256], delimiter: u8, term: Terminator, style: QuoteStyle, quote: u8, escape: u8, double_quote: bool, } impl Clone for Writer { fn clone(&self) -> Writer { let mut requires_quotes = [false; 256]; for i in 0..256 { requires_quotes[i] = self.requires_quotes[i]; } Writer { state: self.state.clone(), requires_quotes: requires_quotes, delimiter: self.delimiter, term: self.term, style: self.style, quote: self.quote, escape: self.escape, double_quote: self.double_quote, } } } impl fmt::Debug for Writer { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Writer") .field("state", &self.state) .field("delimiter", &self.delimiter) .field("term", &self.term) .field("style", &self.style) .field("quote", &self.quote) .field("escape", &self.escape) .field("double_quote", &self.double_quote) .finish() } } #[derive(Clone, Debug)] struct WriterState { /// This is set whenever we've begun writing the contents of a field, even /// if the contents are empty. We use it to avoid re-computing whether /// quotes are necessary. in_field: bool, /// This is set whenever we've started writing a field that is enclosed in /// quotes. When the writer is finished, or if a delimiter or terminator /// are written, then a closing quote is inserted when this is true. quoting: bool, /// The number of total bytes written for the current record. /// /// If the writer is finished or a terminator is written when this is `0`, /// then an empty field is added as a pair of adjacent quotes. record_bytes: u64, } impl Writer { /// Creates a new CSV writer with the default configuration. pub fn new() -> Writer { Writer::default() } /// Finish writing CSV data to `output`. /// /// This must be called when one is done writing CSV data to `output`. /// In particular, it will write closing quotes if necessary. pub fn finish(&mut self, mut output: &mut [u8]) -> (WriteResult, usize) { let mut nout = 0; if self.state.record_bytes == 0 && self.state.in_field { assert!(!self.state.quoting); let (res, o) = self.write(&[self.quote, self.quote], output); if o == 0 { return (res, 0); } output = &mut moving(output)[o..]; nout += o; self.state.record_bytes += o as u64; } if !self.state.quoting { return (WriteResult::InputEmpty, nout); } let (res, o) = self.write(&[self.quote], output); if o == 0 { return (res, nout); } nout += o; self.state.record_bytes = 0; self.state.in_field = false; self.state.quoting = false; (res, nout) } /// Write a single CSV field from `input` to `output` while employing this /// writer's quoting style. /// /// This returns the result of writing field data, in addition to the /// number of bytes consumed from `input` and the number of bytes /// written to `output`. /// /// The result of writing field data is either `WriteResult::InputEmpty` /// or `WriteResult::OutputFull`. The former occurs when all bytes in /// `input` were copied to `output`, while the latter occurs when `output` /// is too small to fit everything from `input`. The maximum number of /// bytes that can be written to `output` is `2 + (2 * input.len())` /// because of quoting. (The worst case is a field consisting entirely /// of quotes.) /// /// Multiple successive calls to `field` will write more data to the same /// field. Subsequent fields can be written by calling either `delimiter` /// or `terminator` first. 
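///
/// For example (a minimal sketch, not from the original source), writing a
/// field that contains the delimiter with the default configuration:
///
/// ```ignore
/// use csv_core::{WriteResult, Writer};
///
/// let mut wtr = Writer::new();
/// let out = &mut [0; 16];
/// // "a,b" contains the delimiter, so an opening quote is emitted first;
/// // the closing quote comes later from `delimiter`, `terminator` or
/// // `finish`.
/// let (res, nin, nout) = wtr.field(b"a,b", out);
/// assert_eq!(WriteResult::InputEmpty, res);
/// assert_eq!((3, 4), (nin, nout));
/// assert_eq!(b"\"a,b", &out[..nout]);
/// ```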
/// /// If this writer's quoting style is `QuoteStyle::Necessary`, then `input` /// should contain the *entire* field. Otherwise, whether the field needs /// to be quoted or not cannot be determined. pub fn field( &mut self, input: &[u8], mut output: &mut [u8], ) -> (WriteResult, usize, usize) { let (mut nin, mut nout) = (0, 0); if !self.state.in_field { self.state.quoting = self.should_quote(input); if self.state.quoting { let (res, o) = self.write(&[self.quote], output); if o == 0 { return (res, 0, 0); } output = &mut moving(output)[o..]; nout += o; self.state.record_bytes += o as u64; } self.state.in_field = true; } let (res, i, o) = if self.state.quoting { quote(input, output, self.quote, self.escape, self.double_quote) } else { write_optimistic(input, output) }; nin += i; nout += o; self.state.record_bytes += o as u64; (res, nin, nout) } /// Write the configured field delimiter to `output`. /// /// If the output buffer does not have enough room to fit /// a field delimiter, then nothing is written to `output` /// and `WriteResult::OutputFull` is returned. Otherwise, /// `WriteResult::InputEmpty` is returned along with the number of bytes /// written to `output` (which is always `1`). pub fn delimiter( &mut self, mut output: &mut [u8], ) -> (WriteResult, usize) { let mut nout = 0; if self.state.quoting { let (res, o) = self.write(&[self.quote], output); if o == 0 { return (res, o); } output = &mut moving(output)[o..]; nout += o; self.state.record_bytes += o as u64; self.state.quoting = false; } let (res, o) = self.write(&[self.delimiter], output); if o == 0 { return (res, nout); } nout += o; self.state.record_bytes += o as u64; self.state.in_field = false; (res, nout) } /// Write the configured record terminator to `output`. /// /// If the output buffer does not have enough room to fit a record /// terminator, then no part of the terminator is written and /// `WriteResult::OutputFull` is returned. Otherwise, /// `WriteResult::InputEmpty` is returned along with the number of bytes /// written to `output` (which is always `1` or `2`). pub fn terminator( &mut self, mut output: &mut [u8], ) -> (WriteResult, usize) { let mut nout = 0; if self.state.record_bytes == 0 { assert!(!self.state.quoting); let (res, o) = self.write(&[self.quote, self.quote], output); if o == 0 { return (res, 0); } output = &mut moving(output)[o..]; nout += o; self.state.record_bytes += o as u64; } if self.state.quoting { let (res, o) = self.write(&[self.quote], output); if o == 0 { return (res, o); } output = &mut moving(output)[o..]; nout += o; self.state.record_bytes += o as u64; self.state.quoting = false; } let (res, o) = match self.term { Terminator::CRLF => write_pessimistic(&[b'\r', b'\n'], output), Terminator::Any(b) => write_pessimistic(&[b], output), _ => unreachable!(), }; if o == 0 { return (res, nout); } nout += o; self.state.record_bytes = 0; self.state.in_field = false; (res, nout) } /// Returns true if and only if the given input field *requires* quotes to /// preserve the integrity of `input` while taking into account the current /// configuration of this writer (except for the configured quoting style). 
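///
/// A hypothetical illustration (not from the original source) of which
/// fields trip this check under the default configuration, as exercised
/// through the public `should_quote` (whose `QuoteStyle::Necessary` arm
/// delegates here):
///
/// ```ignore
/// let wtr = WriterBuilder::new().build();
/// assert!(wtr.should_quote(b"a,b"));  // contains the delimiter
/// assert!(wtr.should_quote(b"a\"b")); // contains the quote byte
/// assert!(wtr.should_quote(b"a\nb")); // contains a record terminator
/// assert!(!wtr.should_quote(b"abc")); // ordinary bytes need no quotes
/// ```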
#[inline] fn needs_quotes(&self, mut input: &[u8]) -> bool { let mut needs = false; while !needs && input.len() >= 8 { needs = self.requires_quotes[input[0] as usize] || self.requires_quotes[input[1] as usize] || self.requires_quotes[input[2] as usize] || self.requires_quotes[input[3] as usize] || self.requires_quotes[input[4] as usize] || self.requires_quotes[input[5] as usize] || self.requires_quotes[input[6] as usize] || self.requires_quotes[input[7] as usize]; input = &input[8..]; } needs || input.iter().any(|&b| self.is_special_byte(b)) } /// Returns true if and only if the given byte corresponds to a special /// byte in this CSV writer's configuration. /// /// Note that this does **not** take into account this writer's quoting /// style. #[inline] pub fn is_special_byte(&self, b: u8) -> bool { self.requires_quotes[b as usize] } /// Returns true if and only if we should put the given field data /// in quotes. This takes the quoting style into account. #[inline] pub fn should_quote(&self, input: &[u8]) -> bool { match self.style { QuoteStyle::Always => true, QuoteStyle::Never => false, QuoteStyle::NonNumeric => is_non_numeric(input), QuoteStyle::Necessary => self.needs_quotes(input), _ => unreachable!(), } } /// Return the delimiter used for this writer. #[inline] pub fn get_delimiter(&self) -> u8 { self.delimiter } /// Return the terminator used for this writer. #[inline] pub fn get_terminator(&self) -> Terminator { self.term } /// Return the quoting style used for this writer. #[inline] pub fn get_quote_style(&self) -> QuoteStyle { self.style } /// Return the quote character used for this writer. #[inline] pub fn get_quote(&self) -> u8 { self.quote } /// Return the escape character used for this writer. #[inline] pub fn get_escape(&self) -> u8 { self.escape } /// Return whether this writer doubles quotes or not. When the writer /// does not double quotes, it will escape them using the escape character. #[inline] pub fn get_double_quote(&self) -> bool { self.double_quote } fn write(&self, data: &[u8], output: &mut [u8]) -> (WriteResult, usize) { if data.len() > output.len() { (WriteResult::OutputFull, 0) } else { output[..data.len()].copy_from_slice(data); (WriteResult::InputEmpty, data.len()) } } } impl Default for Writer { fn default() -> Writer { WriterBuilder::new().build() } } impl Default for WriterState { fn default() -> WriterState { WriterState { in_field: false, quoting: false, record_bytes: 0 } } } /// Returns true if and only if the given input is non-numeric. pub fn is_non_numeric(input: &[u8]) -> bool { let s = match str::from_utf8(input) { Err(_) => return true, Ok(s) => s, }; // I suppose this could be faster if we wrote validators of numbers instead // of using the actual parser, but that's probably a lot of work for a bit // of a niche feature. !s.parse::<f64>().is_ok() && !s.parse::<i128>().is_ok() } /// Escapes quotes in `input` and writes the result to `output`. /// /// If `input` does not have a `quote`, then the contents of `input` are /// copied verbatim to `output`. /// /// If `output` is not big enough to store the fully quoted contents of /// `input`, then `WriteResult::OutputFull` is returned. The `output` buffer /// will require at most `2 * input.len()` bytes of storage in the worst case /// (where every byte is a quote). /// /// In streaming contexts, `quote` should be called in a loop until /// `WriteResult::InputEmpty` is returned.
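///
/// A sketch of such a loop (illustrative only, not from the original
/// source):
///
/// ```ignore
/// let mut input = &b"a\"b"[..];
/// let mut out = [0u8; 4];
/// loop {
///     let (res, nin, nout) = quote(input, &mut out, b'"', b'\\', true);
///     input = &input[nin..];
///     // ... hand `out[..nout]` to the consumer here ...
///     if res == WriteResult::InputEmpty {
///         break;
///     }
/// }
/// ```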
It is possible to write an infinite /// loop if your output buffer is less than 2 bytes in length (the minimum /// storage space required to store an escaped quote). /// /// In addition to the `WriteResult`, the number of consumed bytes from `input` /// and the number of bytes written to `output` are also returned. /// /// `quote` is the quote byte and `escape` is the escape byte. If /// `double_quote` is true, then quotes are escaped by doubling them, /// otherwise, quotes are escaped with the `escape` byte. /// /// N.B. This function is provided for low level usage. It is called /// automatically if you're using a `Writer`. pub fn quote( mut input: &[u8], mut output: &mut [u8], quote: u8, escape: u8, double_quote: bool, ) -> (WriteResult, usize, usize) { let (mut nin, mut nout) = (0, 0); loop { match memchr(quote, input) { None => { let (res, i, o) = write_optimistic(input, output); nin += i; nout += o; return (res, nin, nout); } Some(next_quote) => { let (res, i, o) = write_optimistic(&input[..next_quote], output); input = &input[i..]; output = &mut moving(output)[o..]; nin += i; nout += o; if let WriteResult::OutputFull = res { return (res, nin, nout); } if double_quote { let (res, o) = write_pessimistic(&[quote, quote], output); if let WriteResult::OutputFull = res { return (res, nin, nout); } nout += o; output = &mut moving(output)[o..]; } else { let (res, o) = write_pessimistic(&[escape, quote], output); if let WriteResult::OutputFull = res { return (res, nin, nout); } nout += o; output = &mut moving(output)[o..]; } nin += 1; input = &input[1..]; } } } } /// Copy the bytes from `input` to `output`. If `output` is too small to fit /// everything from `input`, then copy `output.len()` bytes from `input`. /// Otherwise, copy everything from `input` into `output`. /// /// In the first case (`output` is too small), `WriteResult::OutputFull` is /// returned, in addition to the number of bytes consumed from `input` and /// the number of bytes written to `output`. /// /// In the second case (`input` is no bigger than `output`), /// `WriteResult::InputEmpty` is returned, in addition to the number of bytes /// consumed from `input` and the number of bytes written to `output`. fn write_optimistic( input: &[u8], output: &mut [u8], ) -> (WriteResult, usize, usize) { if input.len() > output.len() { let input = &input[..output.len()]; output.copy_from_slice(input); (WriteResult::OutputFull, output.len(), output.len()) } else { output[..input.len()].copy_from_slice(input); (WriteResult::InputEmpty, input.len(), input.len()) } } /// Copy the bytes from `input` to `output` only if `input` is no bigger than /// `output`. If `input` is bigger than `output`, then return /// `WriteResult::OutputFull` and copy nothing into `output`. Otherwise, /// return `WriteResult::InputEmpty` and the number of bytes copied into /// `output`. fn write_pessimistic(input: &[u8], output: &mut [u8]) -> (WriteResult, usize) { if input.len() > output.len() { (WriteResult::OutputFull, 0) } else { output[..input.len()].copy_from_slice(input); (WriteResult::InputEmpty, input.len()) } } /// This avoids reborrowing. /// See: https://bluss.github.io/rust/fun/2015/10/11/stuff-the-identity-function-does/ fn moving<T>(x: T) -> T { x } #[cfg(test)] mod tests { use crate::writer::WriteResult::*; use crate::writer::{quote, QuoteStyle, Writer, WriterBuilder}; // OMG I HATE BYTE STRING LITERALS SO MUCH. fn b(s: &str) -> &[u8] { s.as_bytes() } fn s(b: &[u8]) -> &str { ::core::str::from_utf8(b).unwrap() } macro_rules!
assert_field { ( $wtr:expr, $inp:expr, $out:expr, $expect_in:expr, $expect_out:expr, $expect_res:expr, $expect_data:expr ) => {{ let (res, i, o) = $wtr.field($inp, $out); assert_eq!($expect_res, res, "result"); assert_eq!($expect_in, i, "input"); assert_eq!($expect_out, o, "output"); assert_eq!($expect_data, s(&$out[..o]), "data"); }}; } macro_rules! assert_write { ( $wtr:expr, $which:ident, $out:expr, $expect_out:expr, $expect_res:expr, $expect_data:expr ) => {{ let (res, o) = $wtr.$which($out); assert_eq!($expect_res, res, "result"); assert_eq!($expect_out, o, "output"); assert_eq!($expect_data, s(&$out[..o]), "data"); }}; } #[test] fn writer_one_field() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; let mut n = 0; assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc"); n += 3; assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, ""); } #[test] fn writer_one_empty_field_terminator() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n"); assert_write!(wtr, finish, &mut out[..], 0, InputEmpty, ""); } #[test] fn writer_one_empty_field_finish() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); assert_write!(wtr, finish, &mut out[..], 2, InputEmpty, "\"\""); } #[test] fn writer_many_one_empty_field_finish() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n"); assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); assert_write!(wtr, finish, &mut out[..], 2, InputEmpty, "\"\""); } #[test] fn writer_many_one_empty_field_terminator() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n"); assert_field!(wtr, b(""), &mut out[..], 0, 0, InputEmpty, ""); assert_write!(wtr, terminator, &mut out[..], 3, InputEmpty, "\"\"\n"); assert_write!(wtr, finish, &mut out[..], 0, InputEmpty, ""); } #[test] fn writer_one_field_quote() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; let mut n = 0; assert_field!( wtr, b("a\"bc"), &mut out[n..], 4, 6, InputEmpty, "\"a\"\"bc" ); n += 6; assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\""); } #[test] fn writer_one_field_stream() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; let mut n = 0; assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc"); n += 3; assert_field!(wtr, b("x"), &mut out[n..], 1, 1, InputEmpty, "x"); n += 1; assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, ""); } #[test] fn writer_one_field_stream_quote() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; let mut n = 0; assert_field!( wtr, b("abc\""), &mut out[n..], 4, 6, InputEmpty, "\"abc\"\"" ); n += 6; assert_field!(wtr, b("x"), &mut out[n..], 1, 1, InputEmpty, "x"); n += 1; assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\""); } #[test] fn writer_one_field_stream_quote_partial() { let mut wtr = Writer::new(); let out = &mut [0; 4]; assert_field!(wtr, b("ab\"xyz"), out, 2, 3, OutputFull, "\"ab"); assert_field!(wtr, b("\"xyz"), out, 3, 4, OutputFull, "\"\"xy"); assert_field!(wtr, b("z"), out, 1, 1, InputEmpty, "z"); assert_write!(wtr, finish, out, 1, InputEmpty, "\""); } #[test] fn writer_two_fields() { let mut wtr = 
Writer::new(); let out = &mut [0; 1024]; let mut n = 0; assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc"); n += 3; assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ","); n += 1; assert_field!(wtr, b("yz"), &mut out[n..], 2, 2, InputEmpty, "yz"); n += 2; assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, ""); assert_eq!("abc,yz", s(&out[..n])); } #[test] fn writer_two_fields_non_numeric() { let mut wtr = WriterBuilder::new().quote_style(QuoteStyle::NonNumeric).build(); let out = &mut [0; 1024]; let mut n = 0; assert_field!(wtr, b("abc"), &mut out[n..], 3, 4, InputEmpty, "\"abc"); n += 4; assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\","); n += 2; assert_field!(wtr, b("5.2"), &mut out[n..], 3, 3, InputEmpty, "5.2"); n += 3; assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ","); n += 1; assert_field!(wtr, b("98"), &mut out[n..], 2, 2, InputEmpty, "98"); n += 2; assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, ""); assert_eq!("\"abc\",5.2,98", s(&out[..n])); } #[test] fn writer_two_fields_quote() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; let mut n = 0; assert_field!( wtr, b("a,bc"), &mut out[n..], 4, 5, InputEmpty, "\"a,bc" ); n += 5; assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\","); n += 2; assert_field!(wtr, b("\nz"), &mut out[n..], 2, 3, InputEmpty, "\"\nz"); n += 3; assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\""); n += 1; assert_eq!("\"a,bc\",\"\nz\"", s(&out[..n])); } #[test] fn writer_two_fields_two_records() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; let mut n = 0; assert_field!(wtr, b("abc"), &mut out[n..], 3, 3, InputEmpty, "abc"); n += 3; assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ","); n += 1; assert_field!(wtr, b("yz"), &mut out[n..], 2, 2, InputEmpty, "yz"); n += 2; assert_write!(wtr, terminator, &mut out[n..], 1, InputEmpty, "\n"); n += 1; assert_field!(wtr, b("foo"), &mut out[n..], 3, 3, InputEmpty, "foo"); n += 3; assert_write!(wtr, delimiter, &mut out[n..], 1, InputEmpty, ","); n += 1; assert_field!(wtr, b("quux"), &mut out[n..], 4, 4, InputEmpty, "quux"); n += 4; assert_write!(wtr, finish, &mut out[n..], 0, InputEmpty, ""); assert_eq!("abc,yz\nfoo,quux", s(&out[..n])); } #[test] fn writer_two_fields_two_records_quote() { let mut wtr = Writer::new(); let out = &mut [0; 1024]; let mut n = 0; assert_field!( wtr, b("a,bc"), &mut out[n..], 4, 5, InputEmpty, "\"a,bc" ); n += 5; assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\","); n += 2; assert_field!(wtr, b("\nz"), &mut out[n..], 2, 3, InputEmpty, "\"\nz"); n += 3; assert_write!(wtr, terminator, &mut out[n..], 2, InputEmpty, "\"\n"); n += 2; assert_field!( wtr, b("f\"oo"), &mut out[n..], 4, 6, InputEmpty, "\"f\"\"oo" ); n += 6; assert_write!(wtr, delimiter, &mut out[n..], 2, InputEmpty, "\","); n += 2; assert_field!( wtr, b("quux,"), &mut out[n..], 5, 6, InputEmpty, "\"quux," ); n += 6; assert_write!(wtr, finish, &mut out[n..], 1, InputEmpty, "\""); n += 1; assert_eq!("\"a,bc\",\"\nz\"\n\"f\"\"oo\",\"quux,\"", s(&out[..n])); } macro_rules! 
assert_quote { ( $inp:expr, $out:expr, $expect_in:expr, $expect_out:expr, $expect_res:expr, $expect_data:expr ) => { assert_quote!( $inp, $out, $expect_in, $expect_out, $expect_res, $expect_data, true ); }; ( $inp:expr, $out:expr, $expect_in:expr, $expect_out:expr, $expect_res:expr, $expect_data:expr, $double_quote:expr ) => {{ let (res, i, o) = quote($inp, $out, b'"', b'\\', $double_quote); assert_eq!($expect_res, res, "result"); assert_eq!($expect_in, i, "input"); assert_eq!($expect_out, o, "output"); assert_eq!(b($expect_data), &$out[..o], "data"); }}; } #[test] fn quote_empty() { let inp = b(""); let out = &mut [0; 1024]; assert_quote!(inp, out, 0, 0, InputEmpty, ""); } #[test] fn quote_no_quotes() { let inp = b("foobar"); let out = &mut [0; 1024]; assert_quote!(inp, out, 6, 6, InputEmpty, "foobar"); } #[test] fn quote_one_quote() { let inp = b("\""); let out = &mut [0; 1024]; assert_quote!(inp, out, 1, 2, InputEmpty, r#""""#); } #[test] fn quote_two_quotes() { let inp = b("\"\""); let out = &mut [0; 1024]; assert_quote!(inp, out, 2, 4, InputEmpty, r#""""""#); } #[test] fn quote_escaped_one() { let inp = b("\""); let out = &mut [0; 1024]; assert_quote!(inp, out, 1, 2, InputEmpty, r#"\""#, false); } #[test] fn quote_escaped_two() { let inp = b("\"\""); let out = &mut [0; 1024]; assert_quote!(inp, out, 2, 4, InputEmpty, r#"\"\""#, false); } #[test] fn quote_misc() { let inp = b(r#"foo "bar" baz "quux"?"#); let out = &mut [0; 1024]; assert_quote!( inp, out, 21, 25, InputEmpty, r#"foo ""bar"" baz ""quux""?"# ); } #[test] fn quote_stream_no_quotes() { let mut inp = b("fooba"); let out = &mut [0; 2]; assert_quote!(inp, out, 2, 2, OutputFull, "fo"); inp = &inp[2..]; assert_quote!(inp, out, 2, 2, OutputFull, "ob"); inp = &inp[2..]; assert_quote!(inp, out, 1, 1, InputEmpty, "a"); } #[test] fn quote_stream_quotes() { let mut inp = b(r#"a"bc"d""#); let out = &mut [0; 2]; assert_quote!(inp, out, 1, 1, OutputFull, "a"); inp = &inp[1..]; assert_quote!(inp, out, 1, 2, OutputFull, r#""""#); inp = &inp[1..]; assert_quote!(inp, out, 2, 2, OutputFull, "bc"); inp = &inp[2..]; assert_quote!(inp, out, 1, 2, OutputFull, r#""""#); inp = &inp[1..]; assert_quote!(inp, out, 1, 1, OutputFull, "d"); inp = &inp[1..]; assert_quote!(inp, out, 1, 2, InputEmpty, r#""""#); } }
csv-core-0.1.6/.cargo_vcs_info.json
{ "git": { "sha1": "eafc208ccef047d9fe81452f16588c3d48fd6a5d" } }