bytelines-2.4.0/.github/workflows/ci.yml000064400000000000000000000016710072674642500163750ustar 00000000000000name: CI on: push: branches: [ master ] pull_request: branches: [ master ] jobs: build: runs-on: ${{ matrix.os }} strategy: fail-fast: false matrix: os: - macos-latest - ubuntu-latest - windows-latest rust: - stable - beta - nightly steps: - uses: actions/checkout@v2 - uses: actions-rs/toolchain@v1 with: profile: minimal toolchain: ${{ matrix.rust }} override: true components: rustfmt, clippy - uses: actions-rs/cargo@v1 with: command: build - uses: actions-rs/cargo@v1 with: command: test - uses: actions-rs/cargo@v1 with: command: fmt args: --all -- --check - uses: actions-rs/cargo@v1 with: command: clippy args: --all --all-features --profile test bytelines-2.4.0/.gitignore000064400000000000000000000000360072674642500136440ustar 00000000000000/target **/*.rs.bk Cargo.lock bytelines-2.4.0/Cargo.toml0000644000000017210000000000100110340ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2018" name = "bytelines" version = "2.4.0" authors = ["Isaac Whitfield "] description = "Read input lines as byte slices for high efficiency" readme = "README.md" keywords = ["lines", "bytes"] categories = ["parsing", "text-processing"] license = "MIT" repository = "https://github.com/whitfin/bytelines" [dependencies.futures] version = "0.3" [dependencies.tokio] version = "1.14" features = ["fs", "io-util"] [dev-dependencies.tokio] version = "1.14" features = ["full"] bytelines-2.4.0/Cargo.toml.orig000064400000000000000000000010310072674642500145370ustar 00000000000000[package] name = "bytelines" version = "2.4.0" # remember to update html_root_url authors = ["Isaac Whitfield "] description = "Read input lines as byte slices for high efficiency" repository = "https://github.com/whitfin/bytelines" keywords = ["lines", "bytes"] categories = ["parsing", "text-processing"] readme = "README.md" edition = "2018" license = "MIT" [dependencies] futures = "0.3" tokio = { version = "1.14", features = ["fs", "io-util"] } [dev-dependencies] tokio = { version = "1.14", features = ["full"] } bytelines-2.4.0/README.md000064400000000000000000000056710072674642500131450ustar 00000000000000# bytelines [![Crates.io](https://img.shields.io/crates/v/bytelines.svg)](https://crates.io/crates/bytelines) [![Build Status](https://img.shields.io/github/workflow/status/whitfin/bytelines/CI)](https://github.com/whitfin/bytelines/actions) This library provides an easy way to read in input lines as byte slices for high efficiency. It's basically [lines](https://doc.rust-lang.org/std/io/trait.BufRead.html#method.lines) from the standard library, but it reads each line as a byte slice (`&[u8]`). This performs significantly faster than `lines()` in the case you don't particularly care about unicode, and basically as fast as writing the loops out by hand. 
Although the code itself is somewhat trivial, I've had to roll this in at least 4 tools I've written recently and so I figured it was time to have a convenience crate for it. ### Installation This library is available via [Crates.io](https://crates.io/crates/bytelines), so you can add it as a dependency in your `Cargo.toml`: ```toml [dependencies] bytelines = "2.4" ``` ### Usage It's quite simple; in the place you would typically call `lines` on a `BufRead` implementor, you can now use `bytelines` to retrieve a structure used to walk over lines as `&[u8]` (and thus avoid allocations). There are two ways to use the API, and both are shown below: ```rust // our input file we're going to walk over lines of, and our reader let file = File::open("./my-input.txt").expect("able to open file"); let reader = BufReader::new(file); let mut lines = ByteLines::new(reader); // Option 1: Walk using a `while` loop. // // This is the most performant option, as it avoids an allocation by // simply referencing bytes inside the reading structure. This means // that there's no copying at all, until the developer chooses to. while let Some(line) = lines.next() { // do something with the line } // Option 2: Use the `Iterator` trait. // // This is more idiomatic, but requires allocating each line into // an owned `Vec<u8>` to avoid potential memory safety issues. Although // there is an allocation here, the overhead should be negligible // except in cases where performance is paramount. for line in lines.into_iter() { // do something with the line } ``` As of v2.3 this crate includes fairly minimal support for Tokio, namely the `AsyncBufRead` trait. This looks fairly similar to the base APIs, and can be used in much the same way. ```rust // configure our inputs again, using `AsyncByteLines`.
let file = File::open("./my-input.txt").await?; let reader = BufReader::new(file); let mut lines = AsyncByteLines::new(reader); // walk through all lines using a `while` loop while let Some(line) = lines.next().await? { // do something with the line } // walk through all lines using `Stream` APIs lines.into_stream().for_each(|line| { }); ``` The main difference is that the Tokio implementations yield `Result<Option<&[u8]>, _>` instead of `Option<Result<&[u8], _>>` for consistency with the existing Tokio APIs. bytelines-2.4.0/TODO.txt000064400000000000000000000001050072674642500131570ustar 00000000000000- Remove ByteLinesReader (also in README) - Only Tokio feature flags bytelines-2.4.0/res/empty.txt000064400000000000000000000000010072674642500143340ustar 00000000000000 bytelines-2.4.0/res/numbers.txt000064400000000000000000000000240072674642500146560ustar 000000000000000 1 2 3 4 5 6 7 8 9 bytelines-2.4.0/src/lib.rs000064400000000000000000000024430072674642500135630ustar 00000000000000//! `Bytelines` is a simple library crate which offers line iteration for //! `BufRead` via `&[u8]` rather than `String`. //! //! Due to the removal of checking for `String` validity, this is typically //! much faster for reading in raw data and much more flexible. The APIs //! offered in this crate are intended to function exactly the same as the //! `lines` function inside the `BufRead` trait, except that the bytes which //! precede the line delimiter are not validated. //! //! Performance of [ByteLines](struct.ByteLines.html) is practically identical //! to that of writing a `loop` manually, due to the avoidance of allocations. #![doc(html_root_url = "https://docs.rs/bytelines/2.4.0")] use ::std::io::BufRead; use ::tokio::io::AsyncBufRead; // mods mod std; mod tokio; mod util; // expose all public APIs to keep the v2.x interface the same pub use crate::std::{ByteLines, ByteLinesIter, ByteLinesReader}; pub use crate::tokio::AsyncByteLines; /// Creates a new line reader from a stdlib `BufRead`.
#[inline] pub fn from_std(reader: B) -> ByteLines where B: BufRead, { ByteLines::new(reader) } /// Creates a new line reader from a Tokio `AsyncBufRead`. #[inline] pub fn from_tokio(reader: B) -> AsyncByteLines where B: AsyncBufRead + Unpin, { AsyncByteLines::new(reader) } bytelines-2.4.0/src/std.rs000064400000000000000000000130610072674642500136050ustar 00000000000000//! Module exposing APIs based around `BufRead` from stdlib. use std::io::{BufRead, Error}; /// Provides iteration over bytes of input, split by line. /// /// Unlike the implementation in the standard library, this requires /// no allocations and simply references the input lines from the /// internal buffer. In order to do this safely, we must sacrifice /// the `Iterator` API, and operate using `while` syntax: /// /// ```rust /// use bytelines::*; /// use std::fs::File; /// use std::io::BufReader; /// /// // construct our iterator from our file input /// let file = File::open("./res/numbers.txt").unwrap(); /// let reader = BufReader::new(file); /// let mut lines = ByteLines::new(reader); /// /// // walk our lines using `while` syntax /// while let Some(line) = lines.next() { /// // do something with the line, which is Result<&[u8], _> /// } /// ``` /// /// For those who prefer the `Iterator` API, this structure implements /// the `IntoIterator` trait to provide it. This comes at the cost of /// an allocation of a `Vec<u8>` for each line in the `Iterator`.
This is /// negligible in many cases, so often it comes down to which syntax /// is preferred: /// /// ```rust /// use bytelines::*; /// use std::fs::File; /// use std::io::BufReader; /// /// // construct our iterator from our file input /// let file = File::open("./res/numbers.txt").unwrap(); /// let reader = BufReader::new(file); /// let mut lines = ByteLines::new(reader); /// /// // walk our lines using `for` syntax /// for line in lines.into_iter() { /// // do something with the line, which is Result<Vec<u8>, _> /// } /// ``` pub struct ByteLines where B: BufRead, { buffer: Vec, reader: B, } impl ByteLines where B: BufRead, { /// Constructs a new `ByteLines` from an input `BufRead`. pub fn new(buf: B) -> Self { Self { buffer: Vec::new(), reader: buf, } } /// Retrieves a reference to the next line of bytes in the reader (if any). /// /// The internal buffer is cleared and refilled on each call, so the returned /// slice only borrows from `self` until the next call. pub fn next(&mut self) -> Option> { self.buffer.clear(); crate::util::handle_line( self.reader.read_until(b'\n', &mut self.buffer), &mut self.buffer, ) } } /// `IntoIterator` conversion for `ByteLines` to provide `Iterator` APIs. impl IntoIterator for ByteLines where B: BufRead, { type Item = Result, Error>; type IntoIter = ByteLinesIter; /// Constructs a `ByteLinesIter` to provide an `Iterator` API. #[inline] fn into_iter(self) -> ByteLinesIter { ByteLinesIter { inner: self } } } /// `Iterator` implementation of `ByteLines` to provide `Iterator` APIs.
/// /// This structure gives developers access to the `Iterator` API in /// their code, at the cost of an allocation per input line: /// /// ```rust /// use bytelines::*; /// use std::fs::File; /// use std::io::BufReader; /// /// // construct our iterator from our file input /// let file = File::open("./res/numbers.txt").unwrap(); /// let lines = BufReader::new(file); /// let lines = bytelines::from_std(lines); /// /// // walk our lines using `for` syntax /// for line in lines.into_iter() { /// // do something with the line, which is Result<Vec<u8>, _> /// } /// ``` pub struct ByteLinesIter where B: BufRead, { inner: ByteLines, } impl Iterator for ByteLinesIter where B: BufRead, { type Item = Result, Error>; /// Retrieves the next line in the iterator (if any). #[inline] fn next(&mut self) -> Option, Error>> { self.inner.next().map(|r| r.map(|s| s.to_vec())) } } /// Represents anything which can provide iterators of byte lines. pub trait ByteLinesReader where B: BufRead, { /// Returns a structure used to iterate the lines of this reader as `Result<&[u8], _>`. fn byte_lines(self) -> ByteLines; } /// Blanket implementation for all `BufRead`. impl ByteLinesReader for B where B: BufRead, { /// Returns a structure used to iterate the lines of this reader as `Result<&[u8], _>`.
#[inline] fn byte_lines(self) -> ByteLines { super::from_std(self) } } #[cfg(test)] #[allow(clippy::needless_range_loop)] mod tests { use super::*; use std::fs::File; use std::io::BufReader; #[test] fn test_basic_loop() { let file = File::open("./res/numbers.txt").unwrap(); let mut brdr = BufReader::new(file).byte_lines(); let mut lines = Vec::new(); while let Some(line) = brdr.next() { let line = line.unwrap().to_vec(); let line = String::from_utf8(line).unwrap(); lines.push(line); } for i in 0..9 { assert_eq!(lines[i], format!("{}", i)); } } #[test] fn test_basic_iterator() { let file = File::open("./res/numbers.txt").unwrap(); let mut lines = Vec::new(); for line in BufReader::new(file).byte_lines().into_iter() { let line = line.unwrap(); let line = String::from_utf8(line).unwrap(); lines.push(line); } for i in 0..9 { assert_eq!(lines[i], format!("{}", i)); } } #[test] fn test_empty_line() { let file = File::open("./res/empty.txt").unwrap(); let mut lines = Vec::new(); for line in BufReader::new(file).byte_lines().into_iter() { let line = line.unwrap(); let line = String::from_utf8(line).unwrap(); lines.push(line); } assert_eq!(lines.len(), 1); assert_eq!(lines[0], ""); } } bytelines-2.4.0/src/tokio.rs000064400000000000000000000072010072674642500141370ustar 00000000000000//! Module exposing APIs based around `AsyncBufRead` from Tokio. use futures::stream::{self, Stream}; use tokio::io::{AsyncBufRead, AsyncBufReadExt}; use std::io::Error; /// Provides async iteration over bytes of input, split by line. /// /// ```rust ignore /// use bytelines::*; /// use tokio::fs::File; /// use tokio::io::BufReader; /// /// // construct our iterator from our file input /// let file = File::open("./res/numbers.txt").await?; /// let reader = BufReader::new(file); /// let mut lines = AsyncByteLines::new(reader); /// /// // walk our lines using `while` syntax /// while let Some(line) = lines.next().await? 
{ /// // do something with the line, which is &[u8] /// } /// ``` /// /// This differs from the `stdlib` version of the API as it fits /// more closely with the Tokio API for types. /// /// For those who prefer the `Stream` API, this structure can be /// converted using `into_stream`. This comes at the cost of an /// allocation of a `Vec<u8>` for each line in the `Stream`. This is /// negligible in many cases, so often it comes down to which /// syntax is preferred: /// /// ```rust ignore /// use bytelines::*; /// use tokio::fs::File; /// use tokio::io::BufReader; /// /// // construct our iterator from our file input /// let file = File::open("./res/numbers.txt").await?; /// let reader = BufReader::new(file); /// let mut lines = AsyncByteLines::new(reader); /// /// // walk our lines using `Stream` syntax /// lines.into_stream().for_each(|line| { /// /// }); /// ``` pub struct AsyncByteLines where B: AsyncBufRead + Unpin, { buffer: Vec, reader: B, } impl AsyncByteLines where B: AsyncBufRead + Unpin, { /// Constructs a new `AsyncByteLines` from an input `AsyncBufRead`. pub fn new(buf: B) -> Self { Self { buffer: Vec::new(), reader: buf, } } /// Retrieves a reference to the next line of bytes in the reader (if any). pub async fn next(&mut self) -> Result, Error> { self.buffer.clear(); let handled = crate::util::handle_line( self.reader.read_until(b'\n', &mut self.buffer).await, &mut self.buffer, ); handled.transpose() } /// Converts this wrapper to provide a `Stream` API. pub fn into_stream(self) -> impl Stream, Error>> { stream::try_unfold(self, |mut lines| async { Ok(lines .next() .await? 
.map(|line| line.to_vec()) .map(|line| (line, lines))) }) } } #[cfg(test)] #[allow(clippy::needless_range_loop)] mod tests { use tokio::fs::File; use tokio::io::BufReader; #[tokio::test] async fn test_basic_loop() { let file = File::open("./res/numbers.txt").await.unwrap(); let brdr = BufReader::new(file); let mut brdr = crate::from_tokio(brdr); let mut lines = Vec::new(); while let Some(line) = brdr.next().await.unwrap() { let line = line.to_vec(); let line = String::from_utf8(line).unwrap(); lines.push(line); } for i in 0..9 { assert_eq!(lines[i], format!("{}", i)); } } #[tokio::test] async fn test_basic_stream() { use futures::StreamExt; let file = File::open("./res/numbers.txt").await.unwrap(); let brdr = BufReader::new(file); let lines = crate::from_tokio(brdr) .into_stream() .map(|line| String::from_utf8(line.unwrap()).unwrap()) .collect::>() .await; for i in 0..9 { assert_eq!(lines[i], format!("{}", i)); } } } bytelines-2.4.0/src/util.rs000064400000000000000000000014310072674642500137660ustar 00000000000000//! Module exposing utility handlers across read types. use std::io::Result; /// Maps the outcome of a line read onto the provided buffer and returns a reference to the line. /// /// Yields `Some(Err(_))` when `input` is an error, `None` when zero bytes were read, and /// otherwise `Some(Ok(line))` where `line` is the first `n` bytes of `buffer` with a trailing /// `\n` (and a `\r` directly before it, if any) removed. pub fn handle_line(input: Result, buffer: &mut Vec) -> Option> { match input { // short circuit on error Err(e) => Some(Err(e)), // no input, done Ok(0) => None, // bytes! Ok(mut n) => { // "pop" the delim if the line ends with one if buffer[n - 1] == b'\n' { n -= 1; // also "pop" a potential trailing \r (CRLF line endings) if n > 0 && buffer[n - 1] == b'\r' { n -= 1; } } // pass back the byte slice Some(Ok(&buffer[..n])) } } }