char_reader-0.1.1/.cargo_vcs_info.json0000644000000001120000000000100132640ustar { "git": { "sha1": "1e9dc67aa83fca4ea6bc636e5c6e5197fb84c8f3" } } char_reader-0.1.1/.gitignore000064400000000000000000000000230072674642500140750ustar 00000000000000/target Cargo.lock char_reader-0.1.1/Cargo.toml0000644000000014420000000000100112710ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "char_reader" version = "0.1.1" authors = ["dystroy "] description = "Safely read wild streams as chars or lines" readme = "README.md" keywords = ["unicode", "char", "reader"] categories = [] license = "MIT" repository = "https://github.com/Canop/char_reader" [dependencies] char_reader-0.1.1/Cargo.toml.orig000064400000000000000000000005170072674642500150040ustar 00000000000000[package] name = "char_reader" version = "0.1.1" authors = ["dystroy "] edition = "2018" keywords = ["unicode", "char", "reader"] license = "MIT" categories = [] description = "Safely read wild streams as chars or lines" repository = "https://github.com/Canop/char_reader" readme = "README.md" [dependencies] char_reader-0.1.1/README.md000064400000000000000000000031710072674642500133730ustar 00000000000000[![MIT][s2]][l2] [![Latest Version][s1]][l1] [![docs][s3]][l3] [![Chat on Miaou][s4]][l4] [s1]: https://img.shields.io/crates/v/char_reader.svg [l1]: https://crates.io/crates/char_reader [s2]: https://img.shields.io/badge/license-MIT-blue.svg [l2]: LICENSE [s3]: https://docs.rs/char_reader/badge.svg [l3]: https://docs.rs/char_reader/ [s4]: 
https://miaou.dystroy.org/static/shields/room.svg [l4]: https://miaou.dystroy.org/3 BufRead's read_line may be a problem when you need performance and safety on unvetted streams: You may wait forever or get an out of memory panic if there's no newline in the stream. And even if there's one, it may be way past what you need: you'll have to keep everything in memory just to get to the start of the following line. `CharReader` is a buffered reader fixing those problems. * you can read lines without choking on an infinite stream without newlines * you can read lines and not store more than necessary if you just want the beginning * there's a `next_char` function to read only one char It's suitable when you'd like to read UTF8 lines and aren't sure the data are kind enough. When reading a line, you pass two parameters: * the max number of chars you want to get (rest of line will be dropped) * the max number of chars before giving up with an error (thus protecting against infinite streams) All errors are `io::Error`: * UTF8 errors are of kind `InvalidData` * Lines exceeding your threshold are of kind `Other` **Alternative:** If you know in advance how many lines you'd need and you always want whole lines, the standard `take` method of `BufReader` protects you against memory overflows. char_reader-0.1.1/src/lib.rs000064400000000000000000000024540072674642500140220ustar 00000000000000/*! BufRead's read_line may be a problem when you need performance and safety on unvetted streams: You may wait forever or get an out of memory panic if there's no newline in the stream. And even if there's one, it may be way past what you need: you'll have to keep everything in memory just to get to the start of the following line. `CharReader` is a buffered reader fixing those problems. 
* you can read lines without choking on an infinite stream without newlines * you can read lines and not store more than necessary if you just want the beginning * there's a `next_char` function to read only one char It's suitable when you'd like to read UTF8 lines and aren't sure the data are kind enough. When reading a line, you pass two parameters: * the max number of chars you want to get (rest of line will be dropped) * the max number of chars before giving out with an error (thus protecting against infinite streams) All errors are `io::Error`: * UTF8 errors are of kind `InvalidData` * Lines exceeding your threshold are of kind `Other` **Alternative:** If you know in advance how many lines you'd need and you always want whole lines, the standard `take` method of [std::io::BufReader] protects you against memory overflows. */ mod reader; mod unicode; #[cfg(test)] mod tests; pub use reader::CharReader; char_reader-0.1.1/src/reader.rs000064400000000000000000000131100072674642500145050ustar 00000000000000use { crate::unicode, std::{ io::{self, Read}, }, }; /// A buffered reader able to read chars or lines without crashing /// when the stream doesn't finish and/or doesn't contain newlines. /// /// It's also able to avoid storying whole lines if you're only /// interested in their beginning. /// /// Bad UTF8 is reported as io::Error with kind InvalidData. pub struct CharReader { src: R, buffer: Box<[u8]>, pos: usize, len: usize, } const DEFAULT_BUF_SIZE: usize = 5_000; impl CharReader { pub fn new(src: R) -> Self { let buf_size = DEFAULT_BUF_SIZE; let buffer = vec![0; buf_size].into_boxed_slice(); // we might be abte to skip filling with 0 with some unsafe Self { src, buffer, pos: 0, len: 0, } } /// ensure there's at least one char in the buffer, and returns it with /// its size in bytes (or None if the underlying stream is finished). /// /// You probably don't need this function but next_char. 
pub fn load_char(&mut self) -> io::Result> { if self.pos >= self.len { // buffer empty self.len = self.src.read(&mut self.buffer)?; if self.len == 0 { return Ok(None); } self.pos = 0; } let b = self.buffer[self.pos]; let char_size = unicode::utf8_char_width(b); if self.pos + char_size > self.len { // there's not enough bytes in buffer // we start by moving what we have at the start of the buffer to make some room self.buffer.copy_within(self.pos..self.len, 0); self.len -= self.pos; self.len += self.src.read(&mut self.buffer[self.len..])?; if self.len < char_size { // we may ignore one to 3 bytes not being correct UTF8 at the // very end of the stream (ie return None instead of an error) return Ok(None); } self.pos = 0; } let code_point = unicode::read_code_point(&self.buffer, self.pos, char_size); match std::char::from_u32(code_point) { Some(c) => Ok(Some((c, char_size))), None => Err(io::Error::new(io::ErrorKind::InvalidData, "Not UTF8")) } } /// read and return the next char, or NONE in case of EOF pub fn next_char(&mut self) -> io::Result> { Ok(match self.load_char()? { Some(cw) => { self.pos += cw.1; Some(cw.0) } None => None, }) } /// return the next char, but doesn't advance the cursor pub fn peek_char(&mut self) -> io::Result> { self.load_char().map(|cw| cw.map(|cw| cw.0)) } /// append the next line, if any, but with some protection against /// wild stream content: /// - don't store chars after the drop_after threshold /// - throw an error after the fail_after threshold /// /// Thresholds are in chars, not bytes nor cols nor graphemes. /// Only difference with next_line is that you pass (and may reuse) /// the string to fill. /// /// Return Ok(false) when there was no error but nothing to read /// (stream finished or paused). /// /// This function may return Ok(true) and not have written anything: /// it means there was an empty line (i.e. 
next char will be a CR or LF) pub fn read_line( &mut self, line: &mut String, // the line to append to drop_after: usize, // don't put in the string chars after that threshold fail_after: usize, // throw an error if there's no new line before that threshold ) -> io::Result { let mut chars_count = 0; // chars seen loop { match self.next_char() { Err(e) => { return Err(e); } Ok(None) => { return Ok(chars_count > 0); } Ok(Some(c)) => { if c == '\r' { if let Ok(Some(('\n', 1))) = self.load_char() { // we consume the LF following the CR self.pos += 1; } return Ok(true); } else if c == '\n' { return Ok(true); } else if chars_count >= fail_after { return Err(io::Error::new(io::ErrorKind::Other, "Line too long")); } else if chars_count >= drop_after { //debug!("dropping char {}", c); } else { line.push(c); } chars_count += 1; } } } } /// return the next line, if any, but with some protection against /// wild stream content: /// - don't store chars after the drop_after threshold /// - throw an error after the fail_after threshold /// /// Thresholds are in chars, not bytes nor cols nor graphemes. 
pub fn next_line( &mut self, drop_after: usize, // don't put in the string chars after that threshold fail_after: usize, // throw an error if there's no new line before that threshold ) -> io::Result> { let mut line = String::new(); match self.read_line(&mut line, drop_after, fail_after) { Ok(true) => Ok(Some(line)), Ok(false) => Ok(None), Err(e) => Err(e), } } } char_reader-0.1.1/src/tests.rs000064400000000000000000000027040072674642500144140ustar 00000000000000use { super::*, std::io, }; static TEXT: &str = "Comunicações\n概\n要éléphants blancs\nالعاشر ليونيكود\n\nhi?\n"; static INVALID: &[u8] = &[b'a', 0xff, 0x4a]; // invalid as UTF8 #[test] fn test_chars(){ let bytes = TEXT.as_bytes(); // a Read, mocking a file or stream let mut reader = CharReader::new(bytes); for str_char in TEXT.chars() { let read_char = reader.next_char().unwrap(); assert_eq!(read_char, Some(str_char)); } assert_eq!(reader.next_char().unwrap(), None); } #[test] fn test_lines(){ let bytes = TEXT.as_bytes(); let mut reader = CharReader::new(bytes); for str_line in TEXT.lines() { let cr_line = reader.next_line(50, 500).unwrap(); assert_eq!(&cr_line.unwrap(), str_line); } assert_eq!(reader.next_line(50, 500).unwrap(), None); } #[test] fn test_thresholds(){ let bytes = TEXT.as_bytes(); let mut reader = CharReader::new(bytes); assert_eq!(&reader.next_line(5, 15).unwrap().unwrap(), "Comun"); assert_eq!(&reader.next_line(5, 15).unwrap().unwrap(), "概"); assert_eq!( reader.next_line(5, 15).map_err(|e| e.kind()), Err(io::ErrorKind::Other), // too long ); } #[test] fn check_utf8_error(){ let mut reader = CharReader::new(INVALID); assert_eq!(reader.next_char().unwrap(), Some('a')); assert_eq!( reader.next_char().map_err(|e| e.kind()), Err(io::ErrorKind::InvalidData), ); } char_reader-0.1.1/src/unicode.rs000064400000000000000000000057220072674642500147030ustar 00000000000000// Everyting in this file has been taken in std::str, where // it hides behind an unstable feature flag. 
// I hope to trash this file at some point.

/// Decode the code point whose first byte is `buffer[pos]` and return it
/// as a raw `u32`; `pos` itself is left to the caller to advance.
///
/// The buffer is assumed to hold the whole sequence: a missing byte makes
/// the indexing panic. Continuation bytes are masked, not validated, and
/// the result isn't checked to be a valid `char`.
///
/// `char_size` is only used to shortcut the one byte case; longer
/// sequences are sized from the first byte, as str::next_code_point
/// (currently unstable) does.
pub fn read_code_point(buffer: &[u8], pos: usize, char_size: usize) -> u32 {
    let first = buffer[pos];
    if char_size == 1 {
        return u32::from(first);
    }
    // Multibyte sequence, made of bytes: first second [third [fourth]]
    let lead = utf8_first_byte(first, 2);
    let second = buffer[pos + 1];
    if first < 0xE0 {
        // two bytes
        return utf8_acc_cont_byte(lead, second);
    }
    // three bytes or more
    // 5th bit in 0xE0 .. 0xEF is always clear, so `lead` is still valid
    let third = buffer[pos + 2];
    let tail = utf8_acc_cont_byte(u32::from(second & CONT_MASK), third);
    if first < 0xF0 {
        return lead << 12 | tail;
    }
    // four bytes: only the lower 3 bits of `lead` are payload
    let fourth = buffer[pos + 3];
    (lead & 7) << 18 | utf8_acc_cont_byte(tail, fourth)
}

/// Extract the payload bits of the first byte of a sequence of `width`
/// bytes: the bottom 5 bits for width 2, 4 bits for width 3 and
/// 3 bits for width 4.
///
/// taken in str unstable internals
#[inline]
fn utf8_first_byte(byte: u8, width: u32) -> u32 {
    let payload_mask = 0x7F >> width;
    u32::from(byte & payload_mask)
}

/// Shift the accumulator `ch` by 6 bits and add to it the 6 payload
/// bits of the continuation byte `byte`.
///
/// taken in str unstable internals
#[inline]
fn utf8_acc_cont_byte(ch: u32, byte: u8) -> u32 {
    let payload = u32::from(byte & CONT_MASK);
    (ch << 6) | payload
}

// Length of the sequence started by each possible first byte, 0 when
// the byte can't start a sequence.
// https://tools.ietf.org/html/rfc3629
static UTF8_CHAR_WIDTH: [u8; 256] = [
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x00 ..= 0x0F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x10 ..= 0x1F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x20 ..= 0x2F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x30 ..= 0x3F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x40 ..= 0x4F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x50 ..= 0x5F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x60 ..= 0x6F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x70 ..= 0x7F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80 ..= 0x8F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90 ..= 0x9F
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0 ..= 0xAF
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0 ..= 0xBF
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xC0 ..= 0xCF
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xD0 ..= 0xDF
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE0 ..= 0xEF
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0 ..= 0xFF
];

/// Tell, from its first byte, how many bytes a UTF-8 character takes.
/// The answer is 0 when `b` can't be the first byte of a character.
#[inline]
pub fn utf8_char_width(b: u8) -> usize {
    usize::from(UTF8_CHAR_WIDTH[usize::from(b)])
}

/// Mask selecting the 6 payload bits of a continuation byte.
const CONT_MASK: u8 = 0b0011_1111;