utf-8-0.7.5/.gitignore

target
Cargo.lock

utf-8-0.7.5/Cargo.toml.orig

[package]
name = "utf-8"
version = "0.7.5"
authors = ["Simon Sapin <simon.sapin@exyr.org>"]
description = "Incremental, zero-copy UTF-8 decoding with error handling"
license = "MIT OR Apache-2.0"
repository = "https://github.com/SimonSapin/rust-utf8"

[lib]
name = "utf8"
test = false
bench = false

[dependencies]

[profile.test]
#opt-level = 3

[profile.bench]
#debug = true

utf-8-0.7.5/Cargo.toml

# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g. crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)

[package]
name = "utf-8"
version = "0.7.5"
authors = ["Simon Sapin <simon.sapin@exyr.org>"]
description = "Incremental, zero-copy UTF-8 decoding with error handling"
license = "MIT OR Apache-2.0"
repository = "https://github.com/SimonSapin/rust-utf8"

[profile.test]

[profile.bench]

[lib]
name = "utf8"
test = false
bench = false

[dependencies]

utf-8-0.7.5/README.md

# rust-utf8

Incremental, zero-copy UTF-8 decoding for Rust

[Documentation](https://docs.rs/utf-8/)

utf-8-0.7.5/benches/from_utf8_lossy.rs

#![feature(test)]

extern crate test;
extern crate utf8;

#[path = "../tests/shared/data.rs"]
mod data;

#[path = "../tests/shared/string_from_utf8_lossy.rs"]
mod string_from_utf8_lossy;

#[bench]
fn bench_our_string_from_utf8_lossy(bencher: &mut test::Bencher) {
    bencher.bytes = data::DECODED_LOSSY.iter().map(|&(input, _expected)| input.len() as u64).sum();
    bencher.iter(|| {
        for &(input, _expected) in data::DECODED_LOSSY {
            test::black_box(string_from_utf8_lossy::string_from_utf8_lossy(input));
        }
    })
}

#[bench]
fn bench_std_string_from_utf8_lossy(bencher: &mut test::Bencher) {
    bencher.bytes = data::DECODED_LOSSY.iter().map(|&(input, _expected)| input.len() as u64).sum();
    bencher.iter(|| {
        for &(input, _expected) in data::DECODED_LOSSY {
            test::black_box(String::from_utf8_lossy(input));
        }
    })
}

utf-8-0.7.5/src/lib.rs

mod lossy;
mod read;

pub use lossy::LossyDecoder;
pub use read::{BufReadDecoder, BufReadDecoderError};

use std::cmp;
use std::error::Error;
use std::fmt;
use std::str;

/// The replacement character, U+FFFD. In lossy decoding, insert it for every decoding error.
pub const REPLACEMENT_CHARACTER: &'static str = "\u{FFFD}";

#[derive(Debug, Copy, Clone)]
pub enum DecodeError<'a> {
    /// In lossy decoding, insert `valid_prefix`, then `"\u{FFFD}"`,
    /// then call `decode()` again with `remaining_input`.
    Invalid {
        valid_prefix: &'a str,
        invalid_sequence: &'a [u8],
        remaining_input: &'a [u8],
    },

    /// Call the `incomplete_suffix.try_complete` method with more input when available.
    /// If no more input is available, this is an invalid byte sequence.
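    ///
    /// For instance (a minimal sketch; the input bytes are illustrative), when the
    /// stream ends with a truncated code point and no further input will arrive:
    ///
    /// ```rust
    /// // "hello " followed by the first byte of a two-byte sequence.
    /// match utf8::decode(b"hello \xC2") {
    ///     Err(utf8::DecodeError::Incomplete { valid_prefix, .. }) => {
    ///         // End of input: treat the incomplete suffix as an error.
    ///         let mut output = valid_prefix.to_string();
    ///         output.push_str(utf8::REPLACEMENT_CHARACTER);
    ///         assert_eq!(output, "hello \u{FFFD}");
    ///     }
    ///     _ => unreachable!(),
    /// }
    /// ```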
    Incomplete {
        valid_prefix: &'a str,
        incomplete_suffix: Incomplete,
    },
}

impl<'a> fmt::Display for DecodeError<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            DecodeError::Invalid {
                valid_prefix,
                invalid_sequence,
                remaining_input,
            } => write!(
                f,
                "found invalid byte sequence {invalid_sequence:02x?} after \
                 {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \
                 unprocessed bytes",
                invalid_sequence = invalid_sequence,
                valid_byte_count = valid_prefix.len(),
                unprocessed_byte_count = remaining_input.len()
            ),
            DecodeError::Incomplete {
                valid_prefix,
                incomplete_suffix,
            } => write!(
                f,
                "found incomplete byte sequence {incomplete_suffix:02x?} after \
                 {valid_byte_count} bytes",
                incomplete_suffix = incomplete_suffix,
                valid_byte_count = valid_prefix.len()
            ),
        }
    }
}

impl<'a> Error for DecodeError<'a> {}

#[derive(Debug, Copy, Clone)]
pub struct Incomplete {
    pub buffer: [u8; 4],
    pub buffer_len: u8,
}

pub fn decode(input: &[u8]) -> Result<&str, DecodeError> {
    let error = match str::from_utf8(input) {
        Ok(valid) => return Ok(valid),
        Err(error) => error,
    };

    // FIXME: separate function from here to guide inlining?
    let (valid, after_valid) = input.split_at(error.valid_up_to());
    let valid = unsafe { str::from_utf8_unchecked(valid) };

    match error.error_len() {
        Some(invalid_sequence_length) => {
            let (invalid, rest) = after_valid.split_at(invalid_sequence_length);
            Err(DecodeError::Invalid {
                valid_prefix: valid,
                invalid_sequence: invalid,
                remaining_input: rest
            })
        }
        None => {
            Err(DecodeError::Incomplete {
                valid_prefix: valid,
                incomplete_suffix: Incomplete::new(after_valid),
            })
        }
    }
}

impl Incomplete {
    pub fn empty() -> Self {
        Incomplete {
            buffer: [0, 0, 0, 0],
            buffer_len: 0,
        }
    }

    pub fn is_empty(&self) -> bool {
        self.buffer_len == 0
    }

    pub fn new(bytes: &[u8]) -> Self {
        let mut buffer = [0, 0, 0, 0];
        let len = bytes.len();
        buffer[..len].copy_from_slice(bytes);
        Incomplete {
            buffer: buffer,
            buffer_len: len as u8,
        }
    }

    /// * `None`: still incomplete, call `try_complete` again with more input.
    ///   If no more input is available, this is an invalid byte sequence.
    /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
    ///   To keep decoding, pass `remaining_input` to `decode()`.
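    ///
    /// A minimal usage sketch (the chunk contents are illustrative): a three-byte
    /// code point split across two input chunks is pieced back together.
    ///
    /// ```rust
    /// use utf8::{decode, DecodeError};
    ///
    /// // "€" is E2 82 AC in UTF-8; the first chunk ends after two of its three bytes.
    /// let mut suffix = match decode(b"hello \xE2\x82") {
    ///     Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => {
    ///         assert_eq!(valid_prefix, "hello ");
    ///         incomplete_suffix
    ///     }
    ///     _ => unreachable!(),
    /// };
    ///
    /// // The next chunk supplies the final byte of the code point.
    /// let (result, remaining_input) = suffix.try_complete(b"\xAC").unwrap();
    /// assert_eq!(result, Ok("€"));
    /// assert!(remaining_input.is_empty());
    /// ```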
    pub fn try_complete<'input>(&mut self, input: &'input [u8])
                                -> Option<(Result<&str, &[u8]>, &'input [u8])> {
        let (consumed, opt_result) = self.try_complete_offsets(input);
        let result = opt_result?;
        let remaining_input = &input[consumed..];
        let result_bytes = self.take_buffer();
        let result = match result {
            Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
            Err(()) => Err(result_bytes),
        };
        Some((result, remaining_input))
    }

    fn take_buffer(&mut self) -> &[u8] {
        let len = self.buffer_len as usize;
        self.buffer_len = 0;
        &self.buffer[..len]
    }

    /// (consumed_from_input, None): not enough input
    /// (consumed_from_input, Some(Err(()))): error bytes in buffer
    /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
        let initial_buffer_len = self.buffer_len as usize;
        let copied_from_input;
        {
            let unwritten = &mut self.buffer[initial_buffer_len..];
            copied_from_input = cmp::min(unwritten.len(), input.len());
            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
        }
        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
        match str::from_utf8(spliced) {
            Ok(_) => {
                self.buffer_len = spliced.len() as u8;
                (copied_from_input, Some(Ok(())))
            }
            Err(error) => {
                let valid_up_to = error.valid_up_to();
                if valid_up_to > 0 {
                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
                    self.buffer_len = valid_up_to as u8;
                    (consumed, Some(Ok(())))
                } else {
                    match error.error_len() {
                        Some(invalid_sequence_length) => {
                            let consumed = invalid_sequence_length
                                .checked_sub(initial_buffer_len).unwrap();
                            self.buffer_len = invalid_sequence_length as u8;
                            (consumed, Some(Err(())))
                        }
                        None => {
                            self.buffer_len = spliced.len() as u8;
                            (copied_from_input, None)
                        }
                    }
                }
            }
        }
    }
}

utf-8-0.7.5/src/lossy.rs

use super::*;

/// A push-based, lossy decoder for UTF-8.
/// Errors are replaced with the U+FFFD replacement character.
///
/// Users “push” bytes into the decoder, which in turn “pushes” `&str` slices into a callback.
///
/// For example, `String::from_utf8_lossy` (but returning `String` instead of `Cow`)
/// can be rewritten as:
///
/// ```rust
/// fn string_from_utf8_lossy(input: &[u8]) -> String {
///     let mut string = String::new();
///     utf8::LossyDecoder::new(|s| string.push_str(s)).feed(input);
///     string
/// }
/// ```
///
/// **Note:** Dropping the decoder signals the end of the input:
/// If the last input chunk ended with an incomplete byte sequence for a code point,
/// this is an error and a replacement character is emitted.
/// Use `std::mem::forget` to inhibit this behavior.
pub struct LossyDecoder<F: FnMut(&str)> {
    push_str: F,
    incomplete: Incomplete,
}

impl<F: FnMut(&str)> LossyDecoder<F> {
    /// Create a new decoder from a callback.
    #[inline]
    pub fn new(push_str: F) -> Self {
        LossyDecoder {
            push_str: push_str,
            incomplete: Incomplete {
                buffer: [0, 0, 0, 0],
                buffer_len: 0,
            },
        }
    }

    /// Feed one chunk of input into the decoder.
    ///
    /// The input is decoded lossily
    /// and the callback is called one or more times with `&str` string slices.
    ///
    /// If the UTF-8 byte sequence for one code point was split between this byte chunk
    /// and previous byte chunks, it will be correctly pieced back together.
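    ///
    /// A minimal sketch (chunk contents are illustrative) of a code point split
    /// across two `feed` calls:
    ///
    /// ```rust
    /// let mut string = String::new();
    /// {
    ///     let mut decoder = utf8::LossyDecoder::new(|s| string.push_str(s));
    ///     decoder.feed(b"hello \xE2\x82"); // ends in the middle of "€"
    ///     decoder.feed(b"\xAC world");
    /// }
    /// assert_eq!(string, "hello € world");
    /// ```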
    pub fn feed(&mut self, mut input: &[u8]) {
        if self.incomplete.buffer_len > 0 {
            match self.incomplete.try_complete(input) {
                Some((Ok(s), remaining)) => {
                    (self.push_str)(s);
                    input = remaining
                }
                Some((Err(_), remaining)) => {
                    (self.push_str)(REPLACEMENT_CHARACTER);
                    input = remaining
                }
                None => {
                    return
                }
            }
        }
        loop {
            match decode(input) {
                Ok(s) => {
                    (self.push_str)(s);
                    return
                }
                Err(DecodeError::Incomplete { valid_prefix, incomplete_suffix }) => {
                    (self.push_str)(valid_prefix);
                    self.incomplete = incomplete_suffix;
                    return
                }
                Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => {
                    (self.push_str)(valid_prefix);
                    (self.push_str)(REPLACEMENT_CHARACTER);
                    input = remaining_input
                }
            }
        }
    }
}

impl<F: FnMut(&str)> Drop for LossyDecoder<F> {
    #[inline]
    fn drop(&mut self) {
        if self.incomplete.buffer_len > 0 {
            (self.push_str)(REPLACEMENT_CHARACTER)
        }
    }
}

utf-8-0.7.5/src/read.rs

use std::io::{self, BufRead};
use std::error::Error;
use std::fmt;
use std::str;

use super::*;

/// Wraps a `std::io::BufRead` buffered byte stream and decodes it as UTF-8.
pub struct BufReadDecoder<B: BufRead> {
    buf_read: B,
    bytes_consumed: usize,
    incomplete: Incomplete,
}

#[derive(Debug)]
pub enum BufReadDecoderError<'a> {
    /// Represents one UTF-8 error in the byte stream.
    ///
    /// In lossy decoding, each such error should be replaced with U+FFFD.
    /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.)
    InvalidByteSequence(&'a [u8]),

    /// An I/O error from the underlying byte stream
    Io(io::Error),
}

impl<'a> BufReadDecoderError<'a> {
    /// Replace UTF-8 errors with U+FFFD
    pub fn lossy(self) -> Result<&'static str, io::Error> {
        match self {
            BufReadDecoderError::Io(error) => Err(error),
            BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER),
        }
    }
}

impl<'a> fmt::Display for BufReadDecoderError<'a> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match *self {
            BufReadDecoderError::InvalidByteSequence(bytes) => {
                write!(f, "invalid byte sequence: {:02x?}", bytes)
            }
            BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err),
        }
    }
}

impl<'a> Error for BufReadDecoderError<'a> {
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        match *self {
            BufReadDecoderError::InvalidByteSequence(_) => None,
            BufReadDecoderError::Io(ref err) => Some(err),
        }
    }
}

impl<B: BufRead> BufReadDecoder<B> {
    /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
    pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
        let mut decoder = Self::new(buf_read);
        let mut string = String::new();
        while let Some(result) = decoder.next_lossy() {
            string.push_str(result?)
        }
        Ok(string)
    }

    pub fn new(buf_read: B) -> Self {
        Self {
            buf_read,
            bytes_consumed: 0,
            incomplete: Incomplete::empty(),
        }
    }

    /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD.
    pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
        self.next_strict().map(|result| result.or_else(|e| e.lossy()))
    }

    /// Decode and consume the next chunk of UTF-8 input.
    ///
    /// This method is intended to be called repeatedly until it returns `None`,
    /// which represents EOF from the underlying byte stream.
    /// This is similar to `Iterator::next`,
    /// except that decoded chunks borrow the decoder (~iterator)
    /// so they need to be handled or copied before the next chunk can start decoding.
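    ///
    /// A minimal sketch, reading from an in-memory buffer (any `BufRead` works;
    /// the bytes here are illustrative):
    ///
    /// ```rust
    /// use utf8::BufReadDecoder;
    ///
    /// let mut decoder = BufReadDecoder::new(&b"hello \xC2\xA9"[..]);
    /// let mut string = String::new();
    /// while let Some(result) = decoder.next_strict() {
    ///     match result {
    ///         Ok(chunk) => string.push_str(chunk),
    ///         Err(error) => panic!("{}", error),
    ///     }
    /// }
    /// assert_eq!(string, "hello ©");
    /// ```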
try_io { ($io_result: expr) => { match $io_result { Ok(value) => value, Err(error) => return Some(Err(BufReadDecoderError::Io(error))) } } } let (source, result) = loop { if self.bytes_consumed > 0 { self.buf_read.consume(self.bytes_consumed); self.bytes_consumed = 0; } let buf = try_io!(self.buf_read.fill_buf()); // Force loop iteration to go through an explicit `continue` enum Unreachable {} let _: Unreachable = if self.incomplete.is_empty() { if buf.is_empty() { return None // EOF } match str::from_utf8(buf) { Ok(_) => { break (BytesSource::BufRead(buf.len()), Ok(())) } Err(error) => { let valid_up_to = error.valid_up_to(); if valid_up_to > 0 { break (BytesSource::BufRead(valid_up_to), Ok(())) } match error.error_len() { Some(invalid_sequence_length) => { break (BytesSource::BufRead(invalid_sequence_length), Err(())) } None => { self.bytes_consumed = buf.len(); self.incomplete = Incomplete::new(buf); // need more input bytes continue } } } } } else { if buf.is_empty() { break (BytesSource::Incomplete, Err(())) // EOF with incomplete code point } let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf); self.bytes_consumed = consumed; match opt_result { None => { // need more input bytes continue } Some(result) => { break (BytesSource::Incomplete, result) } } }; }; let bytes = match source { BytesSource::BufRead(byte_count) => { self.bytes_consumed = byte_count; let buf = try_io!(self.buf_read.fill_buf()); &buf[..byte_count] } BytesSource::Incomplete => { self.incomplete.take_buffer() } }; match result { Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })), Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))), } } } utf-8-0.7.5/tests/unit.rs010064400017500001750000000145411333011330600133740ustar0000000000000000extern crate utf8; use std::borrow::Cow; use std::collections::VecDeque; use std::io; use utf8::*; /// A re-implementation of std::str::from_utf8 pub fn str_from_utf8(input: &[u8]) -> Result<&str, usize> { match decode(input) { Ok(s) => return Ok(s), Err(DecodeError::Invalid { valid_prefix, .. }) | Err(DecodeError::Incomplete { valid_prefix, .. 
#[test]
fn test_str_from_utf8() {
    let xs = b"hello";
    assert_eq!(str_from_utf8(xs), Ok("hello"));

    let xs = "ศไทย中华Việt Nam".as_bytes();
    assert_eq!(str_from_utf8(xs), Ok("ศไทย中华Việt Nam"));

    let xs = b"hello\xFF";
    assert!(str_from_utf8(xs).is_err());
}

#[test]
fn test_is_utf8() {
    // Chars of 1, 2, 3, and 4 bytes
    assert!(str_from_utf8("eé€\u{10000}".as_bytes()).is_ok());
    // invalid prefix
    assert!(str_from_utf8(&[0x80]).is_err());
    // invalid 2 byte prefix
    assert!(str_from_utf8(&[0xc0]).is_err());
    assert!(str_from_utf8(&[0xc0, 0x10]).is_err());
    // invalid 3 byte prefix
    assert!(str_from_utf8(&[0xe0]).is_err());
    assert!(str_from_utf8(&[0xe0, 0x10]).is_err());
    assert!(str_from_utf8(&[0xe0, 0xff, 0x10]).is_err());
    // invalid 4 byte prefix
    assert!(str_from_utf8(&[0xf0]).is_err());
    assert!(str_from_utf8(&[0xf0, 0x10]).is_err());
    assert!(str_from_utf8(&[0xf0, 0xff, 0x10]).is_err());
    assert!(str_from_utf8(&[0xf0, 0xff, 0xff, 0x10]).is_err());

    // deny overlong encodings
    assert!(str_from_utf8(&[0xc0, 0x80]).is_err());
    assert!(str_from_utf8(&[0xc0, 0xae]).is_err());
    assert!(str_from_utf8(&[0xe0, 0x80, 0x80]).is_err());
    assert!(str_from_utf8(&[0xe0, 0x80, 0xaf]).is_err());
    assert!(str_from_utf8(&[0xe0, 0x81, 0x81]).is_err());
    assert!(str_from_utf8(&[0xf0, 0x82, 0x82, 0xac]).is_err());
    assert!(str_from_utf8(&[0xf4, 0x90, 0x80, 0x80]).is_err());

    // deny surrogates
    assert!(str_from_utf8(&[0xED, 0xA0, 0x80]).is_err());
    assert!(str_from_utf8(&[0xED, 0xBF, 0xBF]).is_err());

    assert!(str_from_utf8(&[0xC2, 0x80]).is_ok());
    assert!(str_from_utf8(&[0xDF, 0xBF]).is_ok());
    assert!(str_from_utf8(&[0xE0, 0xA0, 0x80]).is_ok());
    assert!(str_from_utf8(&[0xED, 0x9F, 0xBF]).is_ok());
    assert!(str_from_utf8(&[0xEE, 0x80, 0x80]).is_ok());
    assert!(str_from_utf8(&[0xEF, 0xBF, 0xBF]).is_ok());
    assert!(str_from_utf8(&[0xF0, 0x90, 0x80, 0x80]).is_ok());
    assert!(str_from_utf8(&[0xF4, 0x8F, 0xBF, 0xBF]).is_ok());
}

/// A re-implementation of String::from_utf8_lossy
pub fn string_from_utf8_lossy(input: &[u8]) -> Cow<str> {
    let mut result = decode(input);
    if let Ok(s) = result {
        return s.into()
    }
    let mut string = String::with_capacity(input.len() + REPLACEMENT_CHARACTER.len());
    loop {
        match result {
            Ok(s) => {
                string.push_str(s);
                return string.into()
            }
            Err(DecodeError::Incomplete { valid_prefix, .. }) => {
                string.push_str(valid_prefix);
                string.push_str(REPLACEMENT_CHARACTER);
                return string.into()
            }
            Err(DecodeError::Invalid { valid_prefix, remaining_input, .. }) => {
                string.push_str(valid_prefix);
                string.push_str(REPLACEMENT_CHARACTER);
                result = decode(remaining_input);
            }
        }
    }
}

pub const DECODED_LOSSY: &'static [(&'static [u8], &'static str)] = &[
    (b"hello", "hello"),
    (b"\xe0\xb8\xa8\xe0\xb9\x84\xe0\xb8\x97\xe0\xb8\xa2\xe4\xb8\xad\xe5\x8d\x8e", "ศไทย中华"),
    (b"Vi\xe1\xbb\x87t Nam", "Việt Nam"),
    (b"Hello\xC2 There\xFF ", "Hello\u{FFFD} There\u{FFFD} "),
    (b"Hello\xC0\x80 There", "Hello\u{FFFD}\u{FFFD} There"),
    (b"\xE6\x83 Goodbye", "\u{FFFD} Goodbye"),
    (b"\xF5foo\xF5\x80bar", "\u{FFFD}foo\u{FFFD}\u{FFFD}bar"),
    (b"\xF5foo\xF5\xC2", "\u{FFFD}foo\u{FFFD}\u{FFFD}"),
    (b"\xF1foo\xF1\x80bar\xF1\x80\x80baz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}baz"),
    (b"\xF4foo\xF4\x80bar\xF4\xBFbaz", "\u{FFFD}foo\u{FFFD}bar\u{FFFD}\u{FFFD}baz"),
    (b"\xF0\x80\x80\x80foo\xF0\x90\x80\x80bar", "\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}foo\u{10000}bar"),
    (b"\xF0\x90\x80foo", "\u{FFFD}foo"),
    // surrogates
    (b"\xED\xA0\x80foo\xED\xBF\xBFbar", "\u{FFFD}\u{FFFD}\u{FFFD}foo\u{FFFD}\u{FFFD}\u{FFFD}bar"),
];

#[test]
fn test_string_from_utf8_lossy() {
    for &(input, expected) in DECODED_LOSSY {
        assert_eq!(string_from_utf8_lossy(input), expected);
    }
}

pub fn all_partitions<'a, F>(input: &'a [u8], f: F) where F: Fn(&[&[u8]]) {
    fn all_partitions_inner<'a, F>(chunks: &mut Vec<&'a [u8]>, input: &'a [u8], f: &F)
        where F: Fn(&[&[u8]])
    {
        if input.is_empty() {
            f(chunks)
        }
        for i in 1..(input.len() + 1) {
            chunks.push(&input[..i]);
            all_partitions_inner(chunks, &input[i..], f);
            chunks.pop();
        }
    }

    let mut chunks = Vec::new();
    all_partitions_inner(&mut chunks, input, &f);
    assert_eq!(chunks.len(), 0);
}

#[test]
fn test_incremental_decoder() {
    for &(input, expected) in DECODED_LOSSY {
        all_partitions(input, |chunks| {
            let mut string = String::new();
            {
                let mut decoder = LossyDecoder::new(|s| string.push_str(s));
                for &chunk in &*chunks {
                    decoder.feed(chunk);
                }
            }
            assert_eq!(string, expected);
        });
    }
}

#[test]
fn test_bufread_decoder() {
    for &(input, expected) in DECODED_LOSSY {
        all_partitions(input, |chunks| {
            let chunks = Chunks(chunks.to_vec().into());
            let string = BufReadDecoder::read_to_string_lossy(chunks).unwrap();
            assert_eq!(string, expected)
        });
    }
}

struct Chunks<'a>(VecDeque<&'a [u8]>);

impl<'a> io::Read for Chunks<'a> {
    fn read(&mut self, _: &mut [u8]) -> io::Result<usize> {
        unimplemented!()
    }
}

impl<'a> io::BufRead for Chunks<'a> {
    fn fill_buf(&mut self) -> io::Result<&[u8]> {
        Ok(*self.0.front().unwrap())
    }

    fn consume(&mut self, bytes: usize) {
        {
            let front = self.0.front_mut().unwrap();
            *front = &front[bytes..];
            if !front.is_empty() {
                return
            }
        }
        if self.0.len() > 1 {
            self.0.pop_front();
        }
    }
}

utf-8-0.7.5/.cargo_vcs_info.json

{
  "git": {
    "sha1": "515a835ad7e71ac161b829aa59535abba1a0564f"
  }
}