utf8-width-0.1.5/.cargo_vcs_info.json0000644000000001120000000000000130330ustar { "git": { "sha1": "146420f013ccdfc168fe389252455dda5a2c3b05" } } utf8-width-0.1.5/Cargo.toml0000644000000021530000000000000110400ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "utf8-width" version = "0.1.5" authors = ["Magic Len "] include = ["src/**/*", "Cargo.toml", "README.md", "LICENSE", "benches/bench.rs"] description = "To determine the width of a UTF-8 character by providing its first byte." homepage = "https://magiclen.org/utf8-width" readme = "README.md" keywords = ["utf-8", "width", "length", "character"] categories = ["no-std", "encoding"] license = "MIT" repository = "https://github.com/magiclen/utf8-width" [[bench]] name = "bench" harness = false [dependencies] [dev-dependencies.bencher] version = "0.1.5" utf8-width-0.1.5/Cargo.toml.orig000064400000000000000000000011260000000000000144760ustar 00000000000000[package] name = "utf8-width" version = "0.1.5" authors = ["Magic Len "] edition = "2018" repository = "https://github.com/magiclen/utf8-width" homepage = "https://magiclen.org/utf8-width" keywords = ["utf-8", "width", "length", "character"] categories = ["no-std", "encoding"] description = "To determine the width of a UTF-8 character by providing its first byte." 
readme = "README.md" license = "MIT" include = ["src/**/*", "Cargo.toml", "README.md", "LICENSE", "benches/bench.rs"] [dependencies] [dev-dependencies] bencher = "0.1.5" [[bench]] name = "bench" harness = falseutf8-width-0.1.5/LICENSE000064400000000000000000000020660000000000000126200ustar 00000000000000MIT License Copyright (c) 2020 magiclen.org (Ron Li) Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. utf8-width-0.1.5/README.md000064400000000000000000000011570000000000000130720ustar 00000000000000UTF-8 Width ==================== [![CI](https://github.com/magiclen/utf8-width/actions/workflows/ci.yml/badge.svg)](https://github.com/magiclen/utf8-width/actions/workflows/ci.yml) To determine the width of a UTF-8 character by providing its first byte. 
References: https://tools.ietf.org/html/rfc3629 ## Examples ```rust extern crate utf8_width; assert_eq!(1, utf8_width::get_width(b'1')); assert_eq!(3, utf8_width::get_width("中".as_bytes()[0])); ``` ## Benchmark ```bash cargo bench ``` ## Crates.io https://crates.io/crates/utf8-width ## Documentation https://docs.rs/utf8-width ## License [MIT](LICENSE)utf8-width-0.1.5/benches/bench.rs000064400000000000000000000064250000000000000146520ustar 00000000000000extern crate utf8_width; #[macro_use] extern crate bencher; use std::fs; use bencher::Bencher; #[cfg(unix)] const TEXT_PATH: &str = "benches/data/wikipedia-rust.txt"; #[cfg(windows)] const TEXT_PATH: &str = r"benches\data\wikipedia-rust.txt"; static UTF8_CHAR_WIDTH: [usize; 256] = [ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x1F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x3F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x5F 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0x7F 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x9F 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xBF 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 0xDF 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xEF 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xFF ]; fn retrieve_get_width(bencher: &mut Bencher) { let bytes = fs::read(TEXT_PATH).unwrap(); let length = bytes.len(); bencher.iter(|| { let mut widths = Vec::new(); let mut p = 0; loop { let e = bytes[p]; let width = utf8_width::get_width(e); widths.push(width); p += width; if p == length { break; } } widths }); bencher.bytes = length as u64; } fn retrieve_get_width_assume_valid(bencher: &mut Bencher) { let bytes = 
fs::read(TEXT_PATH).unwrap(); let length = bytes.len(); bencher.iter(|| { let mut widths = Vec::new(); let mut p = 0; let length = bytes.len(); loop { let e = bytes[p]; let width = unsafe { utf8_width::get_width_assume_valid(e) }; widths.push(width); p += width; if p == length { break; } } widths }); bencher.bytes = length as u64; } fn retrieve_get_width_by_looking_table(bencher: &mut Bencher) { let bytes = fs::read(TEXT_PATH).unwrap(); let length = bytes.len(); bencher.iter(|| { let mut widths = Vec::new(); let mut p = 0; let length = bytes.len(); loop { let e = bytes[p]; let width = UTF8_CHAR_WIDTH[e as usize]; widths.push(width); p += width; if p == length { break; } } widths }); bencher.bytes = length as u64; } fn retrieve_get_width_by_chars(bencher: &mut Bencher) { let text = fs::read_to_string(TEXT_PATH).unwrap(); let length = text.len(); bencher.iter(|| { let mut widths = Vec::new(); for c in text.chars() { widths.push(c.len_utf8()) } widths }); bencher.bytes = length as u64; } benchmark_group!(get_width, retrieve_get_width, retrieve_get_width_assume_valid, retrieve_get_width_by_looking_table, retrieve_get_width_by_chars); benchmark_main!(get_width); utf8-width-0.1.5/src/lib.rs000064400000000000000000000040460000000000000135160ustar 00000000000000/*! # UTF-8 Width To determine the width of a UTF-8 character by providing its first byte. 
// */
// ^ Terminates the crate-level block doc comment that opens just before this span;
//   the remaining crate documentation continues as line doc comments below.
//!
//! References: <https://tools.ietf.org/html/rfc3629>
//!
//! ## Examples
//!
//! ```rust
//! extern crate utf8_width;
//!
//! assert_eq!(1, utf8_width::get_width(b'1'));
//! assert_eq!(3, utf8_width::get_width("中".as_bytes()[0]));
//! ```
//!
//! ## Benchmark
//!
//! ```bash
//! cargo bench
//! ```

#![no_std]

/// Lowest byte of the first range that can never start a UTF-8 character.
pub const MIN_0_1: u8 = 0x80;
/// Highest byte of the first range that can never start a UTF-8 character.
pub const MAX_0_1: u8 = 0xC1;
/// Lowest byte of the second range that can never start a UTF-8 character.
pub const MIN_0_2: u8 = 0xF5;
/// Highest byte of the second range that can never start a UTF-8 character.
pub const MAX_0_2: u8 = 0xFF;
/// Lowest first byte of a 1-byte UTF-8 character.
pub const MIN_1: u8 = 0x00;
/// Highest first byte of a 1-byte UTF-8 character.
pub const MAX_1: u8 = 0x7F;
/// Lowest first byte of a 2-byte UTF-8 character.
pub const MIN_2: u8 = 0xC2;
/// Highest first byte of a 2-byte UTF-8 character.
pub const MAX_2: u8 = 0xDF;
/// Lowest first byte of a 3-byte UTF-8 character.
pub const MIN_3: u8 = 0xE0;
/// Highest first byte of a 3-byte UTF-8 character.
pub const MAX_3: u8 = 0xEF;
/// Lowest first byte of a 4-byte UTF-8 character.
pub const MIN_4: u8 = 0xF0;
/// Highest first byte of a 4-byte UTF-8 character.
pub const MAX_4: u8 = 0xF4;

/// Returns `true` if `byte` is the first (and only) byte of a 1-byte UTF-8 character.
#[inline]
pub fn is_width_1(byte: u8) -> bool {
    // `MIN_1` is 0x00, so every `u8` already satisfies the lower bound.
    byte <= MAX_1
}

/// Returns `true` if `byte` is the first byte of a 2-byte UTF-8 character.
#[inline]
pub fn is_width_2(byte: u8) -> bool {
    (MIN_2..=MAX_2).contains(&byte)
}

/// Returns `true` if `byte` is the first byte of a 3-byte UTF-8 character.
#[inline]
pub fn is_width_3(byte: u8) -> bool {
    (MIN_3..=MAX_3).contains(&byte)
}

/// Returns `true` if `byte` is the first byte of a 4-byte UTF-8 character.
#[inline]
pub fn is_width_4(byte: u8) -> bool {
    (MIN_4..=MAX_4).contains(&byte)
}

/// Returns `true` if `byte` cannot be the first byte of any UTF-8 character,
/// i.e. it lies in `MIN_0_1..=MAX_0_1` or `MIN_0_2..=MAX_0_2`.
#[inline]
pub fn is_width_0(byte: u8) -> bool {
    // `MAX_0_2` is 0xFF, so every `u8` already satisfies the upper bound of the second range.
    (MIN_0_1..=MAX_0_1).contains(&byte) || byte >= MIN_0_2
}

/// Given a first byte, determines how many bytes are in this UTF-8 character.
///
/// Returns `1` ~ `4` for a valid first byte. Returns `0` if `byte` cannot start a
/// UTF-8 character, which covers both `0x80..=0xC1` and `0xF5..=0xFF`.
///
/// The result always agrees with [`is_width_0`], [`is_width_1`], [`is_width_2`],
/// [`is_width_3`] and [`is_width_4`].
#[inline]
pub fn get_width(byte: u8) -> usize {
    // Every range is matched with both of its bounds. Checking only the upper bound of the
    // 3-byte range would let `0x80..=0xC1` (which are neither width 1 nor width 2) fall
    // through and be reported as width 3 instead of 0.
    match byte {
        MIN_1..=MAX_1 => 1,
        MIN_2..=MAX_2 => 2,
        MIN_3..=MAX_3 => 3,
        MIN_4..=MAX_4 => 4,
        _ => 0,
    }
}

/// *Assume the input first byte is from a valid UTF-8 character.* Given a first byte,
/// determines how many bytes are in this UTF-8 character. It returns `1` ~ `4`.
///
/// Only upper bounds are compared, so no validation is performed.
///
/// # Safety
///
/// `byte` must be the first byte of a valid UTF-8 character. For any other byte the
/// returned width is meaningless (for example `0x80` yields `2` and `0xFF` yields `4`),
/// and using it to advance through a buffer may skip past or split characters.
#[inline]
pub unsafe fn get_width_assume_valid(byte: u8) -> usize {
    if byte <= MAX_1 {
        1
    } else if byte <= MAX_2 {
        2
    } else if byte <= MAX_3 {
        3
    } else {
        4
    }
}