str_indices-0.4.3/.cargo_vcs_info.json0000644000000001360000000000100133460ustar { "git": { "sha1": "5a38c5f88ca713cc7bdb93c7d417295d1f46e372" }, "path_in_vcs": "" }str_indices-0.4.3/.gitignore000064400000000000000000000000711046102023000141240ustar 00000000000000/target Cargo.lock **/*.rs.bk perf.data* cachegrind.out* str_indices-0.4.3/CHANGELOG.md000064400000000000000000000043401046102023000137500ustar 00000000000000# Changelog ## [Unreleased] ## [0.4.3] - 2023-11-07 - Fix regression in minimum supported Rust version. No functional changes. ## [0.4.2] - 2023-10-18 - Add SIMD support for Aarch64, and corresponding performance improvements on that platform for all functions. - Performance improvements on x86-64 as well for `char` and LF counting/conversion functions. ## [0.4.1] - 2022-12-18 - Updated readme with note about the `simd` feature flag. ## [0.4.0] - 2022-05-25 ### New Features - Added "simd" feature flag to allow disabling simd usage in the library. ## [0.3.2] - 2022-03-22 ### Performance - Substantially improved performance for `chars::count()` and `lines_lf::count_breaks()` on very short strings, in some cases up to 2x faster. ## [0.3.1] - 2022-03-14 ### Performance - `utf16::to_byte_idx()` is actually optimized now (it was the last remaining non-optimized function), for a ~6x improvement in speed. - Substantially improved performance on Apple M1 platforms (over 6x for some functions). - Mild-to-moderate performance improvements across the board on x86/64. ## [0.3.0] - 2022-03-12 ### New Features - Added `lines_lf` module, a line-feed-only variant of the `lines` module. - Added `lines_crlf` module, a line feed and carriage return variant of the `lines` module. ### Test Suite - Added property testing. - Added fuzzing. ## [0.2.0] - 2022-03-11 - Major clean up of the code and API. - Added minimal documentation. ## [0.1.0] - 2022-03-11 - First release. - Split off from [Ropey](https://crates.io/crates/ropey). 
[Unreleased]: https://github.com/cessen/str_indices/compare/v0.4.3...HEAD [0.4.3]: https://github.com/cessen/str_indices/compare/v0.4.2...v0.4.3 [0.4.2]: https://github.com/cessen/str_indices/compare/v0.4.1...v0.4.2 [0.4.1]: https://github.com/cessen/str_indices/compare/v0.4.0...v0.4.1 [0.4.0]: https://github.com/cessen/str_indices/compare/v0.3.2...v0.4.0 [0.3.2]: https://github.com/cessen/str_indices/compare/v0.3.1...v0.3.2 [0.3.1]: https://github.com/cessen/str_indices/compare/v0.3.0...v0.3.1 [0.3.0]: https://github.com/cessen/str_indices/compare/v0.2.0...v0.3.0 [0.2.0]: https://github.com/cessen/str_indices/compare/v0.1.0...v0.2.0 [0.1.0]: https://github.com/cessen/str_indices/releases/tag/v0.1.0 str_indices-0.4.3/Cargo.toml0000644000000022710000000000100113460ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" name = "str_indices" version = "0.4.3" authors = ["Nathan Vegdahl "] exclude = [ "tests/", "benches/text/", ".github/", ] description = "Count and convert between indexing schemes on string slices." 
documentation = "https://docs.rs/str_indices" readme = "README.md" keywords = [ "text", "string", "nostd", ] categories = ["text-processing"] license = "MIT OR Apache-2.0" repository = "https://github.com/cessen/str_indices" [profile.release] lto = "thin" [[bench]] name = "all" harness = false [dependencies] [dev-dependencies.criterion] version = "0.3" features = ["html_reports"] [dev-dependencies.proptest] version = "1.0" [features] default = ["simd"] simd = [] str_indices-0.4.3/Cargo.toml.orig000064400000000000000000000014351046102023000150300ustar 00000000000000[package] name = "str_indices" version = "0.4.3" edition = "2021" authors = ["Nathan Vegdahl "] description = "Count and convert between indexing schemes on string slices." documentation = "https://docs.rs/str_indices" repository = "https://github.com/cessen/str_indices" readme = "README.md" license = "MIT OR Apache-2.0" keywords = ["text", "string", "nostd"] categories = ["text-processing"] exclude = ["tests/", "benches/text/", ".github/"] [features] default = ["simd"] simd = [] # Enable explicit SIMD optimizations on supported platforms. [dependencies] [dev-dependencies] proptest = "1.0" criterion = { version = "0.3", features = ["html_reports"] } [profile.release] lto = "thin" #----------------------------------------- [[bench]] name = "all" harness = false str_indices-0.4.3/LICENSE-APACHE000064400000000000000000000251441046102023000140700ustar 00000000000000 Apache License Version 2.0, January 2004 https://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. str_indices-0.4.3/LICENSE-MIT000064400000000000000000000020141046102023000135670ustar 00000000000000MIT License Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. str_indices-0.4.3/README.md000064400000000000000000000043461046102023000134240ustar 00000000000000# Str Indices [![Latest Release][crates-io-badge]][crates-io-url] [![Documentation][docs-rs-img]][docs-rs-url] Count and convert between different indexing schemes on utf8 string slices. The following schemes are currently supported: * Chars (Unicode scalar values). * UTF16 code units. * Lines, with three options for recognized line break characters: * Line feed only. * Line feed and carriage return. * All Unicode line break characters, as specified in [Unicode Annex #14](https://www.unicode.org/reports/tr14/). ## Feature Flags The `simd` feature flag (enabled by default) enables explicit SIMD optimizations on supported platforms. Disabling it will use the fallback scalar code path on all platforms. This feature flag only affects performance, not behavior. ## Unsafe Code Str Indices uses unsafe code for performance optimizations, primarily for SIMD intrinsics. The unsafe code is kept minimal, mostly compartmentalized, and easy to audit. The entire code base is also fuzz tested. If you would like to run the fuzz testing suite yourself, install Rust nightly and [cargo fuzz](https://github.com/rust-fuzz/cargo-fuzz/) and run the following command from the repository root: ``` cargo +nightly fuzz run random_strings ``` Additional runs of the fuzz testing suite are very much appreciated, as is general auditing of the code base. If you find any unsoundness, _please_ file an issue! 
## License This project is licensed under either of * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or http://www.apache.org/licenses/LICENSE-2.0) * MIT license ([LICENSE-MIT](LICENSE-MIT) or http://opensource.org/licenses/MIT) at your option. ## Contributing Unless you explicitly state otherwise, any contribution intentionally submitted for inclusion in Str Indices by you will be licensed as above, without any additional terms or conditions. This crate is no-std, doesn't allocate, and has zero dependencies, and aims to remain that way. Please adhere to this in any submitted contributions. [crates-io-badge]: https://img.shields.io/crates/v/str_indices.svg [crates-io-url]: https://crates.io/crates/str_indices [docs-rs-img]: https://docs.rs/str_indices/badge.svg [docs-rs-url]: https://docs.rs/str_indices str_indices-0.4.3/benches/all.rs000064400000000000000000000317151046102023000146720ustar 00000000000000#![allow(clippy::uninlined_format_args)]
use std::{fs, path::Path};
use criterion::{black_box, criterion_group, criterion_main, Criterion, Throughput};
use str_indices::{chars, lines, lines_crlf, lines_lf, utf16};

/// Registers every benchmark of the crate on the given Criterion instance.
///
/// One benchmark group is created per library function (plus a few `_std`
/// groups that time a standard-library equivalent for comparison). Inside a
/// group, one benchmark is registered per input text, with the throughput
/// set to the byte length of that text.
///
/// # Panics
///
/// Panics if any of the benchmark text files under `benches/text` (resolved
/// relative to `CARGO_MANIFEST_DIR`) cannot be read.
fn all(c: &mut Criterion) {
    let root = Path::new(env!("CARGO_MANIFEST_DIR")).join("benches/text");
    // Reads one benchmark text file into a `String`, panicking on failure.
    let read_text =
        |name: &str| fs::read_to_string(root.join(name)).expect("cannot find benchmark text at");

    // Load benchmark strings.
    // The larger inputs are built by repeating a smaller file ten times.
    let test_strings = vec![
        ("en_0001", "E".into()),
        ("en_0010", read_text("en_10.txt")),
        ("en_0100", read_text("en_100.txt")),
        ("en_1000", read_text("en_1000.txt")),
        ("en_10000", read_text("en_1000.txt").repeat(10)),
        ("jp_0003", "日".into()),
        ("jp_0102", read_text("jp_102.txt")),
        ("jp_1001", read_text("jp_1001.txt")),
        ("jp_10000", read_text("jp_1001.txt").repeat(10)),
    ];
    // Inputs for the line-oriented benchmarks, all derived from `lines.txt`.
    let line_strings = vec![
        ("lines_100", read_text("lines.txt")),
        ("lines_1000", read_text("lines.txt").repeat(10)),
        ("lines_10000", read_text("lines.txt").repeat(100)),
    ];

    //---------------------------------------------------------
    // Chars.

    // chars::count()
    {
        let mut group = c.benchmark_group("chars::count");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                bench.iter(|| {
                    black_box(chars::count(text));
                })
            });
        }
    }
    {
        // Equivalent implementations using stdlib functions,
        // for performance comparisons.
        let mut group = c.benchmark_group("chars::count_std");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                bench.iter(|| {
                    black_box(text.chars().count());
                })
            });
        }
    }

    // chars::from_byte_idx()
    {
        let mut group = c.benchmark_group("chars::from_byte_idx");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                // Convert the index at the very end of the text.
                let idx = text.len();
                bench.iter(|| {
                    black_box(chars::from_byte_idx(text, idx));
                })
            });
        }
    }
    {
        // Equivalent implementations using stdlib functions,
        // for performance comparisons.
        let mut group = c.benchmark_group("chars::from_byte_idx_std");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(format!("std::{}", text_name), |bench| {
                let idx = text.len();
                bench.iter(|| {
                    black_box({
                        let mut byte_idx = idx;
                        // Find the beginning of the code point.
                        while !text.is_char_boundary(byte_idx) {
                            byte_idx -= 1;
                        }
                        // Count the number of chars until the
                        // char that begins at `byte_idx`.
                        text[..byte_idx].chars().count()
                    })
                })
            });
        }
    }

    // chars::to_byte_idx()
    {
        let mut group = c.benchmark_group("chars::to_byte_idx");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                // Convert the char index one past the last char of the text.
                let idx = chars::count(text);
                bench.iter(|| {
                    black_box(chars::to_byte_idx(text, idx));
                })
            });
        }
    }
    {
        // Equivalent implementations using stdlib functions,
        // for performance comparisons.
        let mut group = c.benchmark_group("chars::to_byte_idx_std");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(format!("std::{}", text_name), |bench| {
                let idx = chars::count(text) - 1; // Minus 1 so we can unwrap below.
                bench.iter(|| {
                    black_box(text.char_indices().nth(idx).unwrap().0);
                })
            });
        }
    }

    //---------------------------------------------------------
    // UTF16.

    // utf16::count()
    {
        let mut group = c.benchmark_group("utf16::count");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                bench.iter(|| {
                    black_box(utf16::count(text));
                })
            });
        }
    }

    // utf16::count_surrogates()
    {
        let mut group = c.benchmark_group("utf16::count_surrogates");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                bench.iter(|| {
                    black_box(utf16::count_surrogates(text));
                })
            });
        }
    }

    // utf16::from_byte_idx()
    {
        let mut group = c.benchmark_group("utf16::from_byte_idx");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                // Convert the index at the very end of the text.
                let idx = text.len();
                bench.iter(|| {
                    black_box(utf16::from_byte_idx(text, idx));
                })
            });
        }
    }

    // utf16::to_byte_idx()
    {
        let mut group = c.benchmark_group("utf16::to_byte_idx");
        for (text_name, text) in test_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                // Convert the UTF16 index one past the last code unit.
                let idx = utf16::count(text);
                bench.iter(|| {
                    black_box(utf16::to_byte_idx(text, idx));
                })
            });
        }
    }

    //---------------------------------------------------------
    // Lines (unicode).

    // Each entry is (benchmark-name suffix, replacement for '\n' in the
    // line texts), so every break sequence gets its own benchmark.
    let unicode_line_breaks = [
        ("LF", "\u{000A}"),
        ("VT", "\u{000B}"),
        ("FF", "\u{000C}"),
        ("CR", "\u{000D}"),
        ("NEL", "\u{0085}"),
        ("LS", "\u{2028}"),
        ("PS", "\u{2029}"),
        ("CRLF", "\u{000D}\u{000A}"),
    ];

    // lines::count_breaks()
    {
        let mut group = c.benchmark_group("lines::count_breaks");
        for (text_name, text) in line_strings.iter() {
            for (break_name, line_break) in unicode_line_breaks {
                // Swap every '\n' for the break sequence under test.
                let text = text.replace('\n', line_break);
                group.throughput(Throughput::Bytes(text.len() as u64));
                group.bench_function(format!("{text_name}_{break_name}"), |bench| {
                    bench.iter(|| {
                        black_box(lines::count_breaks(&text));
                    })
                });
            }
        }
    }

    // lines::from_byte_idx()
    {
        let mut group = c.benchmark_group("lines::from_byte_idx");
        for (text_name, text) in line_strings.iter() {
            for (break_name, line_break) in unicode_line_breaks {
                let text = text.replace('\n', line_break);
                group.throughput(Throughput::Bytes(text.len() as u64));
                group.bench_function(format!("{text_name}_{break_name}"), |bench| {
                    let idx = text.len();
                    bench.iter(|| {
                        black_box(lines::from_byte_idx(&text, idx));
                    })
                });
            }
        }
    }

    // lines::to_byte_idx()
    {
        let mut group = c.benchmark_group("lines::to_byte_idx");
        for (text_name, text) in line_strings.iter() {
            for (break_name, line_break) in unicode_line_breaks {
                let text = &text.replace('\n', line_break);
                group.throughput(Throughput::Bytes(text.len() as u64));
                group.bench_function(format!("{text_name}_{break_name}"), |bench| {
                    // Line index one greater than the number of breaks.
                    let idx = lines::count_breaks(text) + 1;
                    bench.iter(|| {
                        black_box(lines::to_byte_idx(text, idx));
                    })
                });
            }
        }
    }

    //---------------------------------------------------------
    // Lines (LF).

    // lines_lf::count_breaks()
    {
        let mut group = c.benchmark_group("lines_lf::count_breaks");
        for (text_name, text) in line_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                bench.iter(|| {
                    black_box(lines_lf::count_breaks(text));
                })
            });
        }
    }
    {
        // Version implemented with stdlib functions,
        // for performance comparisons. Note: this
        // isn't exactly identical in behavior, since
        // stdlib ignores document-final line breaks.
        // But it should be close enough for perf
        // comparisons.
        let mut group = c.benchmark_group("lines_lf::count_breaks_std");
        for (text_name, text) in line_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                bench.iter(|| {
                    black_box(text.lines().count());
                })
            });
        }
    }

    // lines_lf::from_byte_idx()
    {
        let mut group = c.benchmark_group("lines_lf::from_byte_idx");
        for (text_name, text) in line_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                let idx = text.len();
                bench.iter(|| {
                    black_box(lines_lf::from_byte_idx(text, idx));
                })
            });
        }
    }

    // lines_lf::to_byte_idx()
    {
        let mut group = c.benchmark_group("lines_lf::to_byte_idx");
        for (text_name, text) in line_strings.iter() {
            group.throughput(Throughput::Bytes(text.len() as u64));
            group.bench_function(*text_name, |bench| {
                // Line index one greater than the number of breaks.
                let idx = lines_lf::count_breaks(text) + 1;
                bench.iter(|| {
                    black_box(lines_lf::to_byte_idx(text, idx));
                })
            });
        }
    }

    //---------------------------------------------------------
    // Lines (CRLF).

    // Same (name suffix, replacement) scheme as for the unicode line
    // benchmarks, restricted to LF, CR and CRLF.
    let crlf_line_breaks = [
        ("LF", "\u{000A}"),
        ("CR", "\u{000D}"),
        ("CRLF", "\u{000D}\u{000A}"),
    ];

    // lines_crlf::count_breaks()
    {
        let mut group = c.benchmark_group("lines_crlf::count_breaks");
        for (text_name, text) in line_strings.iter() {
            for (break_name, line_break) in crlf_line_breaks {
                let text = &text.replace('\n', line_break);
                group.throughput(Throughput::Bytes(text.len() as u64));
                group.bench_function(format!("{text_name}_{break_name}"), |bench| {
                    bench.iter(|| {
                        black_box(lines_crlf::count_breaks(text));
                    })
                });
            }
        }
    }

    // lines_crlf::from_byte_idx()
    {
        let mut group = c.benchmark_group("lines_crlf::from_byte_idx");
        for (text_name, text) in line_strings.iter() {
            for (break_name, line_break) in crlf_line_breaks {
                let text = &text.replace('\n', line_break);
                group.throughput(Throughput::Bytes(text.len() as u64));
                group.bench_function(format!("{text_name}_{break_name}"), |bench| {
                    let idx = text.len();
                    bench.iter(|| {
                        black_box(lines_crlf::from_byte_idx(text, idx));
                    })
                });
            }
        }
    }

    // lines_crlf::to_byte_idx()
    {
        let mut group = c.benchmark_group("lines_crlf::to_byte_idx");
        for (text_name, text) in line_strings.iter() {
            for (break_name, line_break) in crlf_line_breaks {
                let text = &text.replace('\n', line_break);
                group.throughput(Throughput::Bytes(text.len() as u64));
                group.bench_function(format!("{text_name}_{break_name}"), |bench| {
                    // Line index one greater than the number of breaks.
                    let idx = lines_crlf::count_breaks(text) + 1;
                    bench.iter(|| {
                        black_box(lines_crlf::to_byte_idx(text, idx));
                    })
                });
            }
        }
    }
}

//-------------------------------------------------------------

criterion_group!(benches, all,);
criterion_main!(benches);
str_indices-0.4.3/src/byte_chunk.rs000064400000000000000000000274561046102023000154440ustar 00000000000000#[cfg(target_arch = "x86_64")]
use core::arch::x86_64;

#[cfg(target_arch = "aarch64")]
use core::arch::aarch64;

// Which type to actually use at build time.
// SSE2 vector on x86-64 when the `simd` feature is enabled.
#[cfg(all(feature = "simd", target_arch = "x86_64"))]
pub(crate) type Chunk = x86_64::__m128i;

// 128-bit vector of sixteen `u8` lanes on AArch64 when `simd` is enabled.
#[cfg(all(feature = "simd", target_arch = "aarch64"))]
pub(crate) type Chunk = aarch64::uint8x16_t;

// Scalar fallback: used when `simd` is disabled or on any other target.
#[cfg(any(
    not(feature = "simd"),
    not(any(target_arch = "x86_64", target_arch = "aarch64"))
))]
pub(crate) type Chunk = usize;

/// Interface for working with chunks of bytes at a time, providing the
/// operations needed for the functionality in str_indices.
pub(crate) trait ByteChunk: Copy + Clone {
    /// Size of the chunk in bytes.
    const SIZE: usize;

    /// Maximum number of iterations the chunk can accumulate
    /// before sum_bytes() becomes inaccurate.
    const MAX_ACC: usize;

    /// Creates a new chunk with all bytes set to zero.
    fn zero() -> Self;

    /// Creates a new chunk with all bytes set to n.
    fn splat(n: u8) -> Self;

    /// Returns whether all bytes are zero or not.
    fn is_zero(&self) -> bool;

    /// Shifts bytes back lexicographically by n bytes.
    fn shift_back_lex(&self, n: usize) -> Self;

    /// Shifts bits to the right by n bits.
    fn shr(&self, n: usize) -> Self;

    /// Compares bytes for equality with the given byte.
    ///
    /// Bytes that are equal are set to 1, bytes that are not
    /// are set to 0.
    fn cmp_eq_byte(&self, byte: u8) -> Self;

    /// Compares bytes to see if they're in the non-inclusive range (a, b),
    /// where a < b <= 127.
    ///
    /// Bytes in the range are set to 1, bytes not in the range are set to 0.
    fn bytes_between_127(&self, a: u8, b: u8) -> Self;

    /// Performs a bitwise and on two chunks.
    fn bitand(&self, other: Self) -> Self;

    /// Adds the bytes of two chunks together.
    fn add(&self, other: Self) -> Self;

    /// Subtracts other's bytes from this chunk.
    fn sub(&self, other: Self) -> Self;

    /// Increments the nth-from-last lexicographic byte by 1.
    fn inc_nth_from_end_lex_byte(&self, n: usize) -> Self;

    /// Decrements the last lexicographic byte by 1.
    fn dec_last_lex_byte(&self) -> Self;

    /// Returns the sum of all bytes in the chunk.
    fn sum_bytes(&self) -> usize;
}

/// Scalar implementation that treats a `usize` as `size_of::<usize>()`
/// independent byte lanes packed into one machine word.
///
/// Several methods use the constant `ONES` (`usize::MAX / 0xFF`), which is
/// the word with every byte equal to `0x01`, and `ONES_HIGH` (`ONES << 7`),
/// which is the word with every byte equal to `0x80`.
impl ByteChunk for usize {
    const SIZE: usize = core::mem::size_of::<usize>();

    // `sum_bytes()` gathers the total in a single byte, so the total must
    // stay below 256: with every lane at most `MAX_ACC`, the sum is at most
    // `SIZE * MAX_ACC`, which is less than 256.
    const MAX_ACC: usize = (256 / core::mem::size_of::<usize>()) - 1;

    #[inline(always)]
    fn zero() -> Self {
        0
    }

    #[inline(always)]
    fn splat(n: u8) -> Self {
        const ONES: usize = usize::MAX / 0xFF;
        // Multiplying 0x0101..01 by `n` copies `n` into every byte.
        ONES * n as usize
    }

    #[inline(always)]
    fn is_zero(&self) -> bool {
        *self == 0
    }

    #[inline(always)]
    fn shift_back_lex(&self, n: usize) -> Self {
        // "Lexicographic" order is memory order, so the direction of the
        // bit shift depends on the target's endianness.
        if cfg!(target_endian = "little") {
            *self >> (n * 8)
        } else {
            *self << (n * 8)
        }
    }

    #[inline(always)]
    fn shr(&self, n: usize) -> Self {
        *self >> n
    }

    #[inline(always)]
    fn cmp_eq_byte(&self, byte: u8) -> Self {
        const ONES: usize = usize::MAX / 0xFF;
        const ONES_HIGH: usize = ONES << 7;
        // After the xor, a lane is zero exactly where it equalled `byte`.
        let word = *self ^ (byte as usize * ONES);
        // Adding 0x7F to the low seven bits of a lane sets its high bit iff
        // those bits are non-zero; or-ing in `word` also covers lanes whose
        // own high bit is set. Inverting and masking then leaves the high
        // bit set only in all-zero lanes, and the shift moves it to bit 0.
        (!(((word & !ONES_HIGH) + !ONES_HIGH) | word) & ONES_HIGH) >> 7
    }

    #[inline(always)]
    fn bytes_between_127(&self, a: u8, b: u8) -> Self {
        const ONES: usize = usize::MAX / 0xFF;
        const ONES_HIGH: usize = ONES << 7;
        // Low seven bits of every lane.
        let tmp = *self & (ONES * 127);
        // Per lane, the high bit of the result is set iff all three hold:
        //   (127 + b) - tmp >= 128   i.e. tmp < b
        //   the lane's own high bit is clear (the byte is <= 127)
        //   tmp + (127 - a) >= 128   i.e. tmp > a
        (((ONES * (127 + b as usize) - tmp) & !*self & (tmp + (ONES * (127 - a as usize))))
            & ONES_HIGH)
            >> 7
    }

    #[inline(always)]
    fn bitand(&self, other: Self) -> Self {
        *self & other
    }

    #[inline(always)]
    fn add(&self, other: Self) -> Self {
        *self + other
    }

    #[inline(always)]
    fn sub(&self, other: Self) -> Self {
        *self - other
    }

    #[inline(always)]
    fn inc_nth_from_end_lex_byte(&self, n: usize) -> Self {
        // The last byte in memory order is the most significant byte on
        // little-endian targets and the least significant on big-endian.
        if cfg!(target_endian = "little") {
            *self + (1 << ((Self::SIZE - 1 - n) * 8))
        } else {
            *self + (1 << (n * 8))
        }
    }

    #[inline(always)]
    fn dec_last_lex_byte(&self) -> Self {
        if cfg!(target_endian = "little") {
            *self - (1 << ((Self::SIZE - 1) * 8))
        } else {
            *self - 1
        }
    }

    #[inline(always)]
    fn sum_bytes(&self) -> usize {
        const ONES: usize = usize::MAX / 0xFF;
        // Multiplying by 0x0101..01 accumulates the sum of all bytes in the
        // most significant byte, which is then shifted down.
        self.wrapping_mul(ONES) >> ((Self::SIZE - 1) * 8)
    }
}

// Note: use only SSE2 and older instructions, since these are
// guaranteed on all x86_64 platforms.
#[cfg(target_arch = "x86_64")] impl ByteChunk for x86_64::__m128i { const SIZE: usize = core::mem::size_of::(); const MAX_ACC: usize = 255; #[inline(always)] fn zero() -> Self { unsafe { x86_64::_mm_setzero_si128() } } #[inline(always)] fn splat(n: u8) -> Self { unsafe { x86_64::_mm_set1_epi8(n as i8) } } #[inline(always)] fn is_zero(&self) -> bool { let tmp = unsafe { core::mem::transmute::(*self) }; tmp.0 == 0 && tmp.1 == 0 } #[inline(always)] fn shift_back_lex(&self, n: usize) -> Self { match n { 0 => *self, 1 => unsafe { x86_64::_mm_srli_si128(*self, 1) }, 2 => unsafe { x86_64::_mm_srli_si128(*self, 2) }, 3 => unsafe { x86_64::_mm_srli_si128(*self, 3) }, 4 => unsafe { x86_64::_mm_srli_si128(*self, 4) }, _ => unreachable!(), } } #[inline(always)] fn shr(&self, n: usize) -> Self { match n { 0 => *self, 1 => unsafe { x86_64::_mm_srli_epi64(*self, 1) }, 2 => unsafe { x86_64::_mm_srli_epi64(*self, 2) }, 3 => unsafe { x86_64::_mm_srli_epi64(*self, 3) }, 4 => unsafe { x86_64::_mm_srli_epi64(*self, 4) }, _ => unreachable!(), } } #[inline(always)] fn cmp_eq_byte(&self, byte: u8) -> Self { let tmp = unsafe { x86_64::_mm_cmpeq_epi8(*self, Self::splat(byte)) }; unsafe { x86_64::_mm_and_si128(tmp, Self::splat(1)) } } #[inline(always)] fn bytes_between_127(&self, a: u8, b: u8) -> Self { let tmp1 = unsafe { x86_64::_mm_cmpgt_epi8(*self, Self::splat(a)) }; let tmp2 = unsafe { x86_64::_mm_cmplt_epi8(*self, Self::splat(b)) }; let tmp3 = unsafe { x86_64::_mm_and_si128(tmp1, tmp2) }; unsafe { x86_64::_mm_and_si128(tmp3, Self::splat(1)) } } #[inline(always)] fn bitand(&self, other: Self) -> Self { unsafe { x86_64::_mm_and_si128(*self, other) } } #[inline(always)] fn add(&self, other: Self) -> Self { unsafe { x86_64::_mm_add_epi8(*self, other) } } #[inline(always)] fn sub(&self, other: Self) -> Self { unsafe { x86_64::_mm_sub_epi8(*self, other) } } #[inline(always)] fn inc_nth_from_end_lex_byte(&self, n: usize) -> Self { let mut tmp = unsafe { core::mem::transmute::(*self) }; tmp[15 
- n] += 1; unsafe { core::mem::transmute::<[u8; 16], Self>(tmp) } } #[inline(always)] fn dec_last_lex_byte(&self) -> Self { let mut tmp = unsafe { core::mem::transmute::(*self) }; tmp[15] -= 1; unsafe { core::mem::transmute::<[u8; 16], Self>(tmp) } } #[inline(always)] fn sum_bytes(&self) -> usize { let half_sum = unsafe { x86_64::_mm_sad_epu8(*self, x86_64::_mm_setzero_si128()) }; let (low, high) = unsafe { core::mem::transmute::(half_sum) }; (low + high) as usize } } #[cfg(target_arch = "aarch64")] impl ByteChunk for aarch64::uint8x16_t { const SIZE: usize = core::mem::size_of::(); const MAX_ACC: usize = 255; #[inline(always)] fn zero() -> Self { unsafe { aarch64::vdupq_n_u8(0) } } #[inline(always)] fn splat(n: u8) -> Self { unsafe { aarch64::vdupq_n_u8(n) } } #[inline(always)] fn is_zero(&self) -> bool { unsafe { aarch64::vmaxvq_u8(*self) == 0 } } #[inline(always)] fn shift_back_lex(&self, n: usize) -> Self { unsafe { match n { 1 => aarch64::vextq_u8(*self, Self::zero(), 1), 2 => aarch64::vextq_u8(*self, Self::zero(), 2), _ => unreachable!(), } } } #[inline(always)] fn shr(&self, n: usize) -> Self { unsafe { let u64_vec = aarch64::vreinterpretq_u64_u8(*self); let result = match n { 1 => aarch64::vshrq_n_u64(u64_vec, 1), _ => unreachable!(), }; aarch64::vreinterpretq_u8_u64(result) } } #[inline(always)] fn cmp_eq_byte(&self, byte: u8) -> Self { unsafe { let equal = aarch64::vceqq_u8(*self, Self::splat(byte)); aarch64::vshrq_n_u8(equal, 7) } } #[inline(always)] fn bytes_between_127(&self, a: u8, b: u8) -> Self { use aarch64::vreinterpretq_s8_u8 as cast; unsafe { let a_gt = aarch64::vcgtq_s8(cast(*self), cast(Self::splat(a))); let b_gt = aarch64::vcltq_s8(cast(*self), cast(Self::splat(b))); let in_range = aarch64::vandq_u8(a_gt, b_gt); aarch64::vshrq_n_u8(in_range, 7) } } #[inline(always)] fn bitand(&self, other: Self) -> Self { unsafe { aarch64::vandq_u8(*self, other) } } #[inline(always)] fn add(&self, other: Self) -> Self { unsafe { aarch64::vaddq_u8(*self, 
other) } } #[inline(always)] fn sub(&self, other: Self) -> Self { unsafe { aarch64::vsubq_u8(*self, other) } } #[inline(always)] fn inc_nth_from_end_lex_byte(&self, n: usize) -> Self { const END: i32 = Chunk::SIZE as i32 - 1; match n { 0 => unsafe { let lane = aarch64::vgetq_lane_u8(*self, END); aarch64::vsetq_lane_u8(lane + 1, *self, END) }, 1 => unsafe { let lane = aarch64::vgetq_lane_u8(*self, END - 1); aarch64::vsetq_lane_u8(lane + 1, *self, END - 1) }, _ => unreachable!(), } } #[inline(always)] fn dec_last_lex_byte(&self) -> Self { const END: i32 = Chunk::SIZE as i32 - 1; unsafe { let last = aarch64::vgetq_lane_u8(*self, END); aarch64::vsetq_lane_u8(last - 1, *self, END) } } #[inline(always)] fn sum_bytes(&self) -> usize { unsafe { aarch64::vaddlvq_u8(*self).into() } } } //============================================================= #[cfg(test)] mod tests { use super::*; #[test] fn usize_flag_bytes_01() { let v: usize = 0xE2_09_08_A6_E2_A6_E2_09; assert_eq!(0x00_00_00_00_00_00_00_00, v.cmp_eq_byte(0x07)); assert_eq!(0x00_00_01_00_00_00_00_00, v.cmp_eq_byte(0x08)); assert_eq!(0x00_01_00_00_00_00_00_01, v.cmp_eq_byte(0x09)); assert_eq!(0x00_00_00_01_00_01_00_00, v.cmp_eq_byte(0xA6)); assert_eq!(0x01_00_00_00_01_00_01_00, v.cmp_eq_byte(0xE2)); } #[test] fn usize_bytes_between_127_01() { let v: usize = 0x7E_09_00_A6_FF_7F_08_07; assert_eq!(0x01_01_00_00_00_00_01_01, v.bytes_between_127(0x00, 0x7F)); assert_eq!(0x00_01_00_00_00_00_01_00, v.bytes_between_127(0x07, 0x7E)); assert_eq!(0x00_01_00_00_00_00_00_00, v.bytes_between_127(0x08, 0x7E)); } #[cfg(all(feature = "simd", any(target_arch = "x86_64", target_arch = "aarch64")))] #[test] fn sum_bytes_simd() { let ones = Chunk::splat(1); let mut acc = Chunk::zero(); for _ in 0..Chunk::MAX_ACC { acc = acc.add(ones); } assert_eq!(acc.sum_bytes(), Chunk::SIZE * Chunk::MAX_ACC); } } str_indices-0.4.3/src/chars.rs000064400000000000000000000246241046102023000144030ustar 00000000000000//! Index by chars. 
use crate::byte_chunk::{ByteChunk, Chunk}; /// Counts the chars in a string slice. /// /// Runs in O(N) time. #[inline] pub fn count(text: &str) -> usize { count_impl::(text.as_bytes()) } /// Converts from byte-index to char-index in a string slice. /// /// If the byte is in the middle of a multi-byte char, returns the index of /// the char that the byte belongs to. /// /// Any past-the-end index will return the one-past-the-end char index. /// /// Runs in O(N) time. #[inline] pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize { let bytes = text.as_bytes(); // Ensure the index is either a char boundary or is off the end of // the text. let mut i = byte_idx; while Some(true) == bytes.get(i).map(is_trailing_byte) { i -= 1; } count_impl::(&bytes[0..i.min(bytes.len())]) } /// Converts from char-index to byte-index in a string slice. /// /// Any past-the-end index will return the one-past-the-end byte index. /// /// Runs in O(N) time. #[inline] pub fn to_byte_idx(text: &str, char_idx: usize) -> usize { to_byte_idx_impl::(text.as_bytes(), char_idx) } //------------------------------------------------------------- #[inline(always)] fn to_byte_idx_impl(text: &[u8], char_idx: usize) -> usize { if text.len() <= T::SIZE { // Bypass the more complex routine for short strings, where the // complexity hurts performance. let mut char_count = 0; for (i, byte) in text.iter().enumerate() { char_count += is_leading_byte(byte) as usize; if char_count > char_idx { return i; } } return text.len(); } // Get `middle` so we can do more efficient chunk-based counting. // We can't use this to get `end`, however, because the start index of // `end` actually depends on the accumulating char counts during the // counting process. let (start, middle, _) = unsafe { text.align_to::() }; let mut byte_count = 0; let mut char_count = 0; // Take care of any unaligned bytes at the beginning. 
for byte in start.iter() { char_count += is_leading_byte(byte) as usize; if char_count > char_idx { return byte_count; } byte_count += 1; } // Process chunks in the fast path. Ensure that we don't go past the number // of chars we are counting towards let fast_path_chunks = middle.len().min((char_idx - char_count) / T::SIZE); let bytes = T::SIZE * 4; for chunks in middle[..fast_path_chunks].chunks_exact(4) { let val1 = count_trailing_chunk(chunks[0]); let val2 = count_trailing_chunk(chunks[1]); let val3 = count_trailing_chunk(chunks[2]); let val4 = count_trailing_chunk(chunks[3]); char_count += bytes - val1.add(val2).add(val3.add(val4)).sum_bytes(); byte_count += bytes; } // Process the rest of chunks in the slow path. for chunk in middle[(fast_path_chunks - fast_path_chunks % 4)..].iter() { let new_char_count = char_count + T::SIZE - count_trailing_chunk(*chunk).sum_bytes(); if new_char_count >= char_idx { break; } char_count = new_char_count; byte_count += T::SIZE; } // Take care of any unaligned bytes at the end. let end = &text[byte_count..]; for byte in end.iter() { char_count += is_leading_byte(byte) as usize; if char_count > char_idx { break; } byte_count += 1; } byte_count } #[inline(always)] pub(crate) fn count_impl(text: &[u8]) -> usize { if text.len() < T::SIZE { // Bypass the more complex routine for short strings, where the // complexity hurts performance. return text.iter().map(|x| is_leading_byte(x) as usize).sum(); } // Get `middle` for more efficient chunk-based counting. let (start, middle, end) = unsafe { text.align_to::() }; let mut inv_count = 0; // Take care of unaligned bytes at the beginning. inv_count += start.iter().filter(|x| is_trailing_byte(x)).count(); // Take care of the middle bytes in big chunks. Loop unrolled. 
for chunks in middle.chunks_exact(4) { let val1 = count_trailing_chunk(chunks[0]); let val2 = count_trailing_chunk(chunks[1]); let val3 = count_trailing_chunk(chunks[2]); let val4 = count_trailing_chunk(chunks[3]); inv_count += val1.add(val2).add(val3.add(val4)).sum_bytes(); } let mut acc = T::zero(); for chunk in middle.chunks_exact(4).remainder() { acc = acc.add(count_trailing_chunk(*chunk)); } inv_count += acc.sum_bytes(); // Take care of unaligned bytes at the end. inv_count += end.iter().filter(|x| is_trailing_byte(x)).count(); text.len() - inv_count } #[inline(always)] fn is_leading_byte(byte: &u8) -> bool { (byte & 0xC0) != 0x80 } #[inline(always)] fn is_trailing_byte(byte: &u8) -> bool { (byte & 0xC0) == 0x80 } #[inline(always)] fn count_trailing_chunk(val: T) -> T { val.bitand(T::splat(0xc0)).cmp_eq_byte(0x80) } //============================================================= #[cfg(test)] mod tests { use super::*; // 124 bytes, 100 chars, 4 lines const TEXT_LINES: &str = "Hello there! How're you doing?\nIt's \ a fine day, isn't it?\nAren't you glad \ we're alive?\nこんにちは、みんなさん!"; #[test] fn count_01() { let text = "Hello せかい! Hello せかい! Hello せかい! Hello せかい! 
Hello せかい!"; assert_eq!(54, count(text)); } #[test] fn count_02() { assert_eq!(100, count(TEXT_LINES)); } #[test] fn from_byte_idx_01() { let text = "Hello せかい!"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(1, from_byte_idx(text, 1)); assert_eq!(6, from_byte_idx(text, 6)); assert_eq!(6, from_byte_idx(text, 7)); assert_eq!(6, from_byte_idx(text, 8)); assert_eq!(7, from_byte_idx(text, 9)); assert_eq!(7, from_byte_idx(text, 10)); assert_eq!(7, from_byte_idx(text, 11)); assert_eq!(8, from_byte_idx(text, 12)); assert_eq!(8, from_byte_idx(text, 13)); assert_eq!(8, from_byte_idx(text, 14)); assert_eq!(9, from_byte_idx(text, 15)); assert_eq!(10, from_byte_idx(text, 16)); assert_eq!(10, from_byte_idx(text, 17)); assert_eq!(10, from_byte_idx(text, 18)); assert_eq!(10, from_byte_idx(text, 19)); } #[test] fn from_byte_idx_02() { let text = ""; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(0, from_byte_idx(text, 1)); let text = "h"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(1, from_byte_idx(text, 1)); assert_eq!(1, from_byte_idx(text, 2)); let text = "hi"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(1, from_byte_idx(text, 1)); assert_eq!(2, from_byte_idx(text, 2)); assert_eq!(2, from_byte_idx(text, 3)); } #[test] fn from_byte_idx_03() { let text = "せかい"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(0, from_byte_idx(text, 1)); assert_eq!(0, from_byte_idx(text, 2)); assert_eq!(1, from_byte_idx(text, 3)); assert_eq!(1, from_byte_idx(text, 4)); assert_eq!(1, from_byte_idx(text, 5)); assert_eq!(2, from_byte_idx(text, 6)); assert_eq!(2, from_byte_idx(text, 7)); assert_eq!(2, from_byte_idx(text, 8)); assert_eq!(3, from_byte_idx(text, 9)); assert_eq!(3, from_byte_idx(text, 10)); assert_eq!(3, from_byte_idx(text, 11)); assert_eq!(3, from_byte_idx(text, 12)); } #[test] fn from_byte_idx_04() { // Ascii range for i in 0..88 { assert_eq!(i, from_byte_idx(TEXT_LINES, i)); } // Hiragana characters for i in 88..125 { assert_eq!(88 + ((i - 88) / 3), 
from_byte_idx(TEXT_LINES, i)); } // Past the end for i in 125..130 { assert_eq!(100, from_byte_idx(TEXT_LINES, i)); } } #[test] fn to_byte_idx_01() { let text = "Hello せかい!"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(1, to_byte_idx(text, 1)); assert_eq!(2, to_byte_idx(text, 2)); assert_eq!(5, to_byte_idx(text, 5)); assert_eq!(6, to_byte_idx(text, 6)); assert_eq!(12, to_byte_idx(text, 8)); assert_eq!(15, to_byte_idx(text, 9)); assert_eq!(16, to_byte_idx(text, 10)); } #[test] fn to_byte_idx_02() { let text = "せかい"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(3, to_byte_idx(text, 1)); assert_eq!(6, to_byte_idx(text, 2)); assert_eq!(9, to_byte_idx(text, 3)); } #[test] fn to_byte_idx_03() { let text = "Hello world!"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(1, to_byte_idx(text, 1)); assert_eq!(8, to_byte_idx(text, 8)); assert_eq!(11, to_byte_idx(text, 11)); assert_eq!(12, to_byte_idx(text, 12)); } #[test] fn to_byte_idx_04() { let text = "Hello world! Hello せかい! Hello world! Hello せかい! \ Hello world! Hello せかい! Hello world! Hello せかい! \ Hello world! Hello せかい! Hello world! Hello せかい! \ Hello world! Hello せかい! Hello world! Hello せかい!"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(30, to_byte_idx(text, 24)); assert_eq!(60, to_byte_idx(text, 48)); assert_eq!(90, to_byte_idx(text, 72)); assert_eq!(115, to_byte_idx(text, 93)); assert_eq!(120, to_byte_idx(text, 96)); assert_eq!(150, to_byte_idx(text, 120)); assert_eq!(180, to_byte_idx(text, 144)); assert_eq!(210, to_byte_idx(text, 168)); assert_eq!(239, to_byte_idx(text, 191)); } #[test] fn to_byte_idx_05() { // Ascii range for i in 0..88 { assert_eq!(i, to_byte_idx(TEXT_LINES, i)); } // Hiragana characters for i in 88..100 { assert_eq!(88 + ((i - 88) * 3), to_byte_idx(TEXT_LINES, i)); } // Past the end for i in 100..110 { assert_eq!(124, to_byte_idx(TEXT_LINES, i)); } } } str_indices-0.4.3/src/lib.rs000064400000000000000000000102001046102023000140320ustar 00000000000000#![no_std] //! 
Count and convert between different indexing schemes on utf8 string //! slices. //! //! This crate is organized into modules by indexing scheme. Each module //! contains functions for counting relevant metrics for that scheme as //! well as functions for converting to/from byte indices. //! //! None of the functions in this crate panic: all inputs have a defined //! output. mod byte_chunk; pub mod chars; pub mod lines; pub mod lines_crlf; pub mod lines_lf; pub mod utf16; /// Returns the alignment difference between the start of `bytes` and the /// type `T`. /// /// Or put differently: returns how many bytes into `bytes` you need to walk /// to reach the alignment of `T` in memory. /// /// Will return 0 if already aligned at the start, and will return the length /// of `bytes` if alignment is beyond the end of `bytes`. #[inline(always)] fn alignment_diff(bytes: &[u8]) -> usize { let alignment = core::mem::align_of::(); let ptr = bytes.as_ptr() as usize; (alignment - ((ptr - 1) & (alignment - 1)) - 1).min(bytes.len()) } /// Utility function used in some of the lines modules. #[inline(always)] fn is_not_crlf_middle(byte_idx: usize, text: &[u8]) -> bool { byte_idx == 0 || byte_idx >= text.len() || (text[byte_idx - 1] != 0x0D) || (text[byte_idx] != 0x0A) } //====================================================================== #[cfg(test)] mod tests { use super::*; // 124 bytes, 100 chars, 4 lines const TEXT_LINES: &str = "Hello there! 
How're you doing?\nIt's \ a fine day, isn't it?\nAren't you glad \ we're alive?\nこんにちは、みんなさん!"; fn char_to_line_idx(text: &str, idx: usize) -> usize { lines::from_byte_idx(text, chars::to_byte_idx(text, idx)) } fn line_to_char_idx(text: &str, idx: usize) -> usize { chars::from_byte_idx(text, lines::to_byte_idx(text, idx)) } #[test] fn char_to_line_idx_01() { let text = "Hello せ\nか\nい!"; assert_eq!(0, char_to_line_idx(text, 0)); assert_eq!(0, char_to_line_idx(text, 7)); assert_eq!(1, char_to_line_idx(text, 8)); assert_eq!(1, char_to_line_idx(text, 9)); assert_eq!(2, char_to_line_idx(text, 10)); } #[test] fn char_to_line_idx_02() { // Line 0 for i in 0..32 { assert_eq!(0, char_to_line_idx(TEXT_LINES, i)); } // Line 1 for i in 32..59 { assert_eq!(1, char_to_line_idx(TEXT_LINES, i)); } // Line 2 for i in 59..88 { assert_eq!(2, char_to_line_idx(TEXT_LINES, i)); } // Line 3 for i in 88..100 { assert_eq!(3, char_to_line_idx(TEXT_LINES, i)); } // Past the end for i in 100..110 { assert_eq!(3, char_to_line_idx(TEXT_LINES, i)); } } #[test] fn line_to_char_idx_01() { let text = "Hello せ\nか\nい!"; assert_eq!(0, line_to_char_idx(text, 0)); assert_eq!(8, line_to_char_idx(text, 1)); assert_eq!(10, line_to_char_idx(text, 2)); } #[test] fn line_to_char_idx_02() { assert_eq!(0, line_to_char_idx(TEXT_LINES, 0)); assert_eq!(32, line_to_char_idx(TEXT_LINES, 1)); assert_eq!(59, line_to_char_idx(TEXT_LINES, 2)); assert_eq!(88, line_to_char_idx(TEXT_LINES, 3)); // Past end assert_eq!(100, line_to_char_idx(TEXT_LINES, 4)); assert_eq!(100, line_to_char_idx(TEXT_LINES, 5)); assert_eq!(100, line_to_char_idx(TEXT_LINES, 6)); } #[test] fn line_char_round_trip() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(6, line_to_char_idx(text, char_to_line_idx(text, 6))); assert_eq!(2, char_to_line_idx(text, line_to_char_idx(text, 2))); assert_eq!(0, line_to_char_idx(text, char_to_line_idx(text, 0))); assert_eq!(0, char_to_line_idx(text, line_to_char_idx(text, 0))); assert_eq!(21, 
line_to_char_idx(text, char_to_line_idx(text, 21))); assert_eq!(5, char_to_line_idx(text, line_to_char_idx(text, 5))); } } str_indices-0.4.3/src/lines.rs000064400000000000000000000324761046102023000144210ustar 00000000000000//! Index by lines (all Unicode line breaks). //! //! This module recognizes all line breaks defined in //! [Unicode Annex #14](https://www.unicode.org/reports/tr14/): //! //! - `U+000A` — LF (Line Feed) //! - `U+000B` — VT (Vertical Tab) //! - `U+000C` — FF (Form Feed) //! - `U+000D` — CR (Carriage Return) //! - `U+0085` — NEL (Next Line) //! - `U+2028` — Line Separator //! - `U+2029` — Paragraph Separator //! - `U+000D` `U+000A` — CRLF (Carriage Return + Line Feed) use crate::alignment_diff; use crate::byte_chunk::{ByteChunk, Chunk}; /// Counts the line breaks in a string slice. /// /// Runs in O(N) time. #[inline] pub fn count_breaks(text: &str) -> usize { count_breaks_impl::(text.as_bytes()) } /// Converts from byte-index to line-index in a string slice. /// /// Line break characters are considered to be a part of the line they /// end. And a string that ends with a line break is considered to have /// a final empty line. So this function is equivalent to counting the /// line breaks before the specified byte. /// /// Any past-the-end index will return the last line index. /// /// Runs in O(N) time. #[inline] pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize { let mut i = byte_idx.min(text.len()); while !text.is_char_boundary(i) { i -= 1; } let nl_count = count_breaks_impl::(&text.as_bytes()[..i]); if crate::is_not_crlf_middle(i, text.as_bytes()) { nl_count } else { nl_count - 1 } } /// Converts from line-index to byte-index in a string slice. /// /// Returns the byte index of the start of the specified line. Line 0 is /// the start of the string, and subsequent lines start immediately /// *after* each line break character. /// /// Any past-the-end index will return the one-past-the-end byte index. /// /// Runs in O(N) time. 
#[inline] pub fn to_byte_idx(text: &str, line_idx: usize) -> usize { to_byte_idx_impl::(text, line_idx) } //------------------------------------------------------------- #[inline(always)] fn to_byte_idx_impl(text: &str, line_idx: usize) -> usize { let mut bytes = text.as_bytes(); let mut line_break_count = 0; // Handle unaligned bytes at the start. let aligned_idx = alignment_diff::(bytes); if aligned_idx > 0 { let result = count_breaks_up_to(bytes, aligned_idx, line_idx); line_break_count += result.0; bytes = &bytes[result.1..]; } // Count line breaks in big chunks. if alignment_diff::(bytes) == 0 { while bytes.len() >= T::SIZE { // Unsafe because the called function depends on correct alignment. let tmp = unsafe { count_breaks_in_chunk_from_ptr::(bytes) }.sum_bytes(); if tmp + line_break_count >= line_idx { break; } line_break_count += tmp; bytes = &bytes[T::SIZE..]; } } // Handle unaligned bytes at the end. let result = count_breaks_up_to(bytes, bytes.len(), line_idx - line_break_count); bytes = &bytes[result.1..]; // Finish up let mut byte_idx = text.len() - bytes.len(); while !text.is_char_boundary(byte_idx) { byte_idx += 1; } byte_idx } /// Counts the line breaks in a utf8 encoded string. /// /// The following unicode sequences are considered newlines by this function: /// - u{000A} (Line Feed) /// - u{000B} (Vertical Tab) /// - u{000C} (Form Feed) /// - u{000D} (Carriage Return) /// - u{000D}u{000A} (Carriage Return + Line Feed) /// - u{0085} (Next Line) /// - u{2028} (Line Separator) /// - u{2029} (Paragraph Separator) #[inline(always)] fn count_breaks_impl(text: &[u8]) -> usize { let mut bytes = text; let mut count = 0; // Handle unaligned bytes at the start. let aligned_idx = alignment_diff::(bytes); if aligned_idx > 0 { let result = count_breaks_up_to(bytes, aligned_idx, bytes.len()); count += result.0; bytes = &bytes[result.1..]; } // Count line breaks in big chunks. 
let mut i = 0; let mut acc = T::zero(); while bytes.len() >= T::SIZE { // Unsafe because the called function depends on correct alignment. acc = acc.add(unsafe { count_breaks_in_chunk_from_ptr::(bytes) }); i += 1; if i == T::MAX_ACC { i = 0; count += acc.sum_bytes(); acc = T::zero(); } bytes = &bytes[T::SIZE..]; } count += acc.sum_bytes(); // Handle unaligned bytes at the end. count += count_breaks_up_to(bytes, bytes.len(), bytes.len()).0; count } /// Used internally in the line-break counting functions. /// /// Counts line breaks a byte at a time up to a maximum number of bytes and /// line breaks, and returns the counted lines and how many bytes were processed. #[inline(always)] #[allow(clippy::if_same_then_else)] fn count_breaks_up_to(bytes: &[u8], max_bytes: usize, max_breaks: usize) -> (usize, usize) { let mut ptr = 0; let mut count = 0; while ptr < max_bytes && count < max_breaks { let byte = bytes[ptr]; // Handle u{000A}, u{000B}, u{000C}, and u{000D} if (0x0A..=0x0D).contains(&byte) { count += 1; // Check for CRLF and and subtract 1 if it is, // since it will be caught in the next iteration // with the LF. if byte == 0x0D && (ptr + 1) < bytes.len() && bytes[ptr + 1] == 0x0A { count -= 1; } } // Handle u{0085} else if byte == 0xC2 && (ptr + 1) < bytes.len() && bytes[ptr + 1] == 0x85 { count += 1; } // Handle u{2028} and u{2029} else if byte == 0xE2 && (ptr + 2) < bytes.len() && bytes[ptr + 1] == 0x80 && (bytes[ptr + 2] >> 1) == 0x54 { count += 1; } ptr += 1; } (count, ptr) } /// Used internally in the line-break counting functions. /// /// The start of `bytes` MUST be aligned as type T, and `bytes` MUST be at /// least as large (in bytes) as T. If these invariants are not met, bad /// things could potentially happen. Hence why this function is unsafe. #[inline(always)] unsafe fn count_breaks_in_chunk_from_ptr(bytes: &[u8]) -> T { let c = { // The only unsafe bits of the function are in this block. 
debug_assert_eq!(bytes.align_to::().0.len(), 0); debug_assert!(bytes.len() >= T::SIZE); // This unsafe cast is for performance reasons: going through e.g. // `align_to()` results in a significant drop in performance. *(bytes.as_ptr() as *const T) }; let end_i = T::SIZE; let mut acc = T::zero(); // Calculate the flags we're going to be working with. let nl_1_flags = c.cmp_eq_byte(0xC2); let sp_1_flags = c.cmp_eq_byte(0xE2); let all_flags = c.bytes_between_127(0x09, 0x0E); let cr_flags = c.cmp_eq_byte(0x0D); // Next Line: u{0085} if !nl_1_flags.is_zero() { let nl_2_flags = c.cmp_eq_byte(0x85).shift_back_lex(1); let flags = nl_1_flags.bitand(nl_2_flags); acc = acc.add(flags); // Handle ending boundary if bytes.len() > end_i && bytes[end_i - 1] == 0xC2 && bytes[end_i] == 0x85 { acc = acc.inc_nth_from_end_lex_byte(0); } } // Line Separator: u{2028} // Paragraph Separator: u{2029} if !sp_1_flags.is_zero() { let sp_2_flags = c.cmp_eq_byte(0x80).shift_back_lex(1).bitand(sp_1_flags); if !sp_2_flags.is_zero() { let sp_3_flags = c .shr(1) .bitand(T::splat(!0x80)) .cmp_eq_byte(0x54) .shift_back_lex(2); let sp_flags = sp_2_flags.bitand(sp_3_flags); acc = acc.add(sp_flags); } // Handle ending boundary if bytes.len() > end_i && bytes[end_i - 2] == 0xE2 && bytes[end_i - 1] == 0x80 && (bytes[end_i] >> 1) == 0x54 { acc = acc.inc_nth_from_end_lex_byte(1); } else if bytes.len() > (end_i + 1) && bytes[end_i - 1] == 0xE2 && bytes[end_i] == 0x80 && (bytes[end_i + 1] >> 1) == 0x54 { acc = acc.inc_nth_from_end_lex_byte(0); } } // Line Feed: u{000A} // Vertical Tab: u{000B} // Form Feed: u{000C} // Carriage Return: u{000D} // Carriage Return + Line Feed: u{000D}u{000A} acc = acc.add(all_flags); if !cr_flags.is_zero() { // Handle CRLF let lf_flags = c.cmp_eq_byte(0x0A); let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1)); acc = acc.sub(crlf_flags); if bytes.len() > end_i && bytes[end_i - 1] == 0x0D && bytes[end_i] == 0x0A { acc = acc.dec_last_lex_byte(); } } acc } 
//============================================================= #[cfg(test)] mod tests { use super::*; // 124 bytes, 100 chars, 4 lines const TEXT_LINES: &str = "Hello there! How're you doing?\nIt's \ a fine day, isn't it?\nAren't you glad \ we're alive?\nこんにちは、みんなさん!"; #[test] fn count_breaks_01() { let text = "\u{000A}Hello\u{000D}\u{000A}\u{000D}せ\u{000B}か\u{000C}い\u{0085}. \ There\u{2028}is something.\u{2029}"; assert_eq!(48, text.len()); assert_eq!(8, count_breaks(text)); } #[test] fn from_byte_idx_01() { let text = "Here\nare\nsome\nwords"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(0, from_byte_idx(text, 4)); assert_eq!(1, from_byte_idx(text, 5)); assert_eq!(1, from_byte_idx(text, 8)); assert_eq!(2, from_byte_idx(text, 9)); assert_eq!(2, from_byte_idx(text, 13)); assert_eq!(3, from_byte_idx(text, 14)); assert_eq!(3, from_byte_idx(text, 19)); } #[test] fn from_byte_idx_02() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(1, from_byte_idx(text, 1)); assert_eq!(1, from_byte_idx(text, 5)); assert_eq!(2, from_byte_idx(text, 6)); assert_eq!(2, from_byte_idx(text, 9)); assert_eq!(3, from_byte_idx(text, 10)); assert_eq!(3, from_byte_idx(text, 14)); assert_eq!(4, from_byte_idx(text, 15)); assert_eq!(4, from_byte_idx(text, 20)); assert_eq!(5, from_byte_idx(text, 21)); } #[test] fn from_byte_idx_03() { let text = "Here\r\nare\r\nsome\r\nwords"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(0, from_byte_idx(text, 4)); assert_eq!(0, from_byte_idx(text, 5)); assert_eq!(1, from_byte_idx(text, 6)); assert_eq!(1, from_byte_idx(text, 9)); assert_eq!(1, from_byte_idx(text, 10)); assert_eq!(2, from_byte_idx(text, 11)); assert_eq!(2, from_byte_idx(text, 15)); assert_eq!(2, from_byte_idx(text, 16)); assert_eq!(3, from_byte_idx(text, 17)); } #[test] fn from_byte_idx_04() { // Line 0 for i in 0..32 { assert_eq!(0, from_byte_idx(TEXT_LINES, i)); } // Line 1 for i in 32..59 { assert_eq!(1, from_byte_idx(TEXT_LINES, i)); } // 
Line 2 for i in 59..88 { assert_eq!(2, from_byte_idx(TEXT_LINES, i)); } // Line 3 for i in 88..125 { assert_eq!(3, from_byte_idx(TEXT_LINES, i)); } // Past the end for i in 125..130 { assert_eq!(3, from_byte_idx(TEXT_LINES, i)); } } #[test] fn to_byte_idx_01() { let text = "Here\r\nare\r\nsome\r\nwords"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(6, to_byte_idx(text, 1)); assert_eq!(11, to_byte_idx(text, 2)); assert_eq!(17, to_byte_idx(text, 3)); } #[test] fn to_byte_idx_02() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(1, to_byte_idx(text, 1)); assert_eq!(6, to_byte_idx(text, 2)); assert_eq!(10, to_byte_idx(text, 3)); assert_eq!(15, to_byte_idx(text, 4)); assert_eq!(21, to_byte_idx(text, 5)); } #[test] fn to_byte_idx_03() { assert_eq!(0, to_byte_idx(TEXT_LINES, 0)); assert_eq!(32, to_byte_idx(TEXT_LINES, 1)); assert_eq!(59, to_byte_idx(TEXT_LINES, 2)); assert_eq!(88, to_byte_idx(TEXT_LINES, 3)); // Past end assert_eq!(124, to_byte_idx(TEXT_LINES, 4)); assert_eq!(124, to_byte_idx(TEXT_LINES, 5)); assert_eq!(124, to_byte_idx(TEXT_LINES, 6)); } #[test] fn line_byte_round_trip() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(6, to_byte_idx(text, from_byte_idx(text, 6))); assert_eq!(2, from_byte_idx(text, to_byte_idx(text, 2))); assert_eq!(0, to_byte_idx(text, from_byte_idx(text, 0))); assert_eq!(0, from_byte_idx(text, to_byte_idx(text, 0))); assert_eq!(21, to_byte_idx(text, from_byte_idx(text, 21))); assert_eq!(5, from_byte_idx(text, to_byte_idx(text, 5))); } } str_indices-0.4.3/src/lines_crlf.rs000064400000000000000000000260701046102023000154200ustar 00000000000000//! Index by lines (carriage return and line feed). //! //! This module recognizes the following as line breaks: //! //! - `U+000A` — LF (Line Feed) //! - `U+000D` — CR (Carriage Return) //! - `U+000D` `U+000A` — CRLF (Carriage Return + Line Feed) //! //! (Note: if you only want to recognize LF and CRLF, without //! 
recognizing CR individually, see the [`lines_lf`](crate::lines_lf) module.) use crate::byte_chunk::{ByteChunk, Chunk}; /// Counts the line breaks in a string slice. /// /// Runs in O(N) time. #[inline] pub fn count_breaks(text: &str) -> usize { count_breaks_impl::(text.as_bytes()) } /// Converts from byte-index to line-index in a string slice. /// /// Line break characters are considered to be a part of the line they /// end. And a string that ends with a line break is considered to have /// a final empty line. So this function is equivalent to counting the /// line breaks before the specified byte. /// /// Any past-the-end index will return the last line index. /// /// Runs in O(N) time. #[inline] pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize { let i = byte_idx.min(text.len()); let nl_count = count_breaks_impl::(&text.as_bytes()[..i]); if crate::is_not_crlf_middle(i, text.as_bytes()) { nl_count } else { nl_count - 1 } } /// Converts from line-index to byte-index in a string slice. /// /// Returns the byte index of the start of the specified line. Line 0 is /// the start of the string, and subsequent lines start immediately /// *after* each line break character. /// /// Any past-the-end index will return the one-past-the-end byte index. /// /// Runs in O(N) time. #[inline] pub fn to_byte_idx(text: &str, line_idx: usize) -> usize { to_byte_idx_impl::(text.as_bytes(), line_idx) } //------------------------------------------------------------- #[inline(always)] fn to_byte_idx_impl(text: &[u8], line_idx: usize) -> usize { // Get `middle` so we can do more efficient chunk-based counting. // We can't use this to get `end`, however, because the start index of // `end` actually depends on the accumulating line counts during the // counting process. let (start, middle, _) = unsafe { text.align_to::() }; let mut byte_count = 0; let mut break_count = 0; // Take care of any unaligned bytes at the beginning. 
for byte in start.iter() { if break_count == line_idx { break; } break_count += (*byte == 0x0A || (*byte == 0x0D && text.get(byte_count + 1) != Some(&0x0A))) as usize; byte_count += 1; } // Process chunks in the fast path. let mut chunks = middle; let mut max_round_len = (line_idx - break_count) / T::MAX_ACC; while max_round_len > 0 && !chunks.is_empty() { // Choose the largest number of chunks we can do this round // that will neither overflow `max_acc` nor blast past the // remaining line breaks we're looking for. let round_len = T::MAX_ACC.min(max_round_len).min(chunks.len()); max_round_len -= round_len; let round = &chunks[..round_len]; chunks = &chunks[round_len..]; // Process the chunks in this round. let mut acc = T::zero(); for chunk in round.iter() { let lf_flags = chunk.cmp_eq_byte(0x0A); let cr_flags = chunk.cmp_eq_byte(0x0D); let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1)); acc = acc.add(lf_flags).add(cr_flags.sub(crlf_flags)); } break_count += acc.sum_bytes(); // Handle CRLFs at chunk boundaries in this round. let mut i = byte_count; while i < (byte_count + T::SIZE * round_len) { i += T::SIZE; break_count -= (text[i - 1] == 0x0D && text.get(i) == Some(&0x0A)) as usize; } byte_count += T::SIZE * round_len; } // Process chunks in the slow path. for chunk in chunks.iter() { let breaks = { let lf_flags = chunk.cmp_eq_byte(0x0A); let cr_flags = chunk.cmp_eq_byte(0x0D); let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1)); lf_flags.add(cr_flags.sub(crlf_flags)).sum_bytes() }; let boundary_crlf = { let i = byte_count + T::SIZE; (text[i - 1] == 0x0D && text.get(i) == Some(&0x0A)) as usize }; let new_break_count = break_count + breaks - boundary_crlf; if new_break_count >= line_idx { break; } break_count = new_break_count; byte_count += T::SIZE; } // Take care of any unaligned bytes at the end. 
let end = &text[byte_count..]; for byte in end.iter() { if break_count == line_idx { break; } break_count += (*byte == 0x0A || (*byte == 0x0D && text.get(byte_count + 1) != Some(&0x0A))) as usize; byte_count += 1; } // Finish up byte_count } /// Counts the line breaks in a utf8 encoded string. /// /// The following unicode sequences are considered newlines by this function: /// - u{000A} (Line Feed) /// - u{000D} (Carriage Return) #[inline(always)] fn count_breaks_impl(text: &[u8]) -> usize { // Get `middle` so we can do more efficient chunk-based counting. let (start, middle, end) = unsafe { text.align_to::() }; let mut count = 0; // Take care of unaligned bytes at the beginning. let mut last_was_cr = false; for byte in start.iter().copied() { let is_lf = byte == 0x0A; let is_cr = byte == 0x0D; count += (is_cr | (is_lf & !last_was_cr)) as usize; last_was_cr = is_cr; } // Take care of the middle bytes in big chunks. for chunks in middle.chunks(T::MAX_ACC) { let mut acc = T::zero(); for chunk in chunks.iter() { let lf_flags = chunk.cmp_eq_byte(0x0A); let cr_flags = chunk.cmp_eq_byte(0x0D); let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1)); acc = acc.add(lf_flags).add(cr_flags.sub(crlf_flags)); } count += acc.sum_bytes(); } // Check chunk boundaries for CRLF. let mut i = start.len(); while i < (text.len() - end.len()) { if text[i] == 0x0A { count -= (text.get(i.saturating_sub(1)) == Some(&0x0D)) as usize; } i += T::SIZE; } // Take care of unaligned bytes at the end. let mut last_was_cr = text.get((text.len() - end.len()).saturating_sub(1)) == Some(&0x0D); for byte in end.iter().copied() { let is_lf = byte == 0x0A; let is_cr = byte == 0x0D; count += (is_cr | (is_lf & !last_was_cr)) as usize; last_was_cr = is_cr; } count } //============================================================= #[cfg(test)] mod tests { use super::*; // 124 bytes, 100 chars, 4 lines const TEXT_LINES: &str = "Hello there! 
How're you doing?\nIt's \ a fine day, isn't it?\nAren't you glad \ we're alive?\nこんにちは、みんなさん!"; #[test] fn count_breaks_01() { let text = "\u{000A}Hello\u{000D}\u{000A}せ\u{000B}か\u{000C}い\u{0085}. \ There\u{000A}is something.\u{2029}"; assert_eq!(45, text.len()); assert_eq!(3, count_breaks(text)); } #[test] fn from_byte_idx_01() { let text = "Here\nare\nsome\nwords"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(0, from_byte_idx(text, 4)); assert_eq!(1, from_byte_idx(text, 5)); assert_eq!(1, from_byte_idx(text, 8)); assert_eq!(2, from_byte_idx(text, 9)); assert_eq!(2, from_byte_idx(text, 13)); assert_eq!(3, from_byte_idx(text, 14)); assert_eq!(3, from_byte_idx(text, 19)); } #[test] fn from_byte_idx_02() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(1, from_byte_idx(text, 1)); assert_eq!(1, from_byte_idx(text, 5)); assert_eq!(2, from_byte_idx(text, 6)); assert_eq!(2, from_byte_idx(text, 9)); assert_eq!(3, from_byte_idx(text, 10)); assert_eq!(3, from_byte_idx(text, 14)); assert_eq!(4, from_byte_idx(text, 15)); assert_eq!(4, from_byte_idx(text, 20)); assert_eq!(5, from_byte_idx(text, 21)); } #[test] fn from_byte_idx_03() { let text = "Here\r\nare\r\nsome\r\nwords"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(0, from_byte_idx(text, 4)); assert_eq!(0, from_byte_idx(text, 5)); assert_eq!(1, from_byte_idx(text, 6)); assert_eq!(1, from_byte_idx(text, 9)); assert_eq!(1, from_byte_idx(text, 10)); assert_eq!(2, from_byte_idx(text, 11)); assert_eq!(2, from_byte_idx(text, 15)); assert_eq!(2, from_byte_idx(text, 16)); assert_eq!(3, from_byte_idx(text, 17)); } #[test] fn from_byte_idx_04() { // Line 0 for i in 0..32 { assert_eq!(0, from_byte_idx(TEXT_LINES, i)); } // Line 1 for i in 32..59 { assert_eq!(1, from_byte_idx(TEXT_LINES, i)); } // Line 2 for i in 59..88 { assert_eq!(2, from_byte_idx(TEXT_LINES, i)); } // Line 3 for i in 88..125 { assert_eq!(3, from_byte_idx(TEXT_LINES, i)); } // Past the end for i in 125..130 { 
assert_eq!(3, from_byte_idx(TEXT_LINES, i)); } } #[test] fn to_byte_idx_01() { let text = "Here\r\nare\r\nsome\r\nwords"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(6, to_byte_idx(text, 1)); assert_eq!(11, to_byte_idx(text, 2)); assert_eq!(17, to_byte_idx(text, 3)); } #[test] fn to_byte_idx_02() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(1, to_byte_idx(text, 1)); assert_eq!(6, to_byte_idx(text, 2)); assert_eq!(10, to_byte_idx(text, 3)); assert_eq!(15, to_byte_idx(text, 4)); assert_eq!(21, to_byte_idx(text, 5)); } #[test] fn to_byte_idx_03() { assert_eq!(0, to_byte_idx(TEXT_LINES, 0)); assert_eq!(32, to_byte_idx(TEXT_LINES, 1)); assert_eq!(59, to_byte_idx(TEXT_LINES, 2)); assert_eq!(88, to_byte_idx(TEXT_LINES, 3)); // Past end assert_eq!(124, to_byte_idx(TEXT_LINES, 4)); assert_eq!(124, to_byte_idx(TEXT_LINES, 5)); assert_eq!(124, to_byte_idx(TEXT_LINES, 6)); } #[test] fn line_byte_round_trip() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(6, to_byte_idx(text, from_byte_idx(text, 6))); assert_eq!(2, from_byte_idx(text, to_byte_idx(text, 2))); assert_eq!(0, to_byte_idx(text, from_byte_idx(text, 0))); assert_eq!(0, from_byte_idx(text, to_byte_idx(text, 0))); assert_eq!(21, to_byte_idx(text, from_byte_idx(text, 21))); assert_eq!(5, from_byte_idx(text, to_byte_idx(text, 5))); } } str_indices-0.4.3/src/lines_lf.rs000064400000000000000000000224331046102023000150720ustar 00000000000000//! Index by lines (line feed only). //! //! This module recognizes the following as line breaks: //! //! - `U+000A` — LF (Line Feed) //! - `U+000D` `U+000A` — CRLF (Carriage Return + Line Feed) //! — by coincidence due to ignoring CR. use crate::byte_chunk::{ByteChunk, Chunk}; /// Counts the line breaks in a string slice. /// /// Runs in O(N) time. #[inline] pub fn count_breaks(text: &str) -> usize { count_breaks_impl::(text.as_bytes()) } /// Converts from byte-index to line-index in a string slice. 
/// /// Line break characters are considered to be a part of the line they /// end. And a string that ends with a line break is considered to have /// a final empty line. So this function is equivalent to counting the /// line breaks before the specified byte. /// /// Any past-the-end index will return the last line index. /// /// Runs in O(N) time. #[inline] pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize { let i = byte_idx.min(text.len()); count_breaks_impl::(&text.as_bytes()[..i]) } /// Converts from line-index to byte-index in a string slice. /// /// Returns the byte index of the start of the specified line. Line 0 is /// the start of the string, and subsequent lines start immediately /// *after* each line break character. /// /// Any past-the-end index will return the one-past-the-end byte index. /// /// Runs in O(N) time. #[inline] pub fn to_byte_idx(text: &str, line_idx: usize) -> usize { to_byte_idx_impl::(text.as_bytes(), line_idx) } //------------------------------------------------------------- #[inline(always)] fn to_byte_idx_impl(text: &[u8], line_idx: usize) -> usize { let mut byte_count = 0; let mut lf_count = 0; // Get `middle` so we can do more efficient chunk-based counting. // We can't use this to get `end`, however, because the start index of // `end` actually depends on the accumulating line counts during the // counting process. let (start, middle, _) = unsafe { text.align_to::() }; // Take care of any unaligned bytes at the beginning. 
for byte in start.iter() { if lf_count == line_idx { return byte_count; } if *byte == 0x0A { lf_count += 1; } byte_count += 1; } // Process the chunks 4 at a time let mut chunk_count = 0; for chunks in middle.chunks_exact(4) { let val1 = chunks[0].cmp_eq_byte(0x0A); let val2 = chunks[1].cmp_eq_byte(0x0A); let val3 = chunks[2].cmp_eq_byte(0x0A); let val4 = chunks[3].cmp_eq_byte(0x0A); let new_lf_count = lf_count + val1.add(val2).add(val3.add(val4)).sum_bytes(); if new_lf_count >= line_idx { break; } lf_count = new_lf_count; byte_count += T::SIZE * 4; chunk_count += 4; } // Process the rest of the chunks for chunk in middle[chunk_count..].iter() { let new_lf_count = lf_count + chunk.cmp_eq_byte(0x0A).sum_bytes(); if new_lf_count >= line_idx { break; } lf_count = new_lf_count; byte_count += T::SIZE; } // Take care of any unaligned bytes at the end. for byte in &text[byte_count..] { if lf_count == line_idx { break; } lf_count += (*byte == 0x0A) as usize; byte_count += 1; } byte_count } /// Counts the line breaks in a utf8 encoded string. /// /// The following unicode sequences are considered newlines by this function: /// - u{000A} (Line Feed) #[inline(always)] fn count_breaks_impl(text: &[u8]) -> usize { if text.len() < T::SIZE { // Bypass the more complex routine for short strings, where the // complexity hurts performance. text.iter().map(|byte| (*byte == 0x0A) as usize).sum() } else { // Get `middle` so we can do more efficient chunk-based counting. let (start, middle, end) = unsafe { text.align_to::() }; let mut count = 0; // Take care of unaligned bytes at the beginning. count += start .iter() .map(|byte| (*byte == 0x0A) as usize) .sum::(); // Take care of the middle bytes in big chunks. Loop unrolled. 
for chunks in middle.chunks_exact(4) { let val1 = chunks[0].cmp_eq_byte(0x0A); let val2 = chunks[1].cmp_eq_byte(0x0A); let val3 = chunks[2].cmp_eq_byte(0x0A); let val4 = chunks[3].cmp_eq_byte(0x0A); count += val1.add(val2).add(val3.add(val4)).sum_bytes(); } // Chunk remainder let mut acc = T::zero(); for chunk in middle.chunks_exact(4).remainder() { acc = acc.add(chunk.cmp_eq_byte(0x0A)); } count += acc.sum_bytes(); // Take care of unaligned bytes at the end. count + end .iter() .map(|byte| (*byte == 0x0A) as usize) .sum::() } } //============================================================= #[cfg(test)] mod tests { use super::*; // 124 bytes, 100 chars, 4 lines const TEXT_LINES: &str = "Hello there! How're you doing?\nIt's \ a fine day, isn't it?\nAren't you glad \ we're alive?\nこんにちは、みんなさん!"; #[test] fn count_breaks_01() { let text = "\nHello\u{000D}\nせ\u{000B}か\u{000C}い\u{0085}. \ There\nis something.\u{2029}"; assert_eq!(45, text.len()); assert_eq!(3, count_breaks(text)); } #[test] fn from_byte_idx_01() { let text = "Here\nare\nsome\nwords"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(0, from_byte_idx(text, 4)); assert_eq!(1, from_byte_idx(text, 5)); assert_eq!(1, from_byte_idx(text, 8)); assert_eq!(2, from_byte_idx(text, 9)); assert_eq!(2, from_byte_idx(text, 13)); assert_eq!(3, from_byte_idx(text, 14)); assert_eq!(3, from_byte_idx(text, 19)); } #[test] fn from_byte_idx_02() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(1, from_byte_idx(text, 1)); assert_eq!(1, from_byte_idx(text, 5)); assert_eq!(2, from_byte_idx(text, 6)); assert_eq!(2, from_byte_idx(text, 9)); assert_eq!(3, from_byte_idx(text, 10)); assert_eq!(3, from_byte_idx(text, 14)); assert_eq!(4, from_byte_idx(text, 15)); assert_eq!(4, from_byte_idx(text, 20)); assert_eq!(5, from_byte_idx(text, 21)); } #[test] fn from_byte_idx_03() { let text = "Here\r\nare\r\nsome\r\nwords"; assert_eq!(0, from_byte_idx(text, 0)); assert_eq!(0, from_byte_idx(text, 
4)); assert_eq!(0, from_byte_idx(text, 5)); assert_eq!(1, from_byte_idx(text, 6)); assert_eq!(1, from_byte_idx(text, 9)); assert_eq!(1, from_byte_idx(text, 10)); assert_eq!(2, from_byte_idx(text, 11)); assert_eq!(2, from_byte_idx(text, 15)); assert_eq!(2, from_byte_idx(text, 16)); assert_eq!(3, from_byte_idx(text, 17)); } #[test] fn from_byte_idx_04() { // Line 0 for i in 0..32 { assert_eq!(0, from_byte_idx(TEXT_LINES, i)); } // Line 1 for i in 32..59 { assert_eq!(1, from_byte_idx(TEXT_LINES, i)); } // Line 2 for i in 59..88 { assert_eq!(2, from_byte_idx(TEXT_LINES, i)); } // Line 3 for i in 88..125 { assert_eq!(3, from_byte_idx(TEXT_LINES, i)); } // Past the end for i in 125..130 { assert_eq!(3, from_byte_idx(TEXT_LINES, i)); } } #[test] fn to_byte_idx_01() { let text = "Here\r\nare\r\nsome\r\nwords"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(6, to_byte_idx(text, 1)); assert_eq!(11, to_byte_idx(text, 2)); assert_eq!(17, to_byte_idx(text, 3)); } #[test] fn to_byte_idx_02() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(0, to_byte_idx(text, 0)); assert_eq!(1, to_byte_idx(text, 1)); assert_eq!(6, to_byte_idx(text, 2)); assert_eq!(10, to_byte_idx(text, 3)); assert_eq!(15, to_byte_idx(text, 4)); assert_eq!(21, to_byte_idx(text, 5)); } #[test] fn to_byte_idx_03() { assert_eq!(0, to_byte_idx(TEXT_LINES, 0)); assert_eq!(32, to_byte_idx(TEXT_LINES, 1)); assert_eq!(59, to_byte_idx(TEXT_LINES, 2)); assert_eq!(88, to_byte_idx(TEXT_LINES, 3)); // Past end assert_eq!(124, to_byte_idx(TEXT_LINES, 4)); assert_eq!(124, to_byte_idx(TEXT_LINES, 5)); assert_eq!(124, to_byte_idx(TEXT_LINES, 6)); } #[test] fn line_byte_round_trip() { let text = "\nHere\nare\nsome\nwords\n"; assert_eq!(6, to_byte_idx(text, from_byte_idx(text, 6))); assert_eq!(2, from_byte_idx(text, to_byte_idx(text, 2))); assert_eq!(0, to_byte_idx(text, from_byte_idx(text, 0))); assert_eq!(0, from_byte_idx(text, to_byte_idx(text, 0))); assert_eq!(21, to_byte_idx(text, from_byte_idx(text, 21))); 
assert_eq!(5, from_byte_idx(text, to_byte_idx(text, 5))); } } str_indices-0.4.3/src/utf16.rs000064400000000000000000000172571046102023000142540ustar 00000000000000//! Index by utf16 code units. use crate::byte_chunk::{ByteChunk, Chunk}; /// Counts the utf16 code units that would be in a string slice if it /// were encoded as utf16. /// /// Runs in O(N) time. #[inline] pub fn count(text: &str) -> usize { crate::chars::count_impl::(text.as_bytes()) + count_surrogates_impl::(text.as_bytes()) } /// Counts the utf16 surrogate pairs that would be in a string slice if /// it were encoded as utf16. /// /// Runs in O(N) time. #[inline] pub fn count_surrogates(text: &str) -> usize { count_surrogates_impl::(text.as_bytes()) } /// Converts from byte-index to utf16-code-unit-index in a string slice. /// /// If the byte is in the middle of a multi-byte char, returns the utf16 /// index of the char that the byte belongs to. /// /// Any past-the-end index will return the one-past-the-end utf16 index. /// /// Runs in O(N) time. #[inline] pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize { let mut i = byte_idx.min(text.len()); while !text.is_char_boundary(i) { i -= 1; } let slice = &text.as_bytes()[..i]; crate::chars::count_impl::(slice) + count_surrogates_impl::(slice) } /// Converts from utf16-code-unit-index to byte-index in a string slice. /// /// If the utf16 index is in the middle of a char, returns the bytes /// index of the char that utf16 code unit belongs to. /// /// Any past-the-end index will return the one-past-the-end byte index. /// /// Runs in O(N) time. #[inline] pub fn to_byte_idx(text: &str, utf16_idx: usize) -> usize { to_byte_idx_impl::(text, utf16_idx) } //------------------------------------------------------------- #[inline(always)] fn to_byte_idx_impl(text: &str, utf16_idx: usize) -> usize { // Get `middle` so we can do more efficient chunk-based counting. 
// We can't use this to get `end`, however, because the start index of // `end` actually depends on the accumulating char counts during the // counting process. let (start, middle, _) = unsafe { text.as_bytes().align_to::() }; let mut byte_count = 0; let mut utf16_count = 0; // Take care of any unaligned bytes at the beginning. for byte in start.iter() { utf16_count += ((*byte & 0xC0) != 0x80) as usize + ((byte & 0xf0) == 0xf0) as usize; if utf16_count > utf16_idx { break; } byte_count += 1; } // Process chunks in the fast path. let mut chunks = middle; let mut max_round_len = utf16_idx.saturating_sub(utf16_count) / T::MAX_ACC; while max_round_len > 0 && !chunks.is_empty() { // Choose the largest number of chunks we can do this round // that will neither overflow `max_acc` nor blast past the // utf16 code unit we're looking for. let round_len = T::MAX_ACC.min(max_round_len).min(chunks.len()); max_round_len -= round_len; let round = &chunks[..round_len]; chunks = &chunks[round_len..]; // Process the chunks in this round. let mut acc_inv_chars = T::zero(); let mut acc_surrogates = T::zero(); for chunk in round.iter() { acc_inv_chars = acc_inv_chars.add(chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80)); acc_surrogates = acc_surrogates.add(chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0)); } utf16_count += ((T::SIZE * round_len) - acc_inv_chars.sum_bytes()) + acc_surrogates.sum_bytes(); byte_count += T::SIZE * round_len; } // Process chunks in the slow path. for chunk in chunks.iter() { let inv_chars = chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80).sum_bytes(); let surrogates = chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0).sum_bytes(); let new_utf16_count = utf16_count + (T::SIZE - inv_chars) + surrogates; if new_utf16_count >= utf16_idx { break; } utf16_count = new_utf16_count; byte_count += T::SIZE; } // Take care of any unaligned bytes at the end. 
let end = &text.as_bytes()[byte_count..]; for byte in end.iter() { utf16_count += ((*byte & 0xC0) != 0x80) as usize + ((byte & 0xf0) == 0xf0) as usize; if utf16_count > utf16_idx { break; } byte_count += 1; } byte_count } #[inline(always)] fn count_surrogates_impl(text: &[u8]) -> usize { // We chop off the last three bytes, because all surrogate pairs are // four bytes in utf8, and so it prevents counting partial // characters. if text.len() <= 3 { return 0; } let text = &text[..(text.len() - 3)]; // Get `middle` for more efficient chunk-based counting. let (start, middle, end) = unsafe { text.align_to::() }; let mut utf16_surrogate_count = 0; // Take care of unaligned bytes at the beginning. for byte in start.iter() { utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize; } // Take care of the middle bytes in big chunks. for chunks in middle.chunks(T::MAX_ACC) { let mut acc = T::zero(); for chunk in chunks.iter() { acc = acc.add(chunk.bitand(T::splat(0xf0)).cmp_eq_byte(0xf0)); } utf16_surrogate_count += acc.sum_bytes(); } // Take care of unaligned bytes at the end. for byte in end.iter() { utf16_surrogate_count += ((byte & 0xf0) == 0xf0) as usize; } utf16_surrogate_count } //============================================================= #[cfg(test)] mod tests { use super::*; // 45 bytes, 27 utf16 code units. const TEXT: &str = "Hel🐸lo world! 
こん🐸にち🐸🐸は!"; #[test] fn count_01() { assert_eq!(27, count(TEXT)); } #[test] fn count_surrogates_01() { assert_eq!(4, count_surrogates(TEXT)); } #[test] fn from_byte_idx_01() { assert_eq!(0, from_byte_idx(TEXT, 0)); assert_eq!(3, from_byte_idx(TEXT, 3)); assert_eq!(3, from_byte_idx(TEXT, 4)); assert_eq!(3, from_byte_idx(TEXT, 5)); assert_eq!(3, from_byte_idx(TEXT, 6)); assert_eq!(5, from_byte_idx(TEXT, 7)); assert_eq!(7, from_byte_idx(TEXT, 9)); assert_eq!(17, from_byte_idx(TEXT, 23)); assert_eq!(17, from_byte_idx(TEXT, 24)); assert_eq!(17, from_byte_idx(TEXT, 25)); assert_eq!(17, from_byte_idx(TEXT, 26)); assert_eq!(19, from_byte_idx(TEXT, 27)); assert_eq!(21, from_byte_idx(TEXT, 33)); assert_eq!(21, from_byte_idx(TEXT, 34)); assert_eq!(21, from_byte_idx(TEXT, 35)); assert_eq!(21, from_byte_idx(TEXT, 36)); assert_eq!(23, from_byte_idx(TEXT, 37)); assert_eq!(23, from_byte_idx(TEXT, 38)); assert_eq!(23, from_byte_idx(TEXT, 39)); assert_eq!(23, from_byte_idx(TEXT, 40)); assert_eq!(25, from_byte_idx(TEXT, 41)); assert_eq!(27, from_byte_idx(TEXT, 45)); assert_eq!(27, from_byte_idx(TEXT, 46)); // Index 1 past the end. } #[test] fn to_byte_idx_01() { assert_eq!(to_byte_idx(TEXT, 0), 0); assert_eq!(3, to_byte_idx(TEXT, 3)); assert_eq!(3, to_byte_idx(TEXT, 4)); assert_eq!(7, to_byte_idx(TEXT, 5)); assert_eq!(9, to_byte_idx(TEXT, 7)); assert_eq!(23, to_byte_idx(TEXT, 17)); assert_eq!(23, to_byte_idx(TEXT, 18)); assert_eq!(27, to_byte_idx(TEXT, 19)); assert_eq!(33, to_byte_idx(TEXT, 21)); assert_eq!(33, to_byte_idx(TEXT, 22)); assert_eq!(37, to_byte_idx(TEXT, 23)); assert_eq!(37, to_byte_idx(TEXT, 24)); assert_eq!(41, to_byte_idx(TEXT, 25)); assert_eq!(45, to_byte_idx(TEXT, 27)); assert_eq!(45, to_byte_idx(TEXT, 27)); // Index 1 past the end. } }