unicode-bidi-0.3.17/.appveyor.yml000064400000000000000000000007261046102023000147220ustar 00000000000000install: - appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe - rustup-init -yv --default-toolchain nightly - set PATH=%PATH%;%USERPROFILE%\.cargo\bin - rustc -V - cargo -V - git submodule update --init --recursive build: false environment: RUST_BACKTRACE: full test_script: - cargo build --verbose --all - cargo doc --verbose --all --no-deps - cargo test --verbose --all - cargo test --verbose --all --features serde unicode-bidi-0.3.17/.cargo_vcs_info.json0000644000000001360000000000100134570ustar { "git": { "sha1": "dd3a738d7ab34d4fbc05908beacf8e348491b399" }, "path_in_vcs": "" }unicode-bidi-0.3.17/.github/workflows/main.yml000064400000000000000000000025601046102023000173160ustar 00000000000000name: CI on: push: branches: ['main', 'auto'] pull_request: jobs: Test: strategy: matrix: os: [ubuntu-latest] rust: [1.47.0, stable, beta, nightly] runs-on: ${{ matrix.os }} steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@master with: toolchain: ${{ matrix.rust }} - name: Unpin dependencies except on MSRV if: matrix.rust != '1.47.0' run: cargo update - run: cargo build --all-targets - run: cargo test - run: cargo test --features "serde" - run: cargo test --no-default-features - run: cargo test --no-default-features --features=hardcoded-data Fmt: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - uses: dtolnay/rust-toolchain@stable with: components: rustfmt - run: cargo fmt --check Verify: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Verify regenerated files run: ./tools/generate.py && git diff --exit-code -- src/char_data/tables.rs build_result: name: homu build finished runs-on: ubuntu-latest needs: - "Test" - "Fmt" steps: - name: Mark the job as successful run: exit 0 if: success() - name: Mark the job as unsuccessful run: exit 1 if: "!success()" 
unicode-bidi-0.3.17/.gitignore000064400000000000000000000000301046102023000142300ustar 00000000000000/data/ /target/ /*.html unicode-bidi-0.3.17/.rustfmt.toml000064400000000000000000000000211046102023000147170ustar 00000000000000array_width = 80 unicode-bidi-0.3.17/AUTHORS000064400000000000000000000001711046102023000133160ustar 00000000000000This software was written by the following people: Matt Brubeck Behnam Esfahbod unicode-bidi-0.3.17/COPYRIGHT000064400000000000000000000006341046102023000135450ustar 00000000000000This project is copyright 2015, The Servo Project Developers (given in the file AUTHORS). Licensed under the Apache License, Version 2.0 or the MIT license , at your option. All files in the project carrying such notice may not be copied, modified, or distributed except according to those terms. unicode-bidi-0.3.17/Cargo.lock0000644000000111360000000000100114340ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. version = 3 [[package]] name = "flame" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1fc2706461e1ee94f55cab2ed2e3d34ae9536cfa830358ef80acff1a3dacab30" dependencies = [ "lazy_static", "serde", "serde_derive", "serde_json", "thread-id", ] [[package]] name = "flamer" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "36b732da54fd4ea34452f2431cf464ac7be94ca4b339c9cd3d3d12eb06fe7aab" dependencies = [ "flame", "quote", "syn", ] [[package]] name = "itoa" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "af150ab688ff2122fcef229be89cb50dd66af9e01a4ff320cc137eecc9bacc38" [[package]] name = "lazy_static" version = "0.2.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "76f033c7ad61445c5b347c7382dd1237847eb1bce590fe50365dcb33d546be73" [[package]] name = "libc" version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" 
checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "proc-macro2" version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "92de25114670a878b1261c79c9f8f729fb97e95bac93f6312f583c60dd6a1dfe" dependencies = [ "unicode-ident", ] [[package]] name = "quote" version = "1.0.30" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5907a1b7c277254a8b15170f6e7c97cfa60ee7872a3217663bb81151e48184bb" dependencies = [ "proc-macro2", ] [[package]] name = "redox_syscall" version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" [[package]] name = "ryu" version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ad4cc8da4ef723ed60bced201181d83791ad433213d8c24efffda1eec85d741" [[package]] name = "serde" version = "1.0.156" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "314b5b092c0ade17c00142951e50ced110ec27cea304b1037c6969246c2469a4" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" version = "1.0.156" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7e29c4601e36bcec74a223228dce795f4cd3616341a4af93520ca1a837c087d" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.99" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "46266871c240a00b8f503b877622fe33430b3c7d963bdc0f2adc511e54a1eae3" dependencies = [ "itoa", "ryu", "serde", ] [[package]] name = "serde_test" version = "1.0.175" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "29baf0f77ca9ad9c6ed46e1b408b5e0f30b5184bcd66884e7f6d36bd7a65a8a4" dependencies = [ "serde", ] [[package]] name = "smallvec" version = "1.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"e6ecd384b10a64542d77071bd64bd7b231f4ed5940fba55e98c3de13824cf3d7" [[package]] name = "syn" version = "1.0.109" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" dependencies = [ "proc-macro2", "quote", "unicode-ident", ] [[package]] name = "thread-id" version = "3.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c7fbf4c9d56b320106cd64fd024dadfa0be7cb4706725fc44a7d7ce952d820c1" dependencies = [ "libc", "redox_syscall", "winapi", ] [[package]] name = "unicode-bidi" version = "0.3.17" dependencies = [ "flame", "flamer", "serde", "serde_test", "smallvec", ] [[package]] name = "unicode-ident" version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" unicode-bidi-0.3.17/Cargo.toml0000644000000034060000000000100114600ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. 
# # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" rust-version = "1.47.0" name = "unicode-bidi" version = "0.3.17" authors = ["The Servo Project Developers"] build = false exclude = [ "benches/**", "data/**", "examples/**", "tests/**", "tools/**", ] autobins = false autoexamples = false autotests = false autobenches = false description = "Implementation of the Unicode Bidirectional Algorithm" documentation = "https://docs.rs/unicode-bidi/" readme = "README.md" keywords = [ "rtl", "unicode", "text", "layout", "bidi", ] categories = [ "no-std", "encoding", "text-processing", ] license = "MIT OR Apache-2.0" repository = "https://github.com/servo/unicode-bidi" [lib] name = "unicode_bidi" path = "src/lib.rs" [dependencies.flame] version = "0.2" optional = true [dependencies.flamer] version = "0.4" optional = true [dependencies.serde] version = ">=0.8, <2.0" features = ["derive"] optional = true default-features = false [dependencies.smallvec] version = ">=1.13" features = ["union"] optional = true [dev-dependencies.serde_test] version = ">=0.8, <2.0" [features] bench_it = [] default = [ "std", "hardcoded-data", ] flame_it = [ "flame", "flamer", ] hardcoded-data = [] std = [] unstable = [] with_serde = ["serde"] [badges.appveyor] repository = "servo/unicode-bidi" unicode-bidi-0.3.17/Cargo.toml.orig000064400000000000000000000030561046102023000151420ustar 00000000000000[package] name = "unicode-bidi" version = "0.3.17" authors = ["The Servo Project Developers"] license = "MIT OR Apache-2.0" description = "Implementation of the Unicode Bidirectional Algorithm" repository = "https://github.com/servo/unicode-bidi" documentation = "https://docs.rs/unicode-bidi/" keywords = ["rtl", "unicode", "text", "layout", "bidi"] readme="README.md" edition = "2018" rust-version = "1.47.0" categories = [ "no-std", "encoding", 
"text-processing", ] # No data is shipped; benches, examples and tests also depend on data. exclude = [ "benches/**", "data/**", "examples/**", "tests/**", "tools/**", ] [badges] appveyor = { repository = "servo/unicode-bidi" } [lib] name = "unicode_bidi" [dependencies] flame = { version = "0.2", optional = true } flamer = { version = "0.4", optional = true } serde = { version = ">=0.8, <2.0", default-features = false, optional = true, features = ["derive"] } smallvec = { version = ">=1.13", optional = true, features = ["union"] } [dev-dependencies] serde_test = ">=0.8, <2.0" [features] # Note: We don't actually use the `std` feature for anything other than making # doctests work. But it may come in handy in the future. default = ["std", "hardcoded-data"] hardcoded-data = [] # Include hardcoded Bidi data std = [] unstable = [] # travis-cargo needs it bench_it = [] flame_it = ["flame", "flamer"] with_serde = ["serde"] # DEPRECATED, please use `serde` feature, instead. [[test]] name = "conformance_tests" required-features = ["hardcoded-data"] path = "tests/conformance_tests.rs" unicode-bidi-0.3.17/LICENSE-APACHE000064400000000000000000000251371046102023000142030ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. unicode-bidi-0.3.17/LICENSE-MIT000064400000000000000000000020571046102023000137070ustar 00000000000000Copyright (c) 2015 The Rust Project Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. unicode-bidi-0.3.17/README.md000064400000000000000000000010721046102023000135260ustar 00000000000000# unicode-bidi This crate implements the [Unicode Bidirectional Algorithm][tr9] for display of mixed right-to-left and left-to-right text. 
It is written in safe Rust, compatible with the current stable release. [Documentation](https://docs.rs/unicode-bidi/) [![CI](https://github.com/servo/unicode-bidi/actions/workflows/main.yml/badge.svg)](https://github.com/servo/unicode-bidi/actions) [![AppVeyor](https://img.shields.io/appveyor/ci/servo/unicode-bidi/master.svg)](https://ci.appveyor.com/project/servo/unicode-bidi) [tr9]: https://www.unicode.org/reports/tr9/ unicode-bidi-0.3.17/src/char_data/mod.rs000064400000000000000000000131451046102023000160750ustar 00000000000000// Copyright 2015 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Accessor for `Bidi_Class` property from Unicode Character Database (UCD) mod tables; pub use self::tables::{BidiClass, UNICODE_VERSION}; #[cfg(feature = "hardcoded-data")] use core::char; #[cfg(feature = "hardcoded-data")] use core::cmp::Ordering::{Equal, Greater, Less}; #[cfg(feature = "hardcoded-data")] use self::tables::bidi_class_table; use crate::data_source::BidiMatchedOpeningBracket; use crate::BidiClass::*; #[cfg(feature = "hardcoded-data")] use crate::BidiDataSource; /// Hardcoded Bidi data that ships with the unicode-bidi crate. /// /// This can be enabled with the default `hardcoded-data` Cargo feature. #[cfg(feature = "hardcoded-data")] pub struct HardcodedBidiData; #[cfg(feature = "hardcoded-data")] impl BidiDataSource for HardcodedBidiData { fn bidi_class(&self, c: char) -> BidiClass { bsearch_range_value_table(c, bidi_class_table) } } /// Find the `BidiClass` of a single char. 
#[cfg(feature = "hardcoded-data")] pub fn bidi_class(c: char) -> BidiClass { bsearch_range_value_table(c, bidi_class_table) } /// If this character is a bracket according to BidiBrackets.txt, /// return the corresponding *normalized* *opening bracket* of the pair, /// and whether or not it itself is an opening bracket. pub(crate) fn bidi_matched_opening_bracket(c: char) -> Option { for pair in self::tables::bidi_pairs_table { if pair.0 == c || pair.1 == c { let skeleton = pair.2.unwrap_or(pair.0); return Some(BidiMatchedOpeningBracket { opening: skeleton, is_open: pair.0 == c, }); } } None } pub fn is_rtl(bidi_class: BidiClass) -> bool { matches!(bidi_class, RLE | RLO | RLI) } #[cfg(feature = "hardcoded-data")] fn bsearch_range_value_table(c: char, r: &'static [(char, char, BidiClass)]) -> BidiClass { match r.binary_search_by(|&(lo, hi, _)| { if lo <= c && c <= hi { Equal } else if hi < c { Less } else { Greater } }) { Ok(idx) => { let (_, _, cat) = r[idx]; cat } // UCD/extracted/DerivedBidiClass.txt: "All code points not explicitly listed // for Bidi_Class have the value Left_To_Right (L)." 
Err(_) => L, } } #[cfg(all(test, feature = "hardcoded-data"))] mod tests { use super::*; #[test] fn test_ascii() { assert_eq!(bidi_class('\u{0000}'), BN); assert_eq!(bidi_class('\u{0040}'), ON); assert_eq!(bidi_class('\u{0041}'), L); assert_eq!(bidi_class('\u{0062}'), L); assert_eq!(bidi_class('\u{007F}'), BN); } #[test] fn test_bmp() { // Hebrew assert_eq!(bidi_class('\u{0590}'), R); assert_eq!(bidi_class('\u{05D0}'), R); assert_eq!(bidi_class('\u{05D1}'), R); assert_eq!(bidi_class('\u{05FF}'), R); // Arabic assert_eq!(bidi_class('\u{0600}'), AN); assert_eq!(bidi_class('\u{0627}'), AL); assert_eq!(bidi_class('\u{07BF}'), AL); // Default R + Arabic Extras assert_eq!(bidi_class('\u{07C0}'), R); assert_eq!(bidi_class('\u{085F}'), R); assert_eq!(bidi_class('\u{0860}'), AL); assert_eq!(bidi_class('\u{0870}'), AL); assert_eq!(bidi_class('\u{089F}'), NSM); assert_eq!(bidi_class('\u{08A0}'), AL); assert_eq!(bidi_class('\u{089F}'), NSM); assert_eq!(bidi_class('\u{08FF}'), NSM); // Default ET assert_eq!(bidi_class('\u{20A0}'), ET); assert_eq!(bidi_class('\u{20CF}'), ET); // Arabic Presentation Forms assert_eq!(bidi_class('\u{FB1D}'), R); assert_eq!(bidi_class('\u{FB4F}'), R); assert_eq!(bidi_class('\u{FB50}'), AL); assert_eq!(bidi_class('\u{FDCF}'), ON); assert_eq!(bidi_class('\u{FDF0}'), AL); assert_eq!(bidi_class('\u{FDFF}'), ON); assert_eq!(bidi_class('\u{FE70}'), AL); assert_eq!(bidi_class('\u{FEFE}'), AL); assert_eq!(bidi_class('\u{FEFF}'), BN); // noncharacters assert_eq!(bidi_class('\u{FDD0}'), L); assert_eq!(bidi_class('\u{FDD1}'), L); assert_eq!(bidi_class('\u{FDEE}'), L); assert_eq!(bidi_class('\u{FDEF}'), L); assert_eq!(bidi_class('\u{FFFE}'), L); assert_eq!(bidi_class('\u{FFFF}'), L); } #[test] fn test_smp() { // Default AL + R assert_eq!(bidi_class('\u{10800}'), R); assert_eq!(bidi_class('\u{10FFF}'), R); assert_eq!(bidi_class('\u{1E800}'), R); assert_eq!(bidi_class('\u{1EDFF}'), R); assert_eq!(bidi_class('\u{1EE00}'), AL); assert_eq!(bidi_class('\u{1EEFF}'), 
AL); assert_eq!(bidi_class('\u{1EF00}'), R); assert_eq!(bidi_class('\u{1EFFF}'), R); } #[test] fn test_unassigned_planes() { assert_eq!(bidi_class('\u{30000}'), L); assert_eq!(bidi_class('\u{40000}'), L); assert_eq!(bidi_class('\u{50000}'), L); assert_eq!(bidi_class('\u{60000}'), L); assert_eq!(bidi_class('\u{70000}'), L); assert_eq!(bidi_class('\u{80000}'), L); assert_eq!(bidi_class('\u{90000}'), L); assert_eq!(bidi_class('\u{a0000}'), L); } } unicode-bidi-0.3.17/src/char_data/tables.rs000064400000000000000000001420561046102023000165740ustar 00000000000000// NOTE: // The following code was generated by "tools/generate.py". do not edit directly #![allow(missing_docs, non_upper_case_globals, non_snake_case)] #![cfg_attr(rustfmt, rustfmt_skip)] /// The [Unicode version](http://www.unicode.org/versions/) of data pub const UNICODE_VERSION: (u64, u64, u64) = (16, 0, 0); #[allow(non_camel_case_types)] #[derive(Clone, Copy, Debug, PartialEq, Eq)] /// Represents values of the Unicode character property /// [`Bidi_Class`](http://www.unicode.org/reports/tr44/#Bidi_Class), also /// known as the *bidirectional character type*. 
/// /// * /// * pub enum BidiClass { AL, AN, B, BN, CS, EN, ES, ET, FSI, L, LRE, LRI, LRO, NSM, ON, PDF, PDI, R, RLE, RLI, RLO, S, WS, } #[cfg(feature = "hardcoded-data")] use self::BidiClass::*; #[cfg(feature = "hardcoded-data")] pub const bidi_class_table: &'static [(char, char, BidiClass)] = &[ ('\u{0}', '\u{8}', BN), ('\u{9}', '\u{9}', S), ('\u{a}', '\u{a}', B), ('\u{b}', '\u{b}', S), ('\u{c}', '\u{c}', WS), ('\u{d}', '\u{d}', B), ('\u{e}', '\u{1b}', BN), ('\u{1c}', '\u{1e}', B), ('\u{1f}', '\u{1f}', S), ('\u{20}', '\u{20}', WS), ('\u{21}', '\u{22}', ON), ('\u{23}', '\u{25}', ET), ('\u{26}', '\u{2a}', ON), ('\u{2b}', '\u{2b}', ES), ('\u{2c}', '\u{2c}', CS), ('\u{2d}', '\u{2d}', ES), ('\u{2e}', '\u{2f}', CS), ('\u{30}', '\u{39}', EN), ('\u{3a}', '\u{3a}', CS), ('\u{3b}', '\u{40}', ON), ('\u{41}', '\u{5a}', L), ('\u{5b}', '\u{60}', ON), ('\u{61}', '\u{7a}', L), ('\u{7b}', '\u{7e}', ON), ('\u{7f}', '\u{84}', BN), ('\u{85}', '\u{85}', B), ('\u{86}', '\u{9f}', BN), ('\u{a0}', '\u{a0}', CS), ('\u{a1}', '\u{a1}', ON), ('\u{a2}', '\u{a5}', ET), ('\u{a6}', '\u{a9}', ON), ('\u{aa}', '\u{aa}', L), ('\u{ab}', '\u{ac}', ON), ('\u{ad}', '\u{ad}', BN), ('\u{ae}', '\u{af}', ON), ('\u{b0}', '\u{b1}', ET), ('\u{b2}', '\u{b3}', EN), ('\u{b4}', '\u{b4}', ON), ('\u{b5}', '\u{b5}', L), ('\u{b6}', '\u{b8}', ON), ('\u{b9}', '\u{b9}', EN), ('\u{ba}', '\u{ba}', L), ('\u{bb}', '\u{bf}', ON), ('\u{c0}', '\u{d6}', L), ('\u{d7}', '\u{d7}', ON), ('\u{d8}', '\u{f6}', L), ('\u{f7}', '\u{f7}', ON), ('\u{f8}', '\u{2b8}', L), ('\u{2b9}', '\u{2ba}', ON), ('\u{2bb}', '\u{2c1}', L), ('\u{2c2}', '\u{2cf}', ON), ('\u{2d0}', '\u{2d1}', L), ('\u{2d2}', '\u{2df}', ON), ('\u{2e0}', '\u{2e4}', L), ('\u{2e5}', '\u{2ed}', ON), ('\u{2ee}', '\u{2ee}', L), ('\u{2ef}', '\u{2ff}', ON), ('\u{300}', '\u{36f}', NSM), ('\u{370}', '\u{373}', L), ('\u{374}', '\u{375}', ON), ('\u{376}', '\u{377}', L), ('\u{37a}', '\u{37d}', L), ('\u{37e}', '\u{37e}', ON), ('\u{37f}', '\u{37f}', L), ('\u{384}', '\u{385}', ON), 
('\u{386}', '\u{386}', L), ('\u{387}', '\u{387}', ON), ('\u{388}', '\u{38a}', L), ('\u{38c}', '\u{38c}', L), ('\u{38e}', '\u{3a1}', L), ('\u{3a3}', '\u{3f5}', L), ('\u{3f6}', '\u{3f6}', ON), ('\u{3f7}', '\u{482}', L), ('\u{483}', '\u{489}', NSM), ('\u{48a}', '\u{52f}', L), ('\u{531}', '\u{556}', L), ('\u{559}', '\u{589}', L), ('\u{58a}', '\u{58a}', ON), ('\u{58d}', '\u{58e}', ON), ('\u{58f}', '\u{58f}', ET), ('\u{590}', '\u{590}', R), ('\u{591}', '\u{5bd}', NSM), ('\u{5be}', '\u{5be}', R), ('\u{5bf}', '\u{5bf}', NSM), ('\u{5c0}', '\u{5c0}', R), ('\u{5c1}', '\u{5c2}', NSM), ('\u{5c3}', '\u{5c3}', R), ('\u{5c4}', '\u{5c5}', NSM), ('\u{5c6}', '\u{5c6}', R), ('\u{5c7}', '\u{5c7}', NSM), ('\u{5c8}', '\u{5ff}', R), ('\u{600}', '\u{605}', AN), ('\u{606}', '\u{607}', ON), ('\u{608}', '\u{608}', AL), ('\u{609}', '\u{60a}', ET), ('\u{60b}', '\u{60b}', AL), ('\u{60c}', '\u{60c}', CS), ('\u{60d}', '\u{60d}', AL), ('\u{60e}', '\u{60f}', ON), ('\u{610}', '\u{61a}', NSM), ('\u{61b}', '\u{64a}', AL), ('\u{64b}', '\u{65f}', NSM), ('\u{660}', '\u{669}', AN), ('\u{66a}', '\u{66a}', ET), ('\u{66b}', '\u{66c}', AN), ('\u{66d}', '\u{66f}', AL), ('\u{670}', '\u{670}', NSM), ('\u{671}', '\u{6d5}', AL), ('\u{6d6}', '\u{6dc}', NSM), ('\u{6dd}', '\u{6dd}', AN), ('\u{6de}', '\u{6de}', ON), ('\u{6df}', '\u{6e4}', NSM), ('\u{6e5}', '\u{6e6}', AL), ('\u{6e7}', '\u{6e8}', NSM), ('\u{6e9}', '\u{6e9}', ON), ('\u{6ea}', '\u{6ed}', NSM), ('\u{6ee}', '\u{6ef}', AL), ('\u{6f0}', '\u{6f9}', EN), ('\u{6fa}', '\u{710}', AL), ('\u{711}', '\u{711}', NSM), ('\u{712}', '\u{72f}', AL), ('\u{730}', '\u{74a}', NSM), ('\u{74b}', '\u{7a5}', AL), ('\u{7a6}', '\u{7b0}', NSM), ('\u{7b1}', '\u{7bf}', AL), ('\u{7c0}', '\u{7ea}', R), ('\u{7eb}', '\u{7f3}', NSM), ('\u{7f4}', '\u{7f5}', R), ('\u{7f6}', '\u{7f9}', ON), ('\u{7fa}', '\u{7fc}', R), ('\u{7fd}', '\u{7fd}', NSM), ('\u{7fe}', '\u{815}', R), ('\u{816}', '\u{819}', NSM), ('\u{81a}', '\u{81a}', R), ('\u{81b}', '\u{823}', NSM), ('\u{824}', '\u{824}', R), ('\u{825}', 
'\u{827}', NSM), ('\u{828}', '\u{828}', R), ('\u{829}', '\u{82d}', NSM), ('\u{82e}', '\u{858}', R), ('\u{859}', '\u{85b}', NSM), ('\u{85c}', '\u{85f}', R), ('\u{860}', '\u{86a}', AL), ('\u{86b}', '\u{86f}', R), ('\u{870}', '\u{88e}', AL), ('\u{88f}', '\u{88f}', R), ('\u{890}', '\u{891}', AN), ('\u{892}', '\u{896}', R), ('\u{897}', '\u{89f}', NSM), ('\u{8a0}', '\u{8c9}', AL), ('\u{8ca}', '\u{8e1}', NSM), ('\u{8e2}', '\u{8e2}', AN), ('\u{8e3}', '\u{902}', NSM), ('\u{903}', '\u{939}', L), ('\u{93a}', '\u{93a}', NSM), ('\u{93b}', '\u{93b}', L), ('\u{93c}', '\u{93c}', NSM), ('\u{93d}', '\u{940}', L), ('\u{941}', '\u{948}', NSM), ('\u{949}', '\u{94c}', L), ('\u{94d}', '\u{94d}', NSM), ('\u{94e}', '\u{950}', L), ('\u{951}', '\u{957}', NSM), ('\u{958}', '\u{961}', L), ('\u{962}', '\u{963}', NSM), ('\u{964}', '\u{980}', L), ('\u{981}', '\u{981}', NSM), ('\u{982}', '\u{983}', L), ('\u{985}', '\u{98c}', L), ('\u{98f}', '\u{990}', L), ('\u{993}', '\u{9a8}', L), ('\u{9aa}', '\u{9b0}', L), ('\u{9b2}', '\u{9b2}', L), ('\u{9b6}', '\u{9b9}', L), ('\u{9bc}', '\u{9bc}', NSM), ('\u{9bd}', '\u{9c0}', L), ('\u{9c1}', '\u{9c4}', NSM), ('\u{9c7}', '\u{9c8}', L), ('\u{9cb}', '\u{9cc}', L), ('\u{9cd}', '\u{9cd}', NSM), ('\u{9ce}', '\u{9ce}', L), ('\u{9d7}', '\u{9d7}', L), ('\u{9dc}', '\u{9dd}', L), ('\u{9df}', '\u{9e1}', L), ('\u{9e2}', '\u{9e3}', NSM), ('\u{9e6}', '\u{9f1}', L), ('\u{9f2}', '\u{9f3}', ET), ('\u{9f4}', '\u{9fa}', L), ('\u{9fb}', '\u{9fb}', ET), ('\u{9fc}', '\u{9fd}', L), ('\u{9fe}', '\u{9fe}', NSM), ('\u{a01}', '\u{a02}', NSM), ('\u{a03}', '\u{a03}', L), ('\u{a05}', '\u{a0a}', L), ('\u{a0f}', '\u{a10}', L), ('\u{a13}', '\u{a28}', L), ('\u{a2a}', '\u{a30}', L), ('\u{a32}', '\u{a33}', L), ('\u{a35}', '\u{a36}', L), ('\u{a38}', '\u{a39}', L), ('\u{a3c}', '\u{a3c}', NSM), ('\u{a3e}', '\u{a40}', L), ('\u{a41}', '\u{a42}', NSM), ('\u{a47}', '\u{a48}', NSM), ('\u{a4b}', '\u{a4d}', NSM), ('\u{a51}', '\u{a51}', NSM), ('\u{a59}', '\u{a5c}', L), ('\u{a5e}', '\u{a5e}', L), ('\u{a66}', 
'\u{a6f}', L), ('\u{a70}', '\u{a71}', NSM), ('\u{a72}', '\u{a74}', L), ('\u{a75}', '\u{a75}', NSM), ('\u{a76}', '\u{a76}', L), ('\u{a81}', '\u{a82}', NSM), ('\u{a83}', '\u{a83}', L), ('\u{a85}', '\u{a8d}', L), ('\u{a8f}', '\u{a91}', L), ('\u{a93}', '\u{aa8}', L), ('\u{aaa}', '\u{ab0}', L), ('\u{ab2}', '\u{ab3}', L), ('\u{ab5}', '\u{ab9}', L), ('\u{abc}', '\u{abc}', NSM), ('\u{abd}', '\u{ac0}', L), ('\u{ac1}', '\u{ac5}', NSM), ('\u{ac7}', '\u{ac8}', NSM), ('\u{ac9}', '\u{ac9}', L), ('\u{acb}', '\u{acc}', L), ('\u{acd}', '\u{acd}', NSM), ('\u{ad0}', '\u{ad0}', L), ('\u{ae0}', '\u{ae1}', L), ('\u{ae2}', '\u{ae3}', NSM), ('\u{ae6}', '\u{af0}', L), ('\u{af1}', '\u{af1}', ET), ('\u{af9}', '\u{af9}', L), ('\u{afa}', '\u{aff}', NSM), ('\u{b01}', '\u{b01}', NSM), ('\u{b02}', '\u{b03}', L), ('\u{b05}', '\u{b0c}', L), ('\u{b0f}', '\u{b10}', L), ('\u{b13}', '\u{b28}', L), ('\u{b2a}', '\u{b30}', L), ('\u{b32}', '\u{b33}', L), ('\u{b35}', '\u{b39}', L), ('\u{b3c}', '\u{b3c}', NSM), ('\u{b3d}', '\u{b3e}', L), ('\u{b3f}', '\u{b3f}', NSM), ('\u{b40}', '\u{b40}', L), ('\u{b41}', '\u{b44}', NSM), ('\u{b47}', '\u{b48}', L), ('\u{b4b}', '\u{b4c}', L), ('\u{b4d}', '\u{b4d}', NSM), ('\u{b55}', '\u{b56}', NSM), ('\u{b57}', '\u{b57}', L), ('\u{b5c}', '\u{b5d}', L), ('\u{b5f}', '\u{b61}', L), ('\u{b62}', '\u{b63}', NSM), ('\u{b66}', '\u{b77}', L), ('\u{b82}', '\u{b82}', NSM), ('\u{b83}', '\u{b83}', L), ('\u{b85}', '\u{b8a}', L), ('\u{b8e}', '\u{b90}', L), ('\u{b92}', '\u{b95}', L), ('\u{b99}', '\u{b9a}', L), ('\u{b9c}', '\u{b9c}', L), ('\u{b9e}', '\u{b9f}', L), ('\u{ba3}', '\u{ba4}', L), ('\u{ba8}', '\u{baa}', L), ('\u{bae}', '\u{bb9}', L), ('\u{bbe}', '\u{bbf}', L), ('\u{bc0}', '\u{bc0}', NSM), ('\u{bc1}', '\u{bc2}', L), ('\u{bc6}', '\u{bc8}', L), ('\u{bca}', '\u{bcc}', L), ('\u{bcd}', '\u{bcd}', NSM), ('\u{bd0}', '\u{bd0}', L), ('\u{bd7}', '\u{bd7}', L), ('\u{be6}', '\u{bf2}', L), ('\u{bf3}', '\u{bf8}', ON), ('\u{bf9}', '\u{bf9}', ET), ('\u{bfa}', '\u{bfa}', ON), ('\u{c00}', '\u{c00}', 
NSM), ('\u{c01}', '\u{c03}', L), ('\u{c04}', '\u{c04}', NSM), ('\u{c05}', '\u{c0c}', L), ('\u{c0e}', '\u{c10}', L), ('\u{c12}', '\u{c28}', L), ('\u{c2a}', '\u{c39}', L), ('\u{c3c}', '\u{c3c}', NSM), ('\u{c3d}', '\u{c3d}', L), ('\u{c3e}', '\u{c40}', NSM), ('\u{c41}', '\u{c44}', L), ('\u{c46}', '\u{c48}', NSM), ('\u{c4a}', '\u{c4d}', NSM), ('\u{c55}', '\u{c56}', NSM), ('\u{c58}', '\u{c5a}', L), ('\u{c5d}', '\u{c5d}', L), ('\u{c60}', '\u{c61}', L), ('\u{c62}', '\u{c63}', NSM), ('\u{c66}', '\u{c6f}', L), ('\u{c77}', '\u{c77}', L), ('\u{c78}', '\u{c7e}', ON), ('\u{c7f}', '\u{c80}', L), ('\u{c81}', '\u{c81}', NSM), ('\u{c82}', '\u{c8c}', L), ('\u{c8e}', '\u{c90}', L), ('\u{c92}', '\u{ca8}', L), ('\u{caa}', '\u{cb3}', L), ('\u{cb5}', '\u{cb9}', L), ('\u{cbc}', '\u{cbc}', NSM), ('\u{cbd}', '\u{cc4}', L), ('\u{cc6}', '\u{cc8}', L), ('\u{cca}', '\u{ccb}', L), ('\u{ccc}', '\u{ccd}', NSM), ('\u{cd5}', '\u{cd6}', L), ('\u{cdd}', '\u{cde}', L), ('\u{ce0}', '\u{ce1}', L), ('\u{ce2}', '\u{ce3}', NSM), ('\u{ce6}', '\u{cef}', L), ('\u{cf1}', '\u{cf3}', L), ('\u{d00}', '\u{d01}', NSM), ('\u{d02}', '\u{d0c}', L), ('\u{d0e}', '\u{d10}', L), ('\u{d12}', '\u{d3a}', L), ('\u{d3b}', '\u{d3c}', NSM), ('\u{d3d}', '\u{d40}', L), ('\u{d41}', '\u{d44}', NSM), ('\u{d46}', '\u{d48}', L), ('\u{d4a}', '\u{d4c}', L), ('\u{d4d}', '\u{d4d}', NSM), ('\u{d4e}', '\u{d4f}', L), ('\u{d54}', '\u{d61}', L), ('\u{d62}', '\u{d63}', NSM), ('\u{d66}', '\u{d7f}', L), ('\u{d81}', '\u{d81}', NSM), ('\u{d82}', '\u{d83}', L), ('\u{d85}', '\u{d96}', L), ('\u{d9a}', '\u{db1}', L), ('\u{db3}', '\u{dbb}', L), ('\u{dbd}', '\u{dbd}', L), ('\u{dc0}', '\u{dc6}', L), ('\u{dca}', '\u{dca}', NSM), ('\u{dcf}', '\u{dd1}', L), ('\u{dd2}', '\u{dd4}', NSM), ('\u{dd6}', '\u{dd6}', NSM), ('\u{dd8}', '\u{ddf}', L), ('\u{de6}', '\u{def}', L), ('\u{df2}', '\u{df4}', L), ('\u{e01}', '\u{e30}', L), ('\u{e31}', '\u{e31}', NSM), ('\u{e32}', '\u{e33}', L), ('\u{e34}', '\u{e3a}', NSM), ('\u{e3f}', '\u{e3f}', ET), ('\u{e40}', '\u{e46}', L), 
('\u{e47}', '\u{e4e}', NSM), ('\u{e4f}', '\u{e5b}', L), ('\u{e81}', '\u{e82}', L), ('\u{e84}', '\u{e84}', L), ('\u{e86}', '\u{e8a}', L), ('\u{e8c}', '\u{ea3}', L), ('\u{ea5}', '\u{ea5}', L), ('\u{ea7}', '\u{eb0}', L), ('\u{eb1}', '\u{eb1}', NSM), ('\u{eb2}', '\u{eb3}', L), ('\u{eb4}', '\u{ebc}', NSM), ('\u{ebd}', '\u{ebd}', L), ('\u{ec0}', '\u{ec4}', L), ('\u{ec6}', '\u{ec6}', L), ('\u{ec8}', '\u{ece}', NSM), ('\u{ed0}', '\u{ed9}', L), ('\u{edc}', '\u{edf}', L), ('\u{f00}', '\u{f17}', L), ('\u{f18}', '\u{f19}', NSM), ('\u{f1a}', '\u{f34}', L), ('\u{f35}', '\u{f35}', NSM), ('\u{f36}', '\u{f36}', L), ('\u{f37}', '\u{f37}', NSM), ('\u{f38}', '\u{f38}', L), ('\u{f39}', '\u{f39}', NSM), ('\u{f3a}', '\u{f3d}', ON), ('\u{f3e}', '\u{f47}', L), ('\u{f49}', '\u{f6c}', L), ('\u{f71}', '\u{f7e}', NSM), ('\u{f7f}', '\u{f7f}', L), ('\u{f80}', '\u{f84}', NSM), ('\u{f85}', '\u{f85}', L), ('\u{f86}', '\u{f87}', NSM), ('\u{f88}', '\u{f8c}', L), ('\u{f8d}', '\u{f97}', NSM), ('\u{f99}', '\u{fbc}', NSM), ('\u{fbe}', '\u{fc5}', L), ('\u{fc6}', '\u{fc6}', NSM), ('\u{fc7}', '\u{fcc}', L), ('\u{fce}', '\u{fda}', L), ('\u{1000}', '\u{102c}', L), ('\u{102d}', '\u{1030}', NSM), ('\u{1031}', '\u{1031}', L), ('\u{1032}', '\u{1037}', NSM), ('\u{1038}', '\u{1038}', L), ('\u{1039}', '\u{103a}', NSM), ('\u{103b}', '\u{103c}', L), ('\u{103d}', '\u{103e}', NSM), ('\u{103f}', '\u{1057}', L), ('\u{1058}', '\u{1059}', NSM), ('\u{105a}', '\u{105d}', L), ('\u{105e}', '\u{1060}', NSM), ('\u{1061}', '\u{1070}', L), ('\u{1071}', '\u{1074}', NSM), ('\u{1075}', '\u{1081}', L), ('\u{1082}', '\u{1082}', NSM), ('\u{1083}', '\u{1084}', L), ('\u{1085}', '\u{1086}', NSM), ('\u{1087}', '\u{108c}', L), ('\u{108d}', '\u{108d}', NSM), ('\u{108e}', '\u{109c}', L), ('\u{109d}', '\u{109d}', NSM), ('\u{109e}', '\u{10c5}', L), ('\u{10c7}', '\u{10c7}', L), ('\u{10cd}', '\u{10cd}', L), ('\u{10d0}', '\u{1248}', L), ('\u{124a}', '\u{124d}', L), ('\u{1250}', '\u{1256}', L), ('\u{1258}', '\u{1258}', L), ('\u{125a}', '\u{125d}', 
L), ('\u{1260}', '\u{1288}', L), ('\u{128a}', '\u{128d}', L), ('\u{1290}', '\u{12b0}', L), ('\u{12b2}', '\u{12b5}', L), ('\u{12b8}', '\u{12be}', L), ('\u{12c0}', '\u{12c0}', L), ('\u{12c2}', '\u{12c5}', L), ('\u{12c8}', '\u{12d6}', L), ('\u{12d8}', '\u{1310}', L), ('\u{1312}', '\u{1315}', L), ('\u{1318}', '\u{135a}', L), ('\u{135d}', '\u{135f}', NSM), ('\u{1360}', '\u{137c}', L), ('\u{1380}', '\u{138f}', L), ('\u{1390}', '\u{1399}', ON), ('\u{13a0}', '\u{13f5}', L), ('\u{13f8}', '\u{13fd}', L), ('\u{1400}', '\u{1400}', ON), ('\u{1401}', '\u{167f}', L), ('\u{1680}', '\u{1680}', WS), ('\u{1681}', '\u{169a}', L), ('\u{169b}', '\u{169c}', ON), ('\u{16a0}', '\u{16f8}', L), ('\u{1700}', '\u{1711}', L), ('\u{1712}', '\u{1714}', NSM), ('\u{1715}', '\u{1715}', L), ('\u{171f}', '\u{1731}', L), ('\u{1732}', '\u{1733}', NSM), ('\u{1734}', '\u{1736}', L), ('\u{1740}', '\u{1751}', L), ('\u{1752}', '\u{1753}', NSM), ('\u{1760}', '\u{176c}', L), ('\u{176e}', '\u{1770}', L), ('\u{1772}', '\u{1773}', NSM), ('\u{1780}', '\u{17b3}', L), ('\u{17b4}', '\u{17b5}', NSM), ('\u{17b6}', '\u{17b6}', L), ('\u{17b7}', '\u{17bd}', NSM), ('\u{17be}', '\u{17c5}', L), ('\u{17c6}', '\u{17c6}', NSM), ('\u{17c7}', '\u{17c8}', L), ('\u{17c9}', '\u{17d3}', NSM), ('\u{17d4}', '\u{17da}', L), ('\u{17db}', '\u{17db}', ET), ('\u{17dc}', '\u{17dc}', L), ('\u{17dd}', '\u{17dd}', NSM), ('\u{17e0}', '\u{17e9}', L), ('\u{17f0}', '\u{17f9}', ON), ('\u{1800}', '\u{180a}', ON), ('\u{180b}', '\u{180d}', NSM), ('\u{180e}', '\u{180e}', BN), ('\u{180f}', '\u{180f}', NSM), ('\u{1810}', '\u{1819}', L), ('\u{1820}', '\u{1878}', L), ('\u{1880}', '\u{1884}', L), ('\u{1885}', '\u{1886}', NSM), ('\u{1887}', '\u{18a8}', L), ('\u{18a9}', '\u{18a9}', NSM), ('\u{18aa}', '\u{18aa}', L), ('\u{18b0}', '\u{18f5}', L), ('\u{1900}', '\u{191e}', L), ('\u{1920}', '\u{1922}', NSM), ('\u{1923}', '\u{1926}', L), ('\u{1927}', '\u{1928}', NSM), ('\u{1929}', '\u{192b}', L), ('\u{1930}', '\u{1931}', L), ('\u{1932}', '\u{1932}', NSM), 
('\u{1933}', '\u{1938}', L), ('\u{1939}', '\u{193b}', NSM), ('\u{1940}', '\u{1940}', ON), ('\u{1944}', '\u{1945}', ON), ('\u{1946}', '\u{196d}', L), ('\u{1970}', '\u{1974}', L), ('\u{1980}', '\u{19ab}', L), ('\u{19b0}', '\u{19c9}', L), ('\u{19d0}', '\u{19da}', L), ('\u{19de}', '\u{19ff}', ON), ('\u{1a00}', '\u{1a16}', L), ('\u{1a17}', '\u{1a18}', NSM), ('\u{1a19}', '\u{1a1a}', L), ('\u{1a1b}', '\u{1a1b}', NSM), ('\u{1a1e}', '\u{1a55}', L), ('\u{1a56}', '\u{1a56}', NSM), ('\u{1a57}', '\u{1a57}', L), ('\u{1a58}', '\u{1a5e}', NSM), ('\u{1a60}', '\u{1a60}', NSM), ('\u{1a61}', '\u{1a61}', L), ('\u{1a62}', '\u{1a62}', NSM), ('\u{1a63}', '\u{1a64}', L), ('\u{1a65}', '\u{1a6c}', NSM), ('\u{1a6d}', '\u{1a72}', L), ('\u{1a73}', '\u{1a7c}', NSM), ('\u{1a7f}', '\u{1a7f}', NSM), ('\u{1a80}', '\u{1a89}', L), ('\u{1a90}', '\u{1a99}', L), ('\u{1aa0}', '\u{1aad}', L), ('\u{1ab0}', '\u{1ace}', NSM), ('\u{1b00}', '\u{1b03}', NSM), ('\u{1b04}', '\u{1b33}', L), ('\u{1b34}', '\u{1b34}', NSM), ('\u{1b35}', '\u{1b35}', L), ('\u{1b36}', '\u{1b3a}', NSM), ('\u{1b3b}', '\u{1b3b}', L), ('\u{1b3c}', '\u{1b3c}', NSM), ('\u{1b3d}', '\u{1b41}', L), ('\u{1b42}', '\u{1b42}', NSM), ('\u{1b43}', '\u{1b4c}', L), ('\u{1b4e}', '\u{1b6a}', L), ('\u{1b6b}', '\u{1b73}', NSM), ('\u{1b74}', '\u{1b7f}', L), ('\u{1b80}', '\u{1b81}', NSM), ('\u{1b82}', '\u{1ba1}', L), ('\u{1ba2}', '\u{1ba5}', NSM), ('\u{1ba6}', '\u{1ba7}', L), ('\u{1ba8}', '\u{1ba9}', NSM), ('\u{1baa}', '\u{1baa}', L), ('\u{1bab}', '\u{1bad}', NSM), ('\u{1bae}', '\u{1be5}', L), ('\u{1be6}', '\u{1be6}', NSM), ('\u{1be7}', '\u{1be7}', L), ('\u{1be8}', '\u{1be9}', NSM), ('\u{1bea}', '\u{1bec}', L), ('\u{1bed}', '\u{1bed}', NSM), ('\u{1bee}', '\u{1bee}', L), ('\u{1bef}', '\u{1bf1}', NSM), ('\u{1bf2}', '\u{1bf3}', L), ('\u{1bfc}', '\u{1c2b}', L), ('\u{1c2c}', '\u{1c33}', NSM), ('\u{1c34}', '\u{1c35}', L), ('\u{1c36}', '\u{1c37}', NSM), ('\u{1c3b}', '\u{1c49}', L), ('\u{1c4d}', '\u{1c8a}', L), ('\u{1c90}', '\u{1cba}', L), ('\u{1cbd}', '\u{1cc7}', L), 
('\u{1cd0}', '\u{1cd2}', NSM), ('\u{1cd3}', '\u{1cd3}', L), ('\u{1cd4}', '\u{1ce0}', NSM), ('\u{1ce1}', '\u{1ce1}', L), ('\u{1ce2}', '\u{1ce8}', NSM), ('\u{1ce9}', '\u{1cec}', L), ('\u{1ced}', '\u{1ced}', NSM), ('\u{1cee}', '\u{1cf3}', L), ('\u{1cf4}', '\u{1cf4}', NSM), ('\u{1cf5}', '\u{1cf7}', L), ('\u{1cf8}', '\u{1cf9}', NSM), ('\u{1cfa}', '\u{1cfa}', L), ('\u{1d00}', '\u{1dbf}', L), ('\u{1dc0}', '\u{1dff}', NSM), ('\u{1e00}', '\u{1f15}', L), ('\u{1f18}', '\u{1f1d}', L), ('\u{1f20}', '\u{1f45}', L), ('\u{1f48}', '\u{1f4d}', L), ('\u{1f50}', '\u{1f57}', L), ('\u{1f59}', '\u{1f59}', L), ('\u{1f5b}', '\u{1f5b}', L), ('\u{1f5d}', '\u{1f5d}', L), ('\u{1f5f}', '\u{1f7d}', L), ('\u{1f80}', '\u{1fb4}', L), ('\u{1fb6}', '\u{1fbc}', L), ('\u{1fbd}', '\u{1fbd}', ON), ('\u{1fbe}', '\u{1fbe}', L), ('\u{1fbf}', '\u{1fc1}', ON), ('\u{1fc2}', '\u{1fc4}', L), ('\u{1fc6}', '\u{1fcc}', L), ('\u{1fcd}', '\u{1fcf}', ON), ('\u{1fd0}', '\u{1fd3}', L), ('\u{1fd6}', '\u{1fdb}', L), ('\u{1fdd}', '\u{1fdf}', ON), ('\u{1fe0}', '\u{1fec}', L), ('\u{1fed}', '\u{1fef}', ON), ('\u{1ff2}', '\u{1ff4}', L), ('\u{1ff6}', '\u{1ffc}', L), ('\u{1ffd}', '\u{1ffe}', ON), ('\u{2000}', '\u{200a}', WS), ('\u{200b}', '\u{200d}', BN), ('\u{200e}', '\u{200e}', L), ('\u{200f}', '\u{200f}', R), ('\u{2010}', '\u{2027}', ON), ('\u{2028}', '\u{2028}', WS), ('\u{2029}', '\u{2029}', B), ('\u{202a}', '\u{202a}', LRE), ('\u{202b}', '\u{202b}', RLE), ('\u{202c}', '\u{202c}', PDF), ('\u{202d}', '\u{202d}', LRO), ('\u{202e}', '\u{202e}', RLO), ('\u{202f}', '\u{202f}', CS), ('\u{2030}', '\u{2034}', ET), ('\u{2035}', '\u{2043}', ON), ('\u{2044}', '\u{2044}', CS), ('\u{2045}', '\u{205e}', ON), ('\u{205f}', '\u{205f}', WS), ('\u{2060}', '\u{2064}', BN), ('\u{2066}', '\u{2066}', LRI), ('\u{2067}', '\u{2067}', RLI), ('\u{2068}', '\u{2068}', FSI), ('\u{2069}', '\u{2069}', PDI), ('\u{206a}', '\u{206f}', BN), ('\u{2070}', '\u{2070}', EN), ('\u{2071}', '\u{2071}', L), ('\u{2074}', '\u{2079}', EN), ('\u{207a}', '\u{207b}', ES), 
('\u{207c}', '\u{207e}', ON), ('\u{207f}', '\u{207f}', L), ('\u{2080}', '\u{2089}', EN), ('\u{208a}', '\u{208b}', ES), ('\u{208c}', '\u{208e}', ON), ('\u{2090}', '\u{209c}', L), ('\u{20a0}', '\u{20cf}', ET), ('\u{20d0}', '\u{20f0}', NSM), ('\u{2100}', '\u{2101}', ON), ('\u{2102}', '\u{2102}', L), ('\u{2103}', '\u{2106}', ON), ('\u{2107}', '\u{2107}', L), ('\u{2108}', '\u{2109}', ON), ('\u{210a}', '\u{2113}', L), ('\u{2114}', '\u{2114}', ON), ('\u{2115}', '\u{2115}', L), ('\u{2116}', '\u{2118}', ON), ('\u{2119}', '\u{211d}', L), ('\u{211e}', '\u{2123}', ON), ('\u{2124}', '\u{2124}', L), ('\u{2125}', '\u{2125}', ON), ('\u{2126}', '\u{2126}', L), ('\u{2127}', '\u{2127}', ON), ('\u{2128}', '\u{2128}', L), ('\u{2129}', '\u{2129}', ON), ('\u{212a}', '\u{212d}', L), ('\u{212e}', '\u{212e}', ET), ('\u{212f}', '\u{2139}', L), ('\u{213a}', '\u{213b}', ON), ('\u{213c}', '\u{213f}', L), ('\u{2140}', '\u{2144}', ON), ('\u{2145}', '\u{2149}', L), ('\u{214a}', '\u{214d}', ON), ('\u{214e}', '\u{214f}', L), ('\u{2150}', '\u{215f}', ON), ('\u{2160}', '\u{2188}', L), ('\u{2189}', '\u{218b}', ON), ('\u{2190}', '\u{2211}', ON), ('\u{2212}', '\u{2212}', ES), ('\u{2213}', '\u{2213}', ET), ('\u{2214}', '\u{2335}', ON), ('\u{2336}', '\u{237a}', L), ('\u{237b}', '\u{2394}', ON), ('\u{2395}', '\u{2395}', L), ('\u{2396}', '\u{2429}', ON), ('\u{2440}', '\u{244a}', ON), ('\u{2460}', '\u{2487}', ON), ('\u{2488}', '\u{249b}', EN), ('\u{249c}', '\u{24e9}', L), ('\u{24ea}', '\u{26ab}', ON), ('\u{26ac}', '\u{26ac}', L), ('\u{26ad}', '\u{27ff}', ON), ('\u{2800}', '\u{28ff}', L), ('\u{2900}', '\u{2b73}', ON), ('\u{2b76}', '\u{2b95}', ON), ('\u{2b97}', '\u{2bff}', ON), ('\u{2c00}', '\u{2ce4}', L), ('\u{2ce5}', '\u{2cea}', ON), ('\u{2ceb}', '\u{2cee}', L), ('\u{2cef}', '\u{2cf1}', NSM), ('\u{2cf2}', '\u{2cf3}', L), ('\u{2cf9}', '\u{2cff}', ON), ('\u{2d00}', '\u{2d25}', L), ('\u{2d27}', '\u{2d27}', L), ('\u{2d2d}', '\u{2d2d}', L), ('\u{2d30}', '\u{2d67}', L), ('\u{2d6f}', '\u{2d70}', L), ('\u{2d7f}', 
'\u{2d7f}', NSM), ('\u{2d80}', '\u{2d96}', L), ('\u{2da0}', '\u{2da6}', L), ('\u{2da8}', '\u{2dae}', L), ('\u{2db0}', '\u{2db6}', L), ('\u{2db8}', '\u{2dbe}', L), ('\u{2dc0}', '\u{2dc6}', L), ('\u{2dc8}', '\u{2dce}', L), ('\u{2dd0}', '\u{2dd6}', L), ('\u{2dd8}', '\u{2dde}', L), ('\u{2de0}', '\u{2dff}', NSM), ('\u{2e00}', '\u{2e5d}', ON), ('\u{2e80}', '\u{2e99}', ON), ('\u{2e9b}', '\u{2ef3}', ON), ('\u{2f00}', '\u{2fd5}', ON), ('\u{2ff0}', '\u{2fff}', ON), ('\u{3000}', '\u{3000}', WS), ('\u{3001}', '\u{3004}', ON), ('\u{3005}', '\u{3007}', L), ('\u{3008}', '\u{3020}', ON), ('\u{3021}', '\u{3029}', L), ('\u{302a}', '\u{302d}', NSM), ('\u{302e}', '\u{302f}', L), ('\u{3030}', '\u{3030}', ON), ('\u{3031}', '\u{3035}', L), ('\u{3036}', '\u{3037}', ON), ('\u{3038}', '\u{303c}', L), ('\u{303d}', '\u{303f}', ON), ('\u{3041}', '\u{3096}', L), ('\u{3099}', '\u{309a}', NSM), ('\u{309b}', '\u{309c}', ON), ('\u{309d}', '\u{309f}', L), ('\u{30a0}', '\u{30a0}', ON), ('\u{30a1}', '\u{30fa}', L), ('\u{30fb}', '\u{30fb}', ON), ('\u{30fc}', '\u{30ff}', L), ('\u{3105}', '\u{312f}', L), ('\u{3131}', '\u{318e}', L), ('\u{3190}', '\u{31bf}', L), ('\u{31c0}', '\u{31e5}', ON), ('\u{31ef}', '\u{31ef}', ON), ('\u{31f0}', '\u{321c}', L), ('\u{321d}', '\u{321e}', ON), ('\u{3220}', '\u{324f}', L), ('\u{3250}', '\u{325f}', ON), ('\u{3260}', '\u{327b}', L), ('\u{327c}', '\u{327e}', ON), ('\u{327f}', '\u{32b0}', L), ('\u{32b1}', '\u{32bf}', ON), ('\u{32c0}', '\u{32cb}', L), ('\u{32cc}', '\u{32cf}', ON), ('\u{32d0}', '\u{3376}', L), ('\u{3377}', '\u{337a}', ON), ('\u{337b}', '\u{33dd}', L), ('\u{33de}', '\u{33df}', ON), ('\u{33e0}', '\u{33fe}', L), ('\u{33ff}', '\u{33ff}', ON), ('\u{3400}', '\u{4dbf}', L), ('\u{4dc0}', '\u{4dff}', ON), ('\u{4e00}', '\u{a48c}', L), ('\u{a490}', '\u{a4c6}', ON), ('\u{a4d0}', '\u{a60c}', L), ('\u{a60d}', '\u{a60f}', ON), ('\u{a610}', '\u{a62b}', L), ('\u{a640}', '\u{a66e}', L), ('\u{a66f}', '\u{a672}', NSM), ('\u{a673}', '\u{a673}', ON), ('\u{a674}', '\u{a67d}', NSM), 
('\u{a67e}', '\u{a67f}', ON), ('\u{a680}', '\u{a69d}', L), ('\u{a69e}', '\u{a69f}', NSM), ('\u{a6a0}', '\u{a6ef}', L), ('\u{a6f0}', '\u{a6f1}', NSM), ('\u{a6f2}', '\u{a6f7}', L), ('\u{a700}', '\u{a721}', ON), ('\u{a722}', '\u{a787}', L), ('\u{a788}', '\u{a788}', ON), ('\u{a789}', '\u{a7cd}', L), ('\u{a7d0}', '\u{a7d1}', L), ('\u{a7d3}', '\u{a7d3}', L), ('\u{a7d5}', '\u{a7dc}', L), ('\u{a7f2}', '\u{a801}', L), ('\u{a802}', '\u{a802}', NSM), ('\u{a803}', '\u{a805}', L), ('\u{a806}', '\u{a806}', NSM), ('\u{a807}', '\u{a80a}', L), ('\u{a80b}', '\u{a80b}', NSM), ('\u{a80c}', '\u{a824}', L), ('\u{a825}', '\u{a826}', NSM), ('\u{a827}', '\u{a827}', L), ('\u{a828}', '\u{a82b}', ON), ('\u{a82c}', '\u{a82c}', NSM), ('\u{a830}', '\u{a837}', L), ('\u{a838}', '\u{a839}', ET), ('\u{a840}', '\u{a873}', L), ('\u{a874}', '\u{a877}', ON), ('\u{a880}', '\u{a8c3}', L), ('\u{a8c4}', '\u{a8c5}', NSM), ('\u{a8ce}', '\u{a8d9}', L), ('\u{a8e0}', '\u{a8f1}', NSM), ('\u{a8f2}', '\u{a8fe}', L), ('\u{a8ff}', '\u{a8ff}', NSM), ('\u{a900}', '\u{a925}', L), ('\u{a926}', '\u{a92d}', NSM), ('\u{a92e}', '\u{a946}', L), ('\u{a947}', '\u{a951}', NSM), ('\u{a952}', '\u{a953}', L), ('\u{a95f}', '\u{a97c}', L), ('\u{a980}', '\u{a982}', NSM), ('\u{a983}', '\u{a9b2}', L), ('\u{a9b3}', '\u{a9b3}', NSM), ('\u{a9b4}', '\u{a9b5}', L), ('\u{a9b6}', '\u{a9b9}', NSM), ('\u{a9ba}', '\u{a9bb}', L), ('\u{a9bc}', '\u{a9bd}', NSM), ('\u{a9be}', '\u{a9cd}', L), ('\u{a9cf}', '\u{a9d9}', L), ('\u{a9de}', '\u{a9e4}', L), ('\u{a9e5}', '\u{a9e5}', NSM), ('\u{a9e6}', '\u{a9fe}', L), ('\u{aa00}', '\u{aa28}', L), ('\u{aa29}', '\u{aa2e}', NSM), ('\u{aa2f}', '\u{aa30}', L), ('\u{aa31}', '\u{aa32}', NSM), ('\u{aa33}', '\u{aa34}', L), ('\u{aa35}', '\u{aa36}', NSM), ('\u{aa40}', '\u{aa42}', L), ('\u{aa43}', '\u{aa43}', NSM), ('\u{aa44}', '\u{aa4b}', L), ('\u{aa4c}', '\u{aa4c}', NSM), ('\u{aa4d}', '\u{aa4d}', L), ('\u{aa50}', '\u{aa59}', L), ('\u{aa5c}', '\u{aa7b}', L), ('\u{aa7c}', '\u{aa7c}', NSM), ('\u{aa7d}', '\u{aaaf}', L), 
('\u{aab0}', '\u{aab0}', NSM), ('\u{aab1}', '\u{aab1}', L), ('\u{aab2}', '\u{aab4}', NSM), ('\u{aab5}', '\u{aab6}', L), ('\u{aab7}', '\u{aab8}', NSM), ('\u{aab9}', '\u{aabd}', L), ('\u{aabe}', '\u{aabf}', NSM), ('\u{aac0}', '\u{aac0}', L), ('\u{aac1}', '\u{aac1}', NSM), ('\u{aac2}', '\u{aac2}', L), ('\u{aadb}', '\u{aaeb}', L), ('\u{aaec}', '\u{aaed}', NSM), ('\u{aaee}', '\u{aaf5}', L), ('\u{aaf6}', '\u{aaf6}', NSM), ('\u{ab01}', '\u{ab06}', L), ('\u{ab09}', '\u{ab0e}', L), ('\u{ab11}', '\u{ab16}', L), ('\u{ab20}', '\u{ab26}', L), ('\u{ab28}', '\u{ab2e}', L), ('\u{ab30}', '\u{ab69}', L), ('\u{ab6a}', '\u{ab6b}', ON), ('\u{ab70}', '\u{abe4}', L), ('\u{abe5}', '\u{abe5}', NSM), ('\u{abe6}', '\u{abe7}', L), ('\u{abe8}', '\u{abe8}', NSM), ('\u{abe9}', '\u{abec}', L), ('\u{abed}', '\u{abed}', NSM), ('\u{abf0}', '\u{abf9}', L), ('\u{ac00}', '\u{d7a3}', L), ('\u{d7b0}', '\u{d7c6}', L), ('\u{d7cb}', '\u{d7fb}', L), ('\u{e000}', '\u{fa6d}', L), ('\u{fa70}', '\u{fad9}', L), ('\u{fb00}', '\u{fb06}', L), ('\u{fb13}', '\u{fb17}', L), ('\u{fb1d}', '\u{fb1d}', R), ('\u{fb1e}', '\u{fb1e}', NSM), ('\u{fb1f}', '\u{fb28}', R), ('\u{fb29}', '\u{fb29}', ES), ('\u{fb2a}', '\u{fb4f}', R), ('\u{fb50}', '\u{fd3d}', AL), ('\u{fd3e}', '\u{fd4f}', ON), ('\u{fd50}', '\u{fdce}', AL), ('\u{fdcf}', '\u{fdcf}', ON), ('\u{fdf0}', '\u{fdfc}', AL), ('\u{fdfd}', '\u{fdff}', ON), ('\u{fe00}', '\u{fe0f}', NSM), ('\u{fe10}', '\u{fe19}', ON), ('\u{fe20}', '\u{fe2f}', NSM), ('\u{fe30}', '\u{fe4f}', ON), ('\u{fe50}', '\u{fe50}', CS), ('\u{fe51}', '\u{fe51}', ON), ('\u{fe52}', '\u{fe52}', CS), ('\u{fe54}', '\u{fe54}', ON), ('\u{fe55}', '\u{fe55}', CS), ('\u{fe56}', '\u{fe5e}', ON), ('\u{fe5f}', '\u{fe5f}', ET), ('\u{fe60}', '\u{fe61}', ON), ('\u{fe62}', '\u{fe63}', ES), ('\u{fe64}', '\u{fe66}', ON), ('\u{fe68}', '\u{fe68}', ON), ('\u{fe69}', '\u{fe6a}', ET), ('\u{fe6b}', '\u{fe6b}', ON), ('\u{fe70}', '\u{fefe}', AL), ('\u{feff}', '\u{feff}', BN), ('\u{ff01}', '\u{ff02}', ON), ('\u{ff03}', '\u{ff05}', ET), 
('\u{ff06}', '\u{ff0a}', ON), ('\u{ff0b}', '\u{ff0b}', ES), ('\u{ff0c}', '\u{ff0c}', CS), ('\u{ff0d}', '\u{ff0d}', ES), ('\u{ff0e}', '\u{ff0f}', CS), ('\u{ff10}', '\u{ff19}', EN), ('\u{ff1a}', '\u{ff1a}', CS), ('\u{ff1b}', '\u{ff20}', ON), ('\u{ff21}', '\u{ff3a}', L), ('\u{ff3b}', '\u{ff40}', ON), ('\u{ff41}', '\u{ff5a}', L), ('\u{ff5b}', '\u{ff65}', ON), ('\u{ff66}', '\u{ffbe}', L), ('\u{ffc2}', '\u{ffc7}', L), ('\u{ffca}', '\u{ffcf}', L), ('\u{ffd2}', '\u{ffd7}', L), ('\u{ffda}', '\u{ffdc}', L), ('\u{ffe0}', '\u{ffe1}', ET), ('\u{ffe2}', '\u{ffe4}', ON), ('\u{ffe5}', '\u{ffe6}', ET), ('\u{ffe8}', '\u{ffee}', ON), ('\u{fff9}', '\u{fffd}', ON), ('\u{10000}', '\u{1000b}', L), ('\u{1000d}', '\u{10026}', L), ('\u{10028}', '\u{1003a}', L), ('\u{1003c}', '\u{1003d}', L), ('\u{1003f}', '\u{1004d}', L), ('\u{10050}', '\u{1005d}', L), ('\u{10080}', '\u{100fa}', L), ('\u{10100}', '\u{10100}', L), ('\u{10101}', '\u{10101}', ON), ('\u{10102}', '\u{10102}', L), ('\u{10107}', '\u{10133}', L), ('\u{10137}', '\u{1013f}', L), ('\u{10140}', '\u{1018c}', ON), ('\u{1018d}', '\u{1018e}', L), ('\u{10190}', '\u{1019c}', ON), ('\u{101a0}', '\u{101a0}', ON), ('\u{101d0}', '\u{101fc}', L), ('\u{101fd}', '\u{101fd}', NSM), ('\u{10280}', '\u{1029c}', L), ('\u{102a0}', '\u{102d0}', L), ('\u{102e0}', '\u{102e0}', NSM), ('\u{102e1}', '\u{102fb}', EN), ('\u{10300}', '\u{10323}', L), ('\u{1032d}', '\u{1034a}', L), ('\u{10350}', '\u{10375}', L), ('\u{10376}', '\u{1037a}', NSM), ('\u{10380}', '\u{1039d}', L), ('\u{1039f}', '\u{103c3}', L), ('\u{103c8}', '\u{103d5}', L), ('\u{10400}', '\u{1049d}', L), ('\u{104a0}', '\u{104a9}', L), ('\u{104b0}', '\u{104d3}', L), ('\u{104d8}', '\u{104fb}', L), ('\u{10500}', '\u{10527}', L), ('\u{10530}', '\u{10563}', L), ('\u{1056f}', '\u{1057a}', L), ('\u{1057c}', '\u{1058a}', L), ('\u{1058c}', '\u{10592}', L), ('\u{10594}', '\u{10595}', L), ('\u{10597}', '\u{105a1}', L), ('\u{105a3}', '\u{105b1}', L), ('\u{105b3}', '\u{105b9}', L), ('\u{105bb}', '\u{105bc}', L), 
('\u{105c0}', '\u{105f3}', L), ('\u{10600}', '\u{10736}', L), ('\u{10740}', '\u{10755}', L), ('\u{10760}', '\u{10767}', L), ('\u{10780}', '\u{10785}', L), ('\u{10787}', '\u{107b0}', L), ('\u{107b2}', '\u{107ba}', L), ('\u{10800}', '\u{1091e}', R), ('\u{1091f}', '\u{1091f}', ON), ('\u{10920}', '\u{10a00}', R), ('\u{10a01}', '\u{10a03}', NSM), ('\u{10a04}', '\u{10a04}', R), ('\u{10a05}', '\u{10a06}', NSM), ('\u{10a07}', '\u{10a0b}', R), ('\u{10a0c}', '\u{10a0f}', NSM), ('\u{10a10}', '\u{10a37}', R), ('\u{10a38}', '\u{10a3a}', NSM), ('\u{10a3b}', '\u{10a3e}', R), ('\u{10a3f}', '\u{10a3f}', NSM), ('\u{10a40}', '\u{10ae4}', R), ('\u{10ae5}', '\u{10ae6}', NSM), ('\u{10ae7}', '\u{10b38}', R), ('\u{10b39}', '\u{10b3f}', ON), ('\u{10b40}', '\u{10cff}', R), ('\u{10d00}', '\u{10d23}', AL), ('\u{10d24}', '\u{10d27}', NSM), ('\u{10d28}', '\u{10d2f}', R), ('\u{10d30}', '\u{10d39}', AN), ('\u{10d3a}', '\u{10d3f}', R), ('\u{10d40}', '\u{10d49}', AN), ('\u{10d4a}', '\u{10d68}', R), ('\u{10d69}', '\u{10d6d}', NSM), ('\u{10d6e}', '\u{10d6e}', ON), ('\u{10d6f}', '\u{10e5f}', R), ('\u{10e60}', '\u{10e7e}', AN), ('\u{10e7f}', '\u{10eaa}', R), ('\u{10eab}', '\u{10eac}', NSM), ('\u{10ead}', '\u{10ec1}', R), ('\u{10ec2}', '\u{10ec4}', AL), ('\u{10ec5}', '\u{10efb}', R), ('\u{10efc}', '\u{10eff}', NSM), ('\u{10f00}', '\u{10f2f}', R), ('\u{10f30}', '\u{10f45}', AL), ('\u{10f46}', '\u{10f50}', NSM), ('\u{10f51}', '\u{10f59}', AL), ('\u{10f5a}', '\u{10f81}', R), ('\u{10f82}', '\u{10f85}', NSM), ('\u{10f86}', '\u{10fff}', R), ('\u{11000}', '\u{11000}', L), ('\u{11001}', '\u{11001}', NSM), ('\u{11002}', '\u{11037}', L), ('\u{11038}', '\u{11046}', NSM), ('\u{11047}', '\u{1104d}', L), ('\u{11052}', '\u{11065}', ON), ('\u{11066}', '\u{1106f}', L), ('\u{11070}', '\u{11070}', NSM), ('\u{11071}', '\u{11072}', L), ('\u{11073}', '\u{11074}', NSM), ('\u{11075}', '\u{11075}', L), ('\u{1107f}', '\u{11081}', NSM), ('\u{11082}', '\u{110b2}', L), ('\u{110b3}', '\u{110b6}', NSM), ('\u{110b7}', '\u{110b8}', L), 
('\u{110b9}', '\u{110ba}', NSM), ('\u{110bb}', '\u{110c1}', L), ('\u{110c2}', '\u{110c2}', NSM), ('\u{110cd}', '\u{110cd}', L), ('\u{110d0}', '\u{110e8}', L), ('\u{110f0}', '\u{110f9}', L), ('\u{11100}', '\u{11102}', NSM), ('\u{11103}', '\u{11126}', L), ('\u{11127}', '\u{1112b}', NSM), ('\u{1112c}', '\u{1112c}', L), ('\u{1112d}', '\u{11134}', NSM), ('\u{11136}', '\u{11147}', L), ('\u{11150}', '\u{11172}', L), ('\u{11173}', '\u{11173}', NSM), ('\u{11174}', '\u{11176}', L), ('\u{11180}', '\u{11181}', NSM), ('\u{11182}', '\u{111b5}', L), ('\u{111b6}', '\u{111be}', NSM), ('\u{111bf}', '\u{111c8}', L), ('\u{111c9}', '\u{111cc}', NSM), ('\u{111cd}', '\u{111ce}', L), ('\u{111cf}', '\u{111cf}', NSM), ('\u{111d0}', '\u{111df}', L), ('\u{111e1}', '\u{111f4}', L), ('\u{11200}', '\u{11211}', L), ('\u{11213}', '\u{1122e}', L), ('\u{1122f}', '\u{11231}', NSM), ('\u{11232}', '\u{11233}', L), ('\u{11234}', '\u{11234}', NSM), ('\u{11235}', '\u{11235}', L), ('\u{11236}', '\u{11237}', NSM), ('\u{11238}', '\u{1123d}', L), ('\u{1123e}', '\u{1123e}', NSM), ('\u{1123f}', '\u{11240}', L), ('\u{11241}', '\u{11241}', NSM), ('\u{11280}', '\u{11286}', L), ('\u{11288}', '\u{11288}', L), ('\u{1128a}', '\u{1128d}', L), ('\u{1128f}', '\u{1129d}', L), ('\u{1129f}', '\u{112a9}', L), ('\u{112b0}', '\u{112de}', L), ('\u{112df}', '\u{112df}', NSM), ('\u{112e0}', '\u{112e2}', L), ('\u{112e3}', '\u{112ea}', NSM), ('\u{112f0}', '\u{112f9}', L), ('\u{11300}', '\u{11301}', NSM), ('\u{11302}', '\u{11303}', L), ('\u{11305}', '\u{1130c}', L), ('\u{1130f}', '\u{11310}', L), ('\u{11313}', '\u{11328}', L), ('\u{1132a}', '\u{11330}', L), ('\u{11332}', '\u{11333}', L), ('\u{11335}', '\u{11339}', L), ('\u{1133b}', '\u{1133c}', NSM), ('\u{1133d}', '\u{1133f}', L), ('\u{11340}', '\u{11340}', NSM), ('\u{11341}', '\u{11344}', L), ('\u{11347}', '\u{11348}', L), ('\u{1134b}', '\u{1134d}', L), ('\u{11350}', '\u{11350}', L), ('\u{11357}', '\u{11357}', L), ('\u{1135d}', '\u{11363}', L), ('\u{11366}', '\u{1136c}', NSM), 
('\u{11370}', '\u{11374}', NSM), ('\u{11380}', '\u{11389}', L), ('\u{1138b}', '\u{1138b}', L), ('\u{1138e}', '\u{1138e}', L), ('\u{11390}', '\u{113b5}', L), ('\u{113b7}', '\u{113ba}', L), ('\u{113bb}', '\u{113c0}', NSM), ('\u{113c2}', '\u{113c2}', L), ('\u{113c5}', '\u{113c5}', L), ('\u{113c7}', '\u{113ca}', L), ('\u{113cc}', '\u{113cd}', L), ('\u{113ce}', '\u{113ce}', NSM), ('\u{113cf}', '\u{113cf}', L), ('\u{113d0}', '\u{113d0}', NSM), ('\u{113d1}', '\u{113d1}', L), ('\u{113d2}', '\u{113d2}', NSM), ('\u{113d3}', '\u{113d5}', L), ('\u{113d7}', '\u{113d8}', L), ('\u{113e1}', '\u{113e2}', NSM), ('\u{11400}', '\u{11437}', L), ('\u{11438}', '\u{1143f}', NSM), ('\u{11440}', '\u{11441}', L), ('\u{11442}', '\u{11444}', NSM), ('\u{11445}', '\u{11445}', L), ('\u{11446}', '\u{11446}', NSM), ('\u{11447}', '\u{1145b}', L), ('\u{1145d}', '\u{1145d}', L), ('\u{1145e}', '\u{1145e}', NSM), ('\u{1145f}', '\u{11461}', L), ('\u{11480}', '\u{114b2}', L), ('\u{114b3}', '\u{114b8}', NSM), ('\u{114b9}', '\u{114b9}', L), ('\u{114ba}', '\u{114ba}', NSM), ('\u{114bb}', '\u{114be}', L), ('\u{114bf}', '\u{114c0}', NSM), ('\u{114c1}', '\u{114c1}', L), ('\u{114c2}', '\u{114c3}', NSM), ('\u{114c4}', '\u{114c7}', L), ('\u{114d0}', '\u{114d9}', L), ('\u{11580}', '\u{115b1}', L), ('\u{115b2}', '\u{115b5}', NSM), ('\u{115b8}', '\u{115bb}', L), ('\u{115bc}', '\u{115bd}', NSM), ('\u{115be}', '\u{115be}', L), ('\u{115bf}', '\u{115c0}', NSM), ('\u{115c1}', '\u{115db}', L), ('\u{115dc}', '\u{115dd}', NSM), ('\u{11600}', '\u{11632}', L), ('\u{11633}', '\u{1163a}', NSM), ('\u{1163b}', '\u{1163c}', L), ('\u{1163d}', '\u{1163d}', NSM), ('\u{1163e}', '\u{1163e}', L), ('\u{1163f}', '\u{11640}', NSM), ('\u{11641}', '\u{11644}', L), ('\u{11650}', '\u{11659}', L), ('\u{11660}', '\u{1166c}', ON), ('\u{11680}', '\u{116aa}', L), ('\u{116ab}', '\u{116ab}', NSM), ('\u{116ac}', '\u{116ac}', L), ('\u{116ad}', '\u{116ad}', NSM), ('\u{116ae}', '\u{116af}', L), ('\u{116b0}', '\u{116b5}', NSM), ('\u{116b6}', '\u{116b6}', 
L), ('\u{116b7}', '\u{116b7}', NSM), ('\u{116b8}', '\u{116b9}', L), ('\u{116c0}', '\u{116c9}', L), ('\u{116d0}', '\u{116e3}', L), ('\u{11700}', '\u{1171a}', L), ('\u{1171d}', '\u{1171d}', NSM), ('\u{1171e}', '\u{1171e}', L), ('\u{1171f}', '\u{1171f}', NSM), ('\u{11720}', '\u{11721}', L), ('\u{11722}', '\u{11725}', NSM), ('\u{11726}', '\u{11726}', L), ('\u{11727}', '\u{1172b}', NSM), ('\u{11730}', '\u{11746}', L), ('\u{11800}', '\u{1182e}', L), ('\u{1182f}', '\u{11837}', NSM), ('\u{11838}', '\u{11838}', L), ('\u{11839}', '\u{1183a}', NSM), ('\u{1183b}', '\u{1183b}', L), ('\u{118a0}', '\u{118f2}', L), ('\u{118ff}', '\u{11906}', L), ('\u{11909}', '\u{11909}', L), ('\u{1190c}', '\u{11913}', L), ('\u{11915}', '\u{11916}', L), ('\u{11918}', '\u{11935}', L), ('\u{11937}', '\u{11938}', L), ('\u{1193b}', '\u{1193c}', NSM), ('\u{1193d}', '\u{1193d}', L), ('\u{1193e}', '\u{1193e}', NSM), ('\u{1193f}', '\u{11942}', L), ('\u{11943}', '\u{11943}', NSM), ('\u{11944}', '\u{11946}', L), ('\u{11950}', '\u{11959}', L), ('\u{119a0}', '\u{119a7}', L), ('\u{119aa}', '\u{119d3}', L), ('\u{119d4}', '\u{119d7}', NSM), ('\u{119da}', '\u{119db}', NSM), ('\u{119dc}', '\u{119df}', L), ('\u{119e0}', '\u{119e0}', NSM), ('\u{119e1}', '\u{119e4}', L), ('\u{11a00}', '\u{11a00}', L), ('\u{11a01}', '\u{11a06}', NSM), ('\u{11a07}', '\u{11a08}', L), ('\u{11a09}', '\u{11a0a}', NSM), ('\u{11a0b}', '\u{11a32}', L), ('\u{11a33}', '\u{11a38}', NSM), ('\u{11a39}', '\u{11a3a}', L), ('\u{11a3b}', '\u{11a3e}', NSM), ('\u{11a3f}', '\u{11a46}', L), ('\u{11a47}', '\u{11a47}', NSM), ('\u{11a50}', '\u{11a50}', L), ('\u{11a51}', '\u{11a56}', NSM), ('\u{11a57}', '\u{11a58}', L), ('\u{11a59}', '\u{11a5b}', NSM), ('\u{11a5c}', '\u{11a89}', L), ('\u{11a8a}', '\u{11a96}', NSM), ('\u{11a97}', '\u{11a97}', L), ('\u{11a98}', '\u{11a99}', NSM), ('\u{11a9a}', '\u{11aa2}', L), ('\u{11ab0}', '\u{11af8}', L), ('\u{11b00}', '\u{11b09}', L), ('\u{11bc0}', '\u{11be1}', L), ('\u{11bf0}', '\u{11bf9}', L), ('\u{11c00}', '\u{11c08}', 
L), ('\u{11c0a}', '\u{11c2f}', L), ('\u{11c30}', '\u{11c36}', NSM), ('\u{11c38}', '\u{11c3d}', NSM), ('\u{11c3e}', '\u{11c45}', L), ('\u{11c50}', '\u{11c6c}', L), ('\u{11c70}', '\u{11c8f}', L), ('\u{11c92}', '\u{11ca7}', NSM), ('\u{11ca9}', '\u{11ca9}', L), ('\u{11caa}', '\u{11cb0}', NSM), ('\u{11cb1}', '\u{11cb1}', L), ('\u{11cb2}', '\u{11cb3}', NSM), ('\u{11cb4}', '\u{11cb4}', L), ('\u{11cb5}', '\u{11cb6}', NSM), ('\u{11d00}', '\u{11d06}', L), ('\u{11d08}', '\u{11d09}', L), ('\u{11d0b}', '\u{11d30}', L), ('\u{11d31}', '\u{11d36}', NSM), ('\u{11d3a}', '\u{11d3a}', NSM), ('\u{11d3c}', '\u{11d3d}', NSM), ('\u{11d3f}', '\u{11d45}', NSM), ('\u{11d46}', '\u{11d46}', L), ('\u{11d47}', '\u{11d47}', NSM), ('\u{11d50}', '\u{11d59}', L), ('\u{11d60}', '\u{11d65}', L), ('\u{11d67}', '\u{11d68}', L), ('\u{11d6a}', '\u{11d8e}', L), ('\u{11d90}', '\u{11d91}', NSM), ('\u{11d93}', '\u{11d94}', L), ('\u{11d95}', '\u{11d95}', NSM), ('\u{11d96}', '\u{11d96}', L), ('\u{11d97}', '\u{11d97}', NSM), ('\u{11d98}', '\u{11d98}', L), ('\u{11da0}', '\u{11da9}', L), ('\u{11ee0}', '\u{11ef2}', L), ('\u{11ef3}', '\u{11ef4}', NSM), ('\u{11ef5}', '\u{11ef8}', L), ('\u{11f00}', '\u{11f01}', NSM), ('\u{11f02}', '\u{11f10}', L), ('\u{11f12}', '\u{11f35}', L), ('\u{11f36}', '\u{11f3a}', NSM), ('\u{11f3e}', '\u{11f3f}', L), ('\u{11f40}', '\u{11f40}', NSM), ('\u{11f41}', '\u{11f41}', L), ('\u{11f42}', '\u{11f42}', NSM), ('\u{11f43}', '\u{11f59}', L), ('\u{11f5a}', '\u{11f5a}', NSM), ('\u{11fb0}', '\u{11fb0}', L), ('\u{11fc0}', '\u{11fd4}', L), ('\u{11fd5}', '\u{11fdc}', ON), ('\u{11fdd}', '\u{11fe0}', ET), ('\u{11fe1}', '\u{11ff1}', ON), ('\u{11fff}', '\u{12399}', L), ('\u{12400}', '\u{1246e}', L), ('\u{12470}', '\u{12474}', L), ('\u{12480}', '\u{12543}', L), ('\u{12f90}', '\u{12ff2}', L), ('\u{13000}', '\u{1343f}', L), ('\u{13440}', '\u{13440}', NSM), ('\u{13441}', '\u{13446}', L), ('\u{13447}', '\u{13455}', NSM), ('\u{13460}', '\u{143fa}', L), ('\u{14400}', '\u{14646}', L), ('\u{16100}', '\u{1611d}', 
L), ('\u{1611e}', '\u{16129}', NSM), ('\u{1612a}', '\u{1612c}', L), ('\u{1612d}', '\u{1612f}', NSM), ('\u{16130}', '\u{16139}', L), ('\u{16800}', '\u{16a38}', L), ('\u{16a40}', '\u{16a5e}', L), ('\u{16a60}', '\u{16a69}', L), ('\u{16a6e}', '\u{16abe}', L), ('\u{16ac0}', '\u{16ac9}', L), ('\u{16ad0}', '\u{16aed}', L), ('\u{16af0}', '\u{16af4}', NSM), ('\u{16af5}', '\u{16af5}', L), ('\u{16b00}', '\u{16b2f}', L), ('\u{16b30}', '\u{16b36}', NSM), ('\u{16b37}', '\u{16b45}', L), ('\u{16b50}', '\u{16b59}', L), ('\u{16b5b}', '\u{16b61}', L), ('\u{16b63}', '\u{16b77}', L), ('\u{16b7d}', '\u{16b8f}', L), ('\u{16d40}', '\u{16d79}', L), ('\u{16e40}', '\u{16e9a}', L), ('\u{16f00}', '\u{16f4a}', L), ('\u{16f4f}', '\u{16f4f}', NSM), ('\u{16f50}', '\u{16f87}', L), ('\u{16f8f}', '\u{16f92}', NSM), ('\u{16f93}', '\u{16f9f}', L), ('\u{16fe0}', '\u{16fe1}', L), ('\u{16fe2}', '\u{16fe2}', ON), ('\u{16fe3}', '\u{16fe3}', L), ('\u{16fe4}', '\u{16fe4}', NSM), ('\u{16ff0}', '\u{16ff1}', L), ('\u{17000}', '\u{187f7}', L), ('\u{18800}', '\u{18cd5}', L), ('\u{18cff}', '\u{18d08}', L), ('\u{1aff0}', '\u{1aff3}', L), ('\u{1aff5}', '\u{1affb}', L), ('\u{1affd}', '\u{1affe}', L), ('\u{1b000}', '\u{1b122}', L), ('\u{1b132}', '\u{1b132}', L), ('\u{1b150}', '\u{1b152}', L), ('\u{1b155}', '\u{1b155}', L), ('\u{1b164}', '\u{1b167}', L), ('\u{1b170}', '\u{1b2fb}', L), ('\u{1bc00}', '\u{1bc6a}', L), ('\u{1bc70}', '\u{1bc7c}', L), ('\u{1bc80}', '\u{1bc88}', L), ('\u{1bc90}', '\u{1bc99}', L), ('\u{1bc9c}', '\u{1bc9c}', L), ('\u{1bc9d}', '\u{1bc9e}', NSM), ('\u{1bc9f}', '\u{1bc9f}', L), ('\u{1bca0}', '\u{1bca3}', BN), ('\u{1cc00}', '\u{1ccd5}', ON), ('\u{1ccd6}', '\u{1ccef}', L), ('\u{1ccf0}', '\u{1ccf9}', EN), ('\u{1cd00}', '\u{1ceb3}', ON), ('\u{1cf00}', '\u{1cf2d}', NSM), ('\u{1cf30}', '\u{1cf46}', NSM), ('\u{1cf50}', '\u{1cfc3}', L), ('\u{1d000}', '\u{1d0f5}', L), ('\u{1d100}', '\u{1d126}', L), ('\u{1d129}', '\u{1d166}', L), ('\u{1d167}', '\u{1d169}', NSM), ('\u{1d16a}', '\u{1d172}', L), ('\u{1d173}', 
'\u{1d17a}', BN), ('\u{1d17b}', '\u{1d182}', NSM), ('\u{1d183}', '\u{1d184}', L), ('\u{1d185}', '\u{1d18b}', NSM), ('\u{1d18c}', '\u{1d1a9}', L), ('\u{1d1aa}', '\u{1d1ad}', NSM), ('\u{1d1ae}', '\u{1d1e8}', L), ('\u{1d1e9}', '\u{1d1ea}', ON), ('\u{1d200}', '\u{1d241}', ON), ('\u{1d242}', '\u{1d244}', NSM), ('\u{1d245}', '\u{1d245}', ON), ('\u{1d2c0}', '\u{1d2d3}', L), ('\u{1d2e0}', '\u{1d2f3}', L), ('\u{1d300}', '\u{1d356}', ON), ('\u{1d360}', '\u{1d378}', L), ('\u{1d400}', '\u{1d454}', L), ('\u{1d456}', '\u{1d49c}', L), ('\u{1d49e}', '\u{1d49f}', L), ('\u{1d4a2}', '\u{1d4a2}', L), ('\u{1d4a5}', '\u{1d4a6}', L), ('\u{1d4a9}', '\u{1d4ac}', L), ('\u{1d4ae}', '\u{1d4b9}', L), ('\u{1d4bb}', '\u{1d4bb}', L), ('\u{1d4bd}', '\u{1d4c3}', L), ('\u{1d4c5}', '\u{1d505}', L), ('\u{1d507}', '\u{1d50a}', L), ('\u{1d50d}', '\u{1d514}', L), ('\u{1d516}', '\u{1d51c}', L), ('\u{1d51e}', '\u{1d539}', L), ('\u{1d53b}', '\u{1d53e}', L), ('\u{1d540}', '\u{1d544}', L), ('\u{1d546}', '\u{1d546}', L), ('\u{1d54a}', '\u{1d550}', L), ('\u{1d552}', '\u{1d6a5}', L), ('\u{1d6a8}', '\u{1d6c0}', L), ('\u{1d6c1}', '\u{1d6c1}', ON), ('\u{1d6c2}', '\u{1d6da}', L), ('\u{1d6db}', '\u{1d6db}', ON), ('\u{1d6dc}', '\u{1d6fa}', L), ('\u{1d6fb}', '\u{1d6fb}', ON), ('\u{1d6fc}', '\u{1d714}', L), ('\u{1d715}', '\u{1d715}', ON), ('\u{1d716}', '\u{1d734}', L), ('\u{1d735}', '\u{1d735}', ON), ('\u{1d736}', '\u{1d74e}', L), ('\u{1d74f}', '\u{1d74f}', ON), ('\u{1d750}', '\u{1d76e}', L), ('\u{1d76f}', '\u{1d76f}', ON), ('\u{1d770}', '\u{1d788}', L), ('\u{1d789}', '\u{1d789}', ON), ('\u{1d78a}', '\u{1d7a8}', L), ('\u{1d7a9}', '\u{1d7a9}', ON), ('\u{1d7aa}', '\u{1d7c2}', L), ('\u{1d7c3}', '\u{1d7c3}', ON), ('\u{1d7c4}', '\u{1d7cb}', L), ('\u{1d7ce}', '\u{1d7ff}', EN), ('\u{1d800}', '\u{1d9ff}', L), ('\u{1da00}', '\u{1da36}', NSM), ('\u{1da37}', '\u{1da3a}', L), ('\u{1da3b}', '\u{1da6c}', NSM), ('\u{1da6d}', '\u{1da74}', L), ('\u{1da75}', '\u{1da75}', NSM), ('\u{1da76}', '\u{1da83}', L), ('\u{1da84}', '\u{1da84}', 
NSM), ('\u{1da85}', '\u{1da8b}', L), ('\u{1da9b}', '\u{1da9f}', NSM), ('\u{1daa1}', '\u{1daaf}', NSM), ('\u{1df00}', '\u{1df1e}', L), ('\u{1df25}', '\u{1df2a}', L), ('\u{1e000}', '\u{1e006}', NSM), ('\u{1e008}', '\u{1e018}', NSM), ('\u{1e01b}', '\u{1e021}', NSM), ('\u{1e023}', '\u{1e024}', NSM), ('\u{1e026}', '\u{1e02a}', NSM), ('\u{1e030}', '\u{1e06d}', L), ('\u{1e08f}', '\u{1e08f}', NSM), ('\u{1e100}', '\u{1e12c}', L), ('\u{1e130}', '\u{1e136}', NSM), ('\u{1e137}', '\u{1e13d}', L), ('\u{1e140}', '\u{1e149}', L), ('\u{1e14e}', '\u{1e14f}', L), ('\u{1e290}', '\u{1e2ad}', L), ('\u{1e2ae}', '\u{1e2ae}', NSM), ('\u{1e2c0}', '\u{1e2eb}', L), ('\u{1e2ec}', '\u{1e2ef}', NSM), ('\u{1e2f0}', '\u{1e2f9}', L), ('\u{1e2ff}', '\u{1e2ff}', ET), ('\u{1e4d0}', '\u{1e4eb}', L), ('\u{1e4ec}', '\u{1e4ef}', NSM), ('\u{1e4f0}', '\u{1e4f9}', L), ('\u{1e5d0}', '\u{1e5ed}', L), ('\u{1e5ee}', '\u{1e5ef}', NSM), ('\u{1e5f0}', '\u{1e5fa}', L), ('\u{1e5ff}', '\u{1e5ff}', L), ('\u{1e7e0}', '\u{1e7e6}', L), ('\u{1e7e8}', '\u{1e7eb}', L), ('\u{1e7ed}', '\u{1e7ee}', L), ('\u{1e7f0}', '\u{1e7fe}', L), ('\u{1e800}', '\u{1e8cf}', R), ('\u{1e8d0}', '\u{1e8d6}', NSM), ('\u{1e8d7}', '\u{1e943}', R), ('\u{1e944}', '\u{1e94a}', NSM), ('\u{1e94b}', '\u{1ec70}', R), ('\u{1ec71}', '\u{1ecb4}', AL), ('\u{1ecb5}', '\u{1ed00}', R), ('\u{1ed01}', '\u{1ed3d}', AL), ('\u{1ed3e}', '\u{1edff}', R), ('\u{1ee00}', '\u{1eeef}', AL), ('\u{1eef0}', '\u{1eef1}', ON), ('\u{1eef2}', '\u{1eeff}', AL), ('\u{1ef00}', '\u{1efff}', R), ('\u{1f000}', '\u{1f02b}', ON), ('\u{1f030}', '\u{1f093}', ON), ('\u{1f0a0}', '\u{1f0ae}', ON), ('\u{1f0b1}', '\u{1f0bf}', ON), ('\u{1f0c1}', '\u{1f0cf}', ON), ('\u{1f0d1}', '\u{1f0f5}', ON), ('\u{1f100}', '\u{1f10a}', EN), ('\u{1f10b}', '\u{1f10f}', ON), ('\u{1f110}', '\u{1f12e}', L), ('\u{1f12f}', '\u{1f12f}', ON), ('\u{1f130}', '\u{1f169}', L), ('\u{1f16a}', '\u{1f16f}', ON), ('\u{1f170}', '\u{1f1ac}', L), ('\u{1f1ad}', '\u{1f1ad}', ON), ('\u{1f1e6}', '\u{1f202}', L), ('\u{1f210}', 
'\u{1f23b}', L), ('\u{1f240}', '\u{1f248}', L), ('\u{1f250}', '\u{1f251}', L), ('\u{1f260}', '\u{1f265}', ON), ('\u{1f300}', '\u{1f6d7}', ON), ('\u{1f6dc}', '\u{1f6ec}', ON), ('\u{1f6f0}', '\u{1f6fc}', ON), ('\u{1f700}', '\u{1f776}', ON), ('\u{1f77b}', '\u{1f7d9}', ON), ('\u{1f7e0}', '\u{1f7eb}', ON), ('\u{1f7f0}', '\u{1f7f0}', ON), ('\u{1f800}', '\u{1f80b}', ON), ('\u{1f810}', '\u{1f847}', ON), ('\u{1f850}', '\u{1f859}', ON), ('\u{1f860}', '\u{1f887}', ON), ('\u{1f890}', '\u{1f8ad}', ON), ('\u{1f8b0}', '\u{1f8bb}', ON), ('\u{1f8c0}', '\u{1f8c1}', ON), ('\u{1f900}', '\u{1fa53}', ON), ('\u{1fa60}', '\u{1fa6d}', ON), ('\u{1fa70}', '\u{1fa7c}', ON), ('\u{1fa80}', '\u{1fa89}', ON), ('\u{1fa8f}', '\u{1fac6}', ON), ('\u{1face}', '\u{1fadc}', ON), ('\u{1fadf}', '\u{1fae9}', ON), ('\u{1faf0}', '\u{1faf8}', ON), ('\u{1fb00}', '\u{1fb92}', ON), ('\u{1fb94}', '\u{1fbef}', ON), ('\u{1fbf0}', '\u{1fbf9}', EN), ('\u{20000}', '\u{2a6df}', L), ('\u{2a700}', '\u{2b739}', L), ('\u{2b740}', '\u{2b81d}', L), ('\u{2b820}', '\u{2cea1}', L), ('\u{2ceb0}', '\u{2ebe0}', L), ('\u{2ebf0}', '\u{2ee5d}', L), ('\u{2f800}', '\u{2fa1d}', L), ('\u{30000}', '\u{3134a}', L), ('\u{31350}', '\u{323af}', L), ('\u{e0001}', '\u{e0001}', BN), ('\u{e0020}', '\u{e007f}', BN), ('\u{e0100}', '\u{e01ef}', NSM), ('\u{f0000}', '\u{ffffd}', L), ('\u{100000}', '\u{10fffd}', L) ]; pub const bidi_pairs_table: &'static [(char, char, Option)] = &[ ('\u{28}', '\u{29}', None), ('\u{5b}', '\u{5d}', None), ('\u{7b}', '\u{7d}', None), ('\u{f3a}', '\u{f3b}', None), ('\u{f3c}', '\u{f3d}', None), ('\u{169b}', '\u{169c}', None), ('\u{2045}', '\u{2046}', None), ('\u{207d}', '\u{207e}', None), ('\u{208d}', '\u{208e}', None), ('\u{2308}', '\u{2309}', None), ('\u{230a}', '\u{230b}', None), ('\u{2329}', '\u{232a}', Some('\u{3008}')), ('\u{2768}', '\u{2769}', None), ('\u{276a}', '\u{276b}', None), ('\u{276c}', '\u{276d}', None), ('\u{276e}', '\u{276f}', None), ('\u{2770}', '\u{2771}', None), ('\u{2772}', '\u{2773}', None), 
('\u{2774}', '\u{2775}', None), ('\u{27c5}', '\u{27c6}', None), ('\u{27e6}', '\u{27e7}', None), ('\u{27e8}', '\u{27e9}', None), ('\u{27ea}', '\u{27eb}', None), ('\u{27ec}', '\u{27ed}', None), ('\u{27ee}', '\u{27ef}', None), ('\u{2983}', '\u{2984}', None), ('\u{2985}', '\u{2986}', None), ('\u{2987}', '\u{2988}', None), ('\u{2989}', '\u{298a}', None), ('\u{298b}', '\u{298c}', None), ('\u{298d}', '\u{2990}', None), ('\u{298f}', '\u{298e}', None), ('\u{2991}', '\u{2992}', None), ('\u{2993}', '\u{2994}', None), ('\u{2995}', '\u{2996}', None), ('\u{2997}', '\u{2998}', None), ('\u{29d8}', '\u{29d9}', None), ('\u{29da}', '\u{29db}', None), ('\u{29fc}', '\u{29fd}', None), ('\u{2e22}', '\u{2e23}', None), ('\u{2e24}', '\u{2e25}', None), ('\u{2e26}', '\u{2e27}', None), ('\u{2e28}', '\u{2e29}', None), ('\u{2e55}', '\u{2e56}', None), ('\u{2e57}', '\u{2e58}', None), ('\u{2e59}', '\u{2e5a}', None), ('\u{2e5b}', '\u{2e5c}', None), ('\u{3008}', '\u{3009}', None), ('\u{300a}', '\u{300b}', None), ('\u{300c}', '\u{300d}', None), ('\u{300e}', '\u{300f}', None), ('\u{3010}', '\u{3011}', None), ('\u{3014}', '\u{3015}', None), ('\u{3016}', '\u{3017}', None), ('\u{3018}', '\u{3019}', None), ('\u{301a}', '\u{301b}', None), ('\u{fe59}', '\u{fe5a}', None), ('\u{fe5b}', '\u{fe5c}', None), ('\u{fe5d}', '\u{fe5e}', None), ('\u{ff08}', '\u{ff09}', None), ('\u{ff3b}', '\u{ff3d}', None), ('\u{ff5b}', '\u{ff5d}', None), ('\u{ff5f}', '\u{ff60}', None), ('\u{ff62}', '\u{ff63}', None) ]; unicode-bidi-0.3.17/src/data_source.rs000064400000000000000000000043041046102023000156760ustar 00000000000000// Copyright 2015 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. use crate::BidiClass; /// This is the return value of [`BidiDataSource::bidi_matched_opening_bracket()`]. 
/// /// It represents the matching *normalized* opening bracket for a given bracket in a bracket pair, /// and whether or not that bracket is opening. #[derive(Debug, Copy, Clone)] pub struct BidiMatchedOpeningBracket { /// The corresponding opening bracket in this bracket pair, normalized /// /// In case of opening brackets, this will be the bracket itself, except for when the bracket /// is not normalized, in which case it will be the normalized form. pub opening: char, /// Whether or not the requested bracket was an opening bracket. True for opening pub is_open: bool, } /// This trait abstracts over a data source that is able to produce the Unicode Bidi class for a given /// character pub trait BidiDataSource { fn bidi_class(&self, c: char) -> BidiClass; /// If this character is a bracket according to BidiBrackets.txt, /// return the corresponding *normalized* *opening bracket* of the pair, /// and whether or not it itself is an opening bracket. /// /// This effectively buckets brackets into equivalence classes keyed on the /// normalized opening bracket. /// /// The default implementation will pull in a small amount of hardcoded data, /// regardless of the `hardcoded-data` feature. This is in part for convenience /// (since this data is small and changes less often), and in part so that this method can be /// added without needing a breaking version bump. /// Override this method in your custom data source to prevent the use of hardcoded data. fn bidi_matched_opening_bracket(&self, c: char) -> Option { crate::char_data::bidi_matched_opening_bracket(c) } } unicode-bidi-0.3.17/src/deprecated.rs000064400000000000000000000056161046102023000155140ustar 00000000000000// Copyright 2015 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. 
This file may not be copied, modified, or distributed // except according to those terms. //! This module holds deprecated assets only. use super::*; /// Find the level runs within a line and return them in visual order. /// /// NOTE: This implementation is incomplete. The algorithm needs information about the text, /// including original `BidiClass` property of each character, to be able to perform correctly. /// Please see [`BidiInfo::visual_runs()`](../struct.BidiInfo.html#method.visual_runs) for the /// improved implementation. /// /// `line` is a range of bytes indices within `levels`. /// /// #[deprecated( since = "0.3.0", note = "please use `BidiInfo::visual_runs()` instead." )] pub fn visual_runs(line: Range, levels: &[Level]) -> Vec { assert!(line.start <= levels.len()); assert!(line.end <= levels.len()); let mut runs = Vec::new(); // Find consecutive level runs. let mut start = line.start; let mut run_level = levels[start]; let mut min_level = run_level; let mut max_level = run_level; for (i, &new_level) in levels.iter().enumerate().take(line.end).skip(start + 1) { if new_level != run_level { // End of the previous run, start of a new one. runs.push(start..i); start = i; run_level = new_level; min_level = cmp::min(run_level, min_level); max_level = cmp::max(run_level, max_level); } } runs.push(start..line.end); let run_count = runs.len(); // Re-order the odd runs. // // Stop at the lowest *odd* level. min_level = min_level.new_lowest_ge_rtl().expect("Level error"); while max_level >= min_level { // Look for the start of a sequence of consecutive runs of max_level or higher. let mut seq_start = 0; while seq_start < run_count { if levels[runs[seq_start].start] < max_level { seq_start += 1; continue; } // Found the start of a sequence. Now find the end. let mut seq_end = seq_start + 1; while seq_end < run_count && levels[runs[seq_end].start] >= max_level { seq_end += 1; } // Reverse the runs within this sequence. 
runs[seq_start..seq_end].reverse(); seq_start = seq_end; } max_level .lower(1) .expect("Lowering embedding level below zero"); } runs } unicode-bidi-0.3.17/src/explicit.rs000064400000000000000000000201221046102023000152220ustar 00000000000000// Copyright 2015 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! 3.3.2 Explicit Levels and Directions //! //! #[cfg(feature = "smallvec")] use smallvec::{smallvec, SmallVec}; use super::char_data::{ is_rtl, BidiClass::{self, *}, }; use super::level::Level; use super::prepare::removed_by_x9; use super::LevelRunVec; use super::TextSource; /// Compute explicit embedding levels for one paragraph of text (X1-X8), and identify /// level runs (BD7) for use when determining Isolating Run Sequences (X10). /// /// `processing_classes[i]` must contain the `BidiClass` of the char at byte index `i`, /// for each char in `text`. /// /// `runs` returns the list of level runs (BD7) of the text. 
#[cfg_attr(feature = "flame_it", flamer::flame)] pub fn compute<'a, T: TextSource<'a> + ?Sized>( text: &'a T, para_level: Level, original_classes: &[BidiClass], levels: &mut [Level], processing_classes: &mut [BidiClass], runs: &mut LevelRunVec, ) { assert_eq!(text.len(), original_classes.len()); // #[cfg(feature = "smallvec")] let mut stack: SmallVec<[Status; 8]> = smallvec![Status { level: para_level, status: OverrideStatus::Neutral, }]; #[cfg(not(feature = "smallvec"))] let mut stack = vec![Status { level: para_level, status: OverrideStatus::Neutral, }]; let mut overflow_isolate_count = 0u32; let mut overflow_embedding_count = 0u32; let mut valid_isolate_count = 0u32; let mut current_run_level = Level::ltr(); let mut current_run_start = 0; for (i, len) in text.indices_lengths() { let last = stack.last().unwrap(); match original_classes[i] { // Rules X2-X5c RLE | LRE | RLO | LRO | RLI | LRI | FSI => { // levels[i] = last.level; // X5a-X5c: Isolate initiators get the level of the last entry on the stack. let is_isolate = matches!(original_classes[i], RLI | LRI | FSI); if is_isolate { // Redundant due to "Retaining explicit formatting characters" step. // levels[i] = last.level; match last.status { OverrideStatus::RTL => processing_classes[i] = R, OverrideStatus::LTR => processing_classes[i] = L, _ => {} } } let new_level = if is_rtl(original_classes[i]) { last.level.new_explicit_next_rtl() } else { last.level.new_explicit_next_ltr() }; if new_level.is_ok() && overflow_isolate_count == 0 && overflow_embedding_count == 0 { let new_level = new_level.unwrap(); stack.push(Status { level: new_level, status: match original_classes[i] { RLO => OverrideStatus::RTL, LRO => OverrideStatus::LTR, RLI | LRI | FSI => OverrideStatus::Isolate, _ => OverrideStatus::Neutral, }, }); if is_isolate { valid_isolate_count += 1; } else { // The spec doesn't explicitly mention this step, but it is necessary. // See the reference implementations for comparison. 
levels[i] = new_level; } } else if is_isolate { overflow_isolate_count += 1; } else if overflow_isolate_count == 0 { overflow_embedding_count += 1; } if !is_isolate { // X9 + // // (PDF handled below) processing_classes[i] = BN; } } // PDI => { if overflow_isolate_count > 0 { overflow_isolate_count -= 1; } else if valid_isolate_count > 0 { overflow_embedding_count = 0; while !matches!( stack.pop(), None | Some(Status { status: OverrideStatus::Isolate, .. }) ) {} valid_isolate_count -= 1; } let last = stack.last().unwrap(); levels[i] = last.level; match last.status { OverrideStatus::RTL => processing_classes[i] = R, OverrideStatus::LTR => processing_classes[i] = L, _ => {} } } // PDF => { if overflow_isolate_count > 0 { // do nothing } else if overflow_embedding_count > 0 { overflow_embedding_count -= 1; } else if last.status != OverrideStatus::Isolate && stack.len() >= 2 { stack.pop(); } // levels[i] = stack.last().unwrap().level; // X9 part of retaining explicit formatting characters. processing_classes[i] = BN; } // Nothing. // BN case moved down to X6, see B => {} // _ => { levels[i] = last.level; // This condition is not in the spec, but I am pretty sure that is a spec bug. // https://www.unicode.org/L2/L2023/23014-amd-to-uax9.pdf if original_classes[i] != BN { match last.status { OverrideStatus::RTL => processing_classes[i] = R, OverrideStatus::LTR => processing_classes[i] = L, _ => {} } } } } // Handle multi-byte characters. for j in 1..len { levels[i + j] = levels[i]; processing_classes[i + j] = processing_classes[i]; } // Identify level runs to be passed to prepare::isolating_run_sequences(). if i == 0 { // Initialize for the first (or only) run. current_run_level = levels[i]; } else { // Check if we need to start a new level run. // if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level { // End the last run and start a new one. 
runs.push(current_run_start..i); current_run_level = levels[i]; current_run_start = i; } } } // Append the trailing level run, if non-empty. if levels.len() > current_run_start { runs.push(current_run_start..levels.len()); } } /// Entries in the directional status stack: struct Status { level: Level, status: OverrideStatus, } #[derive(PartialEq)] enum OverrideStatus { Neutral, RTL, LTR, Isolate, } unicode-bidi-0.3.17/src/format_chars.rs000064400000000000000000000025271046102023000160620ustar 00000000000000// Copyright 2017 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Directional Formatting Characters //! //! // == Implicit == /// ARABIC LETTER MARK pub const ALM: char = '\u{061C}'; /// LEFT-TO-RIGHT MARK pub const LRM: char = '\u{200E}'; /// RIGHT-TO-LEFT MARK pub const RLM: char = '\u{200F}'; // == Explicit Isolates == /// LEFT‑TO‑RIGHT ISOLATE pub const LRI: char = '\u{2066}'; /// RIGHT‑TO‑LEFT ISOLATE pub const RLI: char = '\u{2067}'; /// FIRST STRONG ISOLATE pub const FSI: char = '\u{2068}'; /// POP DIRECTIONAL ISOLATE pub const PDI: char = '\u{2069}'; // == Explicit Embeddings and Overrides == /// LEFT-TO-RIGHT EMBEDDING pub const LRE: char = '\u{202A}'; /// RIGHT-TO-LEFT EMBEDDING pub const RLE: char = '\u{202B}'; /// POP DIRECTIONAL FORMATTING pub const PDF: char = '\u{202C}'; /// LEFT-TO-RIGHT OVERRIDE pub const LRO: char = '\u{202D}'; /// RIGHT-TO-LEFT OVERRIDE pub const RLO: char = '\u{202E}'; unicode-bidi-0.3.17/src/implicit.rs000064400000000000000000000637601046102023000152320ustar 00000000000000// Copyright 2015 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. 
// // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! 3.3.4 - 3.3.6. Resolve implicit levels and types. #[cfg(not(feature = "smallvec"))] use alloc::vec::Vec; use core::cmp::max; #[cfg(feature = "smallvec")] use smallvec::SmallVec; use super::char_data::BidiClass::{self, *}; use super::level::Level; use super::prepare::{not_removed_by_x9, IsolatingRunSequence}; use super::{BidiDataSource, TextSource}; /// 3.3.4 Resolving Weak Types /// /// #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn resolve_weak<'a, T: TextSource<'a> + ?Sized>( text: &'a T, sequence: &IsolatingRunSequence, processing_classes: &mut [BidiClass], ) { // Note: The spec treats these steps as individual passes that are applied one after the other // on the entire IsolatingRunSequence at once. We instead collapse it into a single iteration, // which is straightforward for rules that are based on the state of the current character, but not // for rules that care about surrounding characters. To deal with them, we retain additional state // about previous character classes that may have since been changed by later rules. // The previous class for the purposes of rule W4/W6, not tracking changes made after or during W4. let mut prev_class_before_w4 = sequence.sos; // The previous class for the purposes of rule W5. let mut prev_class_before_w5 = sequence.sos; // The previous class for the purposes of rule W1, not tracking changes from any other rules. 
let mut prev_class_before_w1 = sequence.sos; let mut last_strong_is_al = false; #[cfg(feature = "smallvec")] let mut et_run_indices = SmallVec::<[usize; 8]>::new(); // for W5 #[cfg(not(feature = "smallvec"))] let mut et_run_indices = Vec::new(); // for W5 #[cfg(feature = "smallvec")] let mut bn_run_indices = SmallVec::<[usize; 8]>::new(); // for W5 + #[cfg(not(feature = "smallvec"))] let mut bn_run_indices = Vec::new(); // for W5 + for (run_index, level_run) in sequence.runs.iter().enumerate() { for i in &mut level_run.clone() { if processing_classes[i] == BN { // // Keeps track of bn runs for W5 in case we see an ET. bn_run_indices.push(i); // BNs aren't real, skip over them. continue; } // Store the processing class of all rules before W2/W1. // Used to keep track of the last strong character for W2. W3 is able to insert new strong // characters, so we don't want to be misled by it. let mut w2_processing_class = processing_classes[i]; // // if processing_classes[i] == NSM { processing_classes[i] = match prev_class_before_w1 { RLI | LRI | FSI | PDI => ON, _ => prev_class_before_w1, }; // W1 occurs before W2, update this. w2_processing_class = processing_classes[i]; } prev_class_before_w1 = processing_classes[i]; // // // match processing_classes[i] { EN => { if last_strong_is_al { // W2. If previous strong char was AL, change EN to AN. processing_classes[i] = AN; } } // W3. AL => processing_classes[i] = R, _ => {} } // update last_strong_is_al. match w2_processing_class { L | R => { last_strong_is_al = false; } AL => { last_strong_is_al = true; } _ => {} } let class_before_w456 = processing_classes[i]; // // // (separators only) // (see below for W6 terminator code) // match processing_classes[i] { // EN => { // W5. If a run of ETs is adjacent to an EN, change the ETs to EN. for j in &et_run_indices { processing_classes[*j] = EN; } et_run_indices.clear(); } // // ES | CS => { // See https://github.com/servo/unicode-bidi/issues/86 for improving this. 
// We want to make sure we check the correct next character by skipping past the rest // of this one. if let Some((_, char_len)) = text.char_at(i) { let mut next_class = sequence .iter_forwards_from(i + char_len, run_index) .map(|j| processing_classes[j]) // .find(not_removed_by_x9) .unwrap_or(sequence.eos); if next_class == EN && last_strong_is_al { // Apply W2 to next_class. We know that last_strong_is_al // has no chance of changing on this character so we can still assume its value // will be the same by the time we get to it. next_class = AN; } processing_classes[i] = match (prev_class_before_w4, processing_classes[i], next_class) { // W4 (EN, ES, EN) | (EN, CS, EN) => EN, // W4 (AN, CS, AN) => AN, // W6 (separators only) (_, _, _) => ON, }; // W6 + // We have to do this before W5 gets its grubby hands on these characters and thinks // they're part of an ET run. // We check for ON to ensure that we had hit the W6 branch above, since this `ES | CS` match // arm handles both W4 and W6. if processing_classes[i] == ON { for idx in sequence.iter_backwards_from(i, run_index) { let class = &mut processing_classes[idx]; if *class != BN { break; } *class = ON; } for idx in sequence.iter_forwards_from(i + char_len, run_index) { let class = &mut processing_classes[idx]; if *class != BN { break; } *class = ON; } } } else { // We're in the middle of a character, copy over work done for previous bytes // since it's going to be the same answer. processing_classes[i] = processing_classes[i - 1]; } } // ET => { match prev_class_before_w5 { EN => processing_classes[i] = EN, _ => { // // If there was a BN run before this, that's now a part of this ET run. et_run_indices.extend(bn_run_indices.clone()); // In case this is followed by an EN. et_run_indices.push(i); } } } _ => {} } // Common loop iteration code // // // BN runs would have already continued the loop, clear them before we get to the next one. 
bn_run_indices.clear(); // W6 above only deals with separators, so it doesn't change anything W5 cares about, // so we still can update this after running that part of W6. prev_class_before_w5 = processing_classes[i]; // (terminators only) // (see above for W6 separator code) // if prev_class_before_w5 != ET { // W6. If we didn't find an adjacent EN, turn any ETs into ON instead. for j in &et_run_indices { processing_classes[*j] = ON; } et_run_indices.clear(); } // We stashed this before W4/5/6 could get their grubby hands on it, and it's not // used in the W6 terminator code below so we can update it now. prev_class_before_w4 = class_before_w456; } } // Rerun this check in case we ended with a sequence of BNs (i.e., we'd never // hit the end of the for loop above). // W6. If we didn't find an adjacent EN, turn any ETs into ON instead. for j in &et_run_indices { processing_classes[*j] = ON; } et_run_indices.clear(); // W7. If the previous strong char was L, change EN to L. let mut last_strong_is_l = sequence.sos == L; for i in sequence.runs.iter().cloned().flatten() { match processing_classes[i] { EN if last_strong_is_l => { processing_classes[i] = L; } L => { last_strong_is_l = true; } R | AL => { last_strong_is_l = false; } // // Already scanning past BN here. _ => {} } } } #[cfg(feature = "smallvec")] type BracketPairVec = SmallVec<[BracketPair; 8]>; #[cfg(not(feature = "smallvec"))] type BracketPairVec = Vec; /// 3.3.5 Resolving Neutral Types /// /// #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn resolve_neutral<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( text: &'a T, data_source: &D, sequence: &IsolatingRunSequence, levels: &[Level], original_classes: &[BidiClass], processing_classes: &mut [BidiClass], ) { // e = embedding direction let e: BidiClass = levels[sequence.runs[0].start].bidi_class(); let not_e = if e == BidiClass::L { BidiClass::R } else { BidiClass::L }; // N0. Process bracket pairs. 
// > Identify the bracket pairs in the current isolating run sequence according to BD16. // We use processing_classes, not original_classes, due to BD14/BD15 let mut bracket_pairs = BracketPairVec::new(); identify_bracket_pairs( text, data_source, sequence, processing_classes, &mut bracket_pairs, ); // > For each bracket-pair element in the list of pairs of text positions // // Note: Rust ranges are interpreted as [start..end), be careful using `pair` directly // for indexing as it will include the opening bracket pair but not the closing one. for pair in bracket_pairs { #[cfg(feature = "std")] debug_assert!( pair.start < processing_classes.len(), "identify_bracket_pairs returned a range that is out of bounds!" ); #[cfg(feature = "std")] debug_assert!( pair.end < processing_classes.len(), "identify_bracket_pairs returned a range that is out of bounds!" ); let mut found_e = false; let mut found_not_e = false; let mut class_to_set = None; let start_char_len = T::char_len(text.subrange(pair.start..pair.end).chars().next().unwrap()); // > Inspect the bidirectional types of the characters enclosed within the bracket pair. // // `pair` is [start, end) so we will end up processing the opening character but not the closing one. // for enclosed_i in sequence.iter_forwards_from(pair.start + start_char_len, pair.start_run) { if enclosed_i >= pair.end { #[cfg(feature = "std")] debug_assert!( enclosed_i == pair.end, "If we skipped past this, the iterator is broken" ); break; } let class = processing_classes[enclosed_i]; if class == e { found_e = true; } else if class == not_e { found_not_e = true; } else if matches!(class, BidiClass::EN | BidiClass::AN) { // > Within this scope, bidirectional types EN and AN are treated as R. if e == BidiClass::L { found_not_e = true; } else { found_e = true; } } // If we have found a character with the class of the embedding direction // we can bail early. 
if found_e { break; } } // > If any strong type (either L or R) matching the embedding direction is found if found_e { // > .. set the type for both brackets in the pair to match the embedding direction class_to_set = Some(e); // > Otherwise, if there is a strong type it must be opposite the embedding direction } else if found_not_e { // > Therefore, test for an established context with a preceding strong type by // > checking backwards before the opening paired bracket // > until the first strong type (L, R, or sos) is found. // (see note above about processing_classes and character boundaries) let mut previous_strong = sequence .iter_backwards_from(pair.start, pair.start_run) .map(|i| processing_classes[i]) .find(|class| { matches!( class, BidiClass::L | BidiClass::R | BidiClass::EN | BidiClass::AN ) }) .unwrap_or(sequence.sos); // > Within this scope, bidirectional types EN and AN are treated as R. if matches!(previous_strong, BidiClass::EN | BidiClass::AN) { previous_strong = BidiClass::R; } // > If the preceding strong type is also opposite the embedding direction, // > context is established, // > so set the type for both brackets in the pair to that direction. // AND // > Otherwise set the type for both brackets in the pair to the embedding direction. // > Either way it gets set to previous_strong // // Both branches amount to setting the type to the strong type. class_to_set = Some(previous_strong); } if let Some(class_to_set) = class_to_set { // Update all processing classes corresponding to the start and end elements, as requested. // We should include all bytes of the character, not the first one. 
let end_char_len = T::char_len(text.subrange(pair.end..text.len()).chars().next().unwrap()); for class in &mut processing_classes[pair.start..pair.start + start_char_len] { *class = class_to_set; } for class in &mut processing_classes[pair.end..pair.end + end_char_len] { *class = class_to_set; } // for idx in sequence.iter_backwards_from(pair.start, pair.start_run) { let class = &mut processing_classes[idx]; if *class != BN { break; } *class = class_to_set; } // > Any number of characters that had original bidirectional character type NSM prior to the application of // > W1 that immediately follow a paired bracket which changed to L or R under N0 should change to match the type of their preceding bracket. // This rule deals with sequences of NSMs, so we can just update them all at once, we don't need to worry // about character boundaries. We do need to be careful to skip the full set of bytes for the parentheses characters. let nsm_start = pair.start + start_char_len; for idx in sequence.iter_forwards_from(nsm_start, pair.start_run) { let class = original_classes[idx]; if class == BidiClass::NSM || processing_classes[idx] == BN { processing_classes[idx] = class_to_set; } else { break; } } let nsm_end = pair.end + end_char_len; for idx in sequence.iter_forwards_from(nsm_end, pair.end_run) { let class = original_classes[idx]; if class == BidiClass::NSM || processing_classes[idx] == BN { processing_classes[idx] = class_to_set; } else { break; } } } // > Otherwise, there are no strong types within the bracket pair // > Therefore, do not set the type for that bracket pair } // N1 and N2. // Indices of every byte in this isolating run sequence let mut indices = sequence.runs.iter().flat_map(Clone::clone); let mut prev_class = sequence.sos; while let Some(mut i) = indices.next() { // Process sequences of NI characters. 
#[cfg(feature = "smallvec")] let mut ni_run = SmallVec::<[usize; 8]>::new(); #[cfg(not(feature = "smallvec"))] let mut ni_run = Vec::new(); // The BN is for if is_NI(processing_classes[i]) || processing_classes[i] == BN { // Consume a run of consecutive NI characters. ni_run.push(i); let mut next_class; loop { match indices.next() { Some(j) => { i = j; next_class = processing_classes[j]; // The BN is for if is_NI(next_class) || next_class == BN { ni_run.push(i); } else { break; } } None => { next_class = sequence.eos; break; } }; } // N1-N2. // // // let new_class = match (prev_class, next_class) { (L, L) => L, (R, R) | (R, AN) | (R, EN) | (AN, R) | (AN, AN) | (AN, EN) | (EN, R) | (EN, AN) | (EN, EN) => R, (_, _) => e, }; for j in &ni_run { processing_classes[*j] = new_class; } ni_run.clear(); } prev_class = processing_classes[i]; } } struct BracketPair { /// The text-relative index of the opening bracket. start: usize, /// The text-relative index of the closing bracket. end: usize, /// The index of the run (in the run sequence) that the opening bracket is in. start_run: usize, /// The index of the run (in the run sequence) that the closing bracket is in. end_run: usize, } /// 3.1.3 Identifying Bracket Pairs /// /// Returns all paired brackets in the source, as indices into the /// text source. /// /// fn identify_bracket_pairs<'a, T: TextSource<'a> + ?Sized, D: BidiDataSource>( text: &'a T, data_source: &D, run_sequence: &IsolatingRunSequence, original_classes: &[BidiClass], bracket_pairs: &mut BracketPairVec, ) { #[cfg(feature = "smallvec")] let mut stack = SmallVec::<[(char, usize, usize); 8]>::new(); #[cfg(not(feature = "smallvec"))] let mut stack = Vec::new(); for (run_index, level_run) in run_sequence.runs.iter().enumerate() { for (i, ch) in text.subrange(level_run.clone()).char_indices() { let actual_index = level_run.start + i; // All paren characters are ON. 
// From BidiBrackets.txt: // > The Unicode property value stability policy guarantees that characters // > which have bpt=o or bpt=c also have bc=ON and Bidi_M=Y if original_classes[actual_index] != BidiClass::ON { continue; } if let Some(matched) = data_source.bidi_matched_opening_bracket(ch) { if matched.is_open { // > If an opening paired bracket is found ... // > ... and there is no room in the stack, // > stop processing BD16 for the remainder of the isolating run sequence. if stack.len() >= 63 { break; } // > ... push its Bidi_Paired_Bracket property value and its text position onto the stack stack.push((matched.opening, actual_index, run_index)) } else { // > If a closing paired bracket is found, do the following // > Declare a variable that holds a reference to the current stack element // > and initialize it with the top element of the stack. // AND // > Else, if the current stack element is not at the bottom of the stack for (stack_index, element) in stack.iter().enumerate().rev() { // > Compare the closing paired bracket being inspected or its canonical // > equivalent to the bracket in the current stack element. if element.0 == matched.opening { // > If the values match, meaning the two characters form a bracket pair, then // > Append the text position in the current stack element together with the // > text position of the closing paired bracket to the list. let pair = BracketPair { start: element.1, end: actual_index, start_run: element.2, end_run: run_index, }; bracket_pairs.push(pair); // > Pop the stack through the current stack element inclusively. stack.truncate(stack_index); break; } } } } } } // > Sort the list of pairs of text positions in ascending order based on // > the text position of the opening paired bracket. bracket_pairs.sort_by_key(|r| r.start); } /// 3.3.6 Resolving Implicit Levels /// /// Returns the maximum embedding level in the paragraph. 
/// /// #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn resolve_levels(processing_classes: &[BidiClass], levels: &mut [Level]) -> Level { let mut max_level = Level::ltr(); assert_eq!(processing_classes.len(), levels.len()); for i in 0..levels.len() { match (levels[i].is_rtl(), processing_classes[i]) { (false, AN) | (false, EN) => levels[i].raise(2).expect("Level number error"), (false, R) | (true, L) | (true, EN) | (true, AN) => { levels[i].raise(1).expect("Level number error") } // handled here (_, _) => {} } max_level = max(max_level, levels[i]); } max_level } /// Neutral or Isolate formatting character (B, S, WS, ON, FSI, LRI, RLI, PDI) /// /// #[allow(non_snake_case)] fn is_NI(class: BidiClass) -> bool { matches!(class, B | S | WS | ON | FSI | LRI | RLI | PDI) } unicode-bidi-0.3.17/src/level.rs000064400000000000000000000273441046102023000145250ustar 00000000000000// Copyright 2017 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! Bidi Embedding Level //! //! See [`Level`](struct.Level.html) for more details. //! //! use alloc::{ string::{String, ToString}, vec::Vec, }; use core::slice; use super::char_data::BidiClass; /// Embedding Level /// /// Embedding Levels are numbers between 0 and 126 (inclusive), where even values denote a /// left-to-right (LTR) direction and odd values a right-to-left (RTL) direction. /// /// This struct maintains a *valid* status for level numbers, meaning that creating a new level, or /// mutating an existing level, with the value smaller than `0` (before conversion to `u8`) or /// larger than 125 results in an `Error`. 
/// /// #[derive(Copy, Clone, Debug, Eq, Ord, PartialEq, PartialOrd)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[repr(transparent)] pub struct Level(u8); pub const LTR_LEVEL: Level = Level(0); pub const RTL_LEVEL: Level = Level(1); const MAX_DEPTH: u8 = 125; /// During explicit level resolution, embedding level can go as high as `max_depth`. pub const MAX_EXPLICIT_DEPTH: u8 = MAX_DEPTH; /// During implicit level resolution, embedding level can go as high as `max_depth + 1`. pub const MAX_IMPLICIT_DEPTH: u8 = MAX_DEPTH + 1; /// Errors that can occur on Level creation or mutation #[derive(Debug, PartialEq)] pub enum Error { /// Out-of-range (invalid) embedding level number. OutOfRangeNumber, } impl Level { /// New LTR level with smallest number value (0). #[inline] pub fn ltr() -> Level { LTR_LEVEL } /// New RTL level with smallest number value (1). #[inline] pub fn rtl() -> Level { RTL_LEVEL } /// Maximum depth of the directional status stack during implicit resolutions. pub fn max_implicit_depth() -> u8 { MAX_IMPLICIT_DEPTH } /// Maximum depth of the directional status stack during explicit resolutions. pub fn max_explicit_depth() -> u8 { MAX_EXPLICIT_DEPTH } // == Inquiries == /// Create new level, fail if number is larger than `max_depth + 1`. #[inline] pub fn new(number: u8) -> Result { if number <= MAX_IMPLICIT_DEPTH { Ok(Level(number)) } else { Err(Error::OutOfRangeNumber) } } /// Create new level, fail if number is larger than `max_depth`. #[inline] pub fn new_explicit(number: u8) -> Result { if number <= MAX_EXPLICIT_DEPTH { Ok(Level(number)) } else { Err(Error::OutOfRangeNumber) } } // == Inquiries == /// The level number. #[inline] pub fn number(&self) -> u8 { self.0 } /// If this level is left-to-right. #[inline] pub fn is_ltr(&self) -> bool { self.0 % 2 == 0 } /// If this level is right-to-left. 
#[inline] pub fn is_rtl(&self) -> bool { self.0 % 2 == 1 } // == Mutators == /// Raise level by `amount`, fail if number is larger than `max_depth + 1`. #[inline] pub fn raise(&mut self, amount: u8) -> Result<(), Error> { match self.0.checked_add(amount) { Some(number) => { if number <= MAX_IMPLICIT_DEPTH { self.0 = number; Ok(()) } else { Err(Error::OutOfRangeNumber) } } None => Err(Error::OutOfRangeNumber), } } /// Raise level by `amount`, fail if number is larger than `max_depth`. #[inline] pub fn raise_explicit(&mut self, amount: u8) -> Result<(), Error> { match self.0.checked_add(amount) { Some(number) => { if number <= MAX_EXPLICIT_DEPTH { self.0 = number; Ok(()) } else { Err(Error::OutOfRangeNumber) } } None => Err(Error::OutOfRangeNumber), } } /// Lower level by `amount`, fail if number goes below zero. #[inline] pub fn lower(&mut self, amount: u8) -> Result<(), Error> { match self.0.checked_sub(amount) { Some(number) => { self.0 = number; Ok(()) } None => Err(Error::OutOfRangeNumber), } } // == Helpers == /// The next LTR (even) level greater than this, or fail if number is larger than `max_depth`. #[inline] pub fn new_explicit_next_ltr(&self) -> Result { Level::new_explicit((self.0 + 2) & !1) } /// The next RTL (odd) level greater than this, or fail if number is larger than `max_depth`. #[inline] pub fn new_explicit_next_rtl(&self) -> Result { Level::new_explicit((self.0 + 1) | 1) } /// The lowest RTL (odd) level greater than or equal to this, or fail if number is larger than /// `max_depth + 1`. #[inline] pub fn new_lowest_ge_rtl(&self) -> Result { Level::new(self.0 | 1) } /// Generate a character type based on a level (as specified in steps X10 and N2). 
#[inline] pub fn bidi_class(&self) -> BidiClass { if self.is_rtl() { BidiClass::R } else { BidiClass::L } } pub fn vec(v: &[u8]) -> Vec { v.iter().map(|&x| x.into()).collect() } /// Converts a byte slice to a slice of Levels /// /// Does _not_ check if each level is within bounds (`<=` [`MAX_IMPLICIT_DEPTH`]), /// which is not a requirement for safety but is a requirement for correctness of the algorithm. pub fn from_slice_unchecked(v: &[u8]) -> &[Level] { debug_assert_eq!(core::mem::size_of::(), core::mem::size_of::()); unsafe { // Safety: The two arrays are the same size and layout-compatible since // Level is `repr(transparent)` over `u8` slice::from_raw_parts(v as *const [u8] as *const u8 as *const Level, v.len()) } } } /// If levels has any RTL (odd) level /// /// This information is usually used to skip re-ordering of text when no RTL level is present #[inline] pub fn has_rtl(levels: &[Level]) -> bool { levels.iter().any(|&lvl| lvl.is_rtl()) } impl From for u8 { /// Convert to the level number #[inline] fn from(val: Level) -> Self { val.number() } } impl From for Level { /// Create level by number #[inline] fn from(number: u8) -> Level { Level::new(number).expect("Level number error") } } /// Used for matching levels in conformance tests impl<'a> PartialEq<&'a str> for Level { #[inline] fn eq(&self, s: &&'a str) -> bool { *s == "x" || *s == self.0.to_string() } } /// Used for matching levels in conformance tests impl PartialEq for Level { #[inline] fn eq(&self, s: &String) -> bool { self == &s.as_str() } } #[cfg(test)] mod tests { use super::*; #[test] fn test_new() { assert_eq!(Level::new(0), Ok(Level(0))); assert_eq!(Level::new(1), Ok(Level(1))); assert_eq!(Level::new(10), Ok(Level(10))); assert_eq!(Level::new(125), Ok(Level(125))); assert_eq!(Level::new(126), Ok(Level(126))); assert_eq!(Level::new(127), Err(Error::OutOfRangeNumber)); assert_eq!(Level::new(255), Err(Error::OutOfRangeNumber)); } #[test] fn test_new_explicit() { 
assert_eq!(Level::new_explicit(0), Ok(Level(0))); assert_eq!(Level::new_explicit(1), Ok(Level(1))); assert_eq!(Level::new_explicit(10), Ok(Level(10))); assert_eq!(Level::new_explicit(125), Ok(Level(125))); assert_eq!(Level::new_explicit(126), Err(Error::OutOfRangeNumber)); assert_eq!(Level::new_explicit(255), Err(Error::OutOfRangeNumber)); } #[test] fn test_is_ltr() { assert_eq!(Level(0).is_ltr(), true); assert_eq!(Level(1).is_ltr(), false); assert_eq!(Level(10).is_ltr(), true); assert_eq!(Level(11).is_ltr(), false); assert_eq!(Level(124).is_ltr(), true); assert_eq!(Level(125).is_ltr(), false); } #[test] fn test_is_rtl() { assert_eq!(Level(0).is_rtl(), false); assert_eq!(Level(1).is_rtl(), true); assert_eq!(Level(10).is_rtl(), false); assert_eq!(Level(11).is_rtl(), true); assert_eq!(Level(124).is_rtl(), false); assert_eq!(Level(125).is_rtl(), true); } #[test] fn test_raise() { let mut level = Level::ltr(); assert_eq!(level.number(), 0); assert!(level.raise(100).is_ok()); assert_eq!(level.number(), 100); assert!(level.raise(26).is_ok()); assert_eq!(level.number(), 126); assert!(level.raise(1).is_err()); // invalid! assert!(level.raise(250).is_err()); // overflow! assert_eq!(level.number(), 126); } #[test] fn test_raise_explicit() { let mut level = Level::ltr(); assert_eq!(level.number(), 0); assert!(level.raise_explicit(100).is_ok()); assert_eq!(level.number(), 100); assert!(level.raise_explicit(25).is_ok()); assert_eq!(level.number(), 125); assert!(level.raise_explicit(1).is_err()); // invalid! assert!(level.raise_explicit(250).is_err()); // overflow! assert_eq!(level.number(), 125); } #[test] fn test_lower() { let mut level = Level::rtl(); assert_eq!(level.number(), 1); assert!(level.lower(1).is_ok()); assert_eq!(level.number(), 0); assert!(level.lower(1).is_err()); // underflow! assert!(level.lower(250).is_err()); // underflow! 
assert_eq!(level.number(), 0); } #[test] fn test_has_rtl() { assert_eq!(has_rtl(&Level::vec(&[0, 0, 0])), false); assert_eq!(has_rtl(&Level::vec(&[0, 1, 0])), true); assert_eq!(has_rtl(&Level::vec(&[0, 2, 0])), false); assert_eq!(has_rtl(&Level::vec(&[0, 125, 0])), true); assert_eq!(has_rtl(&Level::vec(&[0, 126, 0])), false); } #[test] fn test_into() { let level = Level::rtl(); let number: u8 = level.into(); assert_eq!(1u8, number); } #[test] fn test_vec() { assert_eq!( Level::vec(&[0, 1, 125]), vec![Level(0), Level(1), Level(125)] ); } #[test] fn test_str_eq() { assert_eq!(Level::vec(&[0, 1, 4, 125]), vec!["0", "1", "x", "125"]); assert_ne!(Level::vec(&[0, 1, 4, 125]), vec!["0", "1", "5", "125"]); } #[test] fn test_string_eq() { assert_eq!( Level::vec(&[0, 1, 4, 125]), vec!["0".to_string(), "1".to_string(), "x".to_string(), "125".to_string()] ); } } #[cfg(all(feature = "serde", test))] mod serde_tests { use super::*; use serde_test::{assert_tokens, Token}; #[test] fn test_statics() { assert_tokens( &Level::ltr(), &[Token::NewtypeStruct { name: "Level" }, Token::U8(0)], ); assert_tokens( &Level::rtl(), &[Token::NewtypeStruct { name: "Level" }, Token::U8(1)], ); } #[test] fn test_new() { let level = Level::new(42).unwrap(); assert_tokens( &level, &[Token::NewtypeStruct { name: "Level" }, Token::U8(42)], ); } } unicode-bidi-0.3.17/src/lib.rs000064400000000000000000002445371046102023000141710ustar 00000000000000// Copyright 2015 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! This crate implements the [Unicode Bidirectional Algorithm][tr9] for display of mixed //! right-to-left and left-to-right text. It is written in safe Rust, compatible with the //! current stable release. //! //! ## Example //! //! ```rust //! 
# #[cfg(feature = "hardcoded-data")] { //! use unicode_bidi::BidiInfo; //! //! // This example text is defined using `concat!` because some browsers //! // and text editors have trouble displaying bidi strings. //! let text = concat![ //! "א", //! "ב", //! "ג", //! "a", //! "b", //! "c", //! ]; //! //! // Resolve embedding levels within the text. Pass `None` to detect the //! // paragraph level automatically. //! let bidi_info = BidiInfo::new(&text, None); //! //! // This paragraph has embedding level 1 because its first strong character is RTL. //! assert_eq!(bidi_info.paragraphs.len(), 1); //! let para = &bidi_info.paragraphs[0]; //! assert_eq!(para.level.number(), 1); //! assert_eq!(para.level.is_rtl(), true); //! //! // Re-ordering is done after wrapping each paragraph into a sequence of //! // lines. For this example, I'll just use a single line that spans the //! // entire paragraph. //! let line = para.range.clone(); //! //! let display = bidi_info.reorder_line(para, line); //! assert_eq!(display, concat![ //! "a", //! "b", //! "c", //! "ג", //! "ב", //! "א", //! ]); //! # } // feature = "hardcoded-data" //! ``` //! //! # Features //! //! - `std`: Enabled by default, but can be disabled to make `unicode_bidi` //! `#![no_std]` + `alloc` compatible. //! - `hardcoded-data`: Enabled by default. Includes hardcoded Unicode bidi data and more convenient APIs. //! - `serde`: Adds [`serde::Serialize`] and [`serde::Deserialize`] //! implementations to relevant types. //! //! 
[tr9]: #![no_std] // We need to link to std to make doc tests work on older Rust versions #[cfg(feature = "std")] extern crate std; #[macro_use] extern crate alloc; #[cfg(feature = "smallvec")] extern crate smallvec; pub mod data_source; pub mod deprecated; pub mod format_chars; pub mod level; pub mod utf16; mod char_data; mod explicit; mod implicit; mod prepare; pub use crate::char_data::{BidiClass, UNICODE_VERSION}; pub use crate::data_source::BidiDataSource; pub use crate::level::{Level, LTR_LEVEL, RTL_LEVEL}; pub use crate::prepare::{LevelRun, LevelRunVec}; #[cfg(feature = "hardcoded-data")] pub use crate::char_data::{bidi_class, HardcodedBidiData}; use alloc::borrow::Cow; use alloc::string::String; use alloc::vec::Vec; use core::char; use core::cmp; use core::iter::repeat; use core::ops::Range; use core::str::CharIndices; #[cfg(feature = "smallvec")] use smallvec::SmallVec; use crate::format_chars as chars; use crate::BidiClass::*; /// Trait that abstracts over a text source for use by the bidi algorithms. /// We implement this for str (UTF-8) and for [u16] (UTF-16, native-endian). /// (For internal unicode-bidi use; API may be unstable.) /// This trait is sealed and cannot be implemented for types outside this crate. pub trait TextSource<'text>: private::Sealed { type CharIter: Iterator; type CharIndexIter: Iterator; type IndexLenIter: Iterator; /// Return the length of the text in code units. #[doc(hidden)] fn len(&self) -> usize; /// Get the character at a given code unit index, along with its length in code units. /// Returns None if index is out of range, or points inside a multi-code-unit character. /// Returns REPLACEMENT_CHARACTER for any unpaired surrogates in UTF-16. #[doc(hidden)] fn char_at(&self, index: usize) -> Option<(char, usize)>; /// Return a subrange of the text, indexed by code units. /// (We don't implement all of the Index trait, just the minimum we use.) 
#[doc(hidden)] fn subrange(&self, range: Range) -> &Self; /// An iterator over the text returning Unicode characters, /// REPLACEMENT_CHAR for invalid code units. #[doc(hidden)] fn chars(&'text self) -> Self::CharIter; /// An iterator over the text returning (index, char) tuples, /// where index is the starting code-unit index of the character, /// and char is its Unicode value (or REPLACEMENT_CHAR if invalid). #[doc(hidden)] fn char_indices(&'text self) -> Self::CharIndexIter; /// An iterator over the text returning (index, length) tuples, /// where index is the starting code-unit index of the character, /// and length is its length in code units. #[doc(hidden)] fn indices_lengths(&'text self) -> Self::IndexLenIter; /// Number of code units the given character uses. #[doc(hidden)] fn char_len(ch: char) -> usize; } mod private { pub trait Sealed {} // Implement for str and [u16] only. impl Sealed for str {} impl Sealed for [u16] {} } #[derive(PartialEq, Debug)] pub enum Direction { Ltr, Rtl, Mixed, } /// Bidi information about a single paragraph #[derive(Clone, Debug, PartialEq)] pub struct ParagraphInfo { /// The paragraphs boundaries within the text, as byte indices. /// /// TODO: Shrink this to only include the starting index? pub range: Range, /// The paragraph embedding level. /// /// pub level: Level, } impl ParagraphInfo { /// Gets the length of the paragraph in the source text. pub fn len(&self) -> usize { self.range.end - self.range.start } } /// Initial bidi information of the text. /// /// Contains the text paragraphs and `BidiClass` of its characters. #[derive(PartialEq, Debug)] pub struct InitialInfo<'text> { /// The text pub text: &'text str, /// The BidiClass of the character at each byte in the text. /// If a character is multiple bytes, its class will appear multiple times in the vector. pub original_classes: Vec, /// The boundaries and level of each paragraph within the text. 
pub paragraphs: Vec, } impl<'text> InitialInfo<'text> { /// Find the paragraphs and BidiClasses in a string of text. /// /// /// /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong /// character is found before the matching PDI. If no strong character is found, the class will /// remain FSI, and it's up to later stages to treat these as LRI when needed. /// /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this. #[cfg_attr(feature = "flame_it", flamer::flame)] #[cfg(feature = "hardcoded-data")] pub fn new(text: &str, default_para_level: Option) -> InitialInfo<'_> { Self::new_with_data_source(&HardcodedBidiData, text, default_para_level) } /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`] /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`] /// instead (enabled with tbe default `hardcoded-data` Cargo feature) /// /// /// /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong /// character is found before the matching PDI. If no strong character is found, the class will /// remain FSI, and it's up to later stages to treat these as LRI when needed. #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a str, default_para_level: Option, ) -> InitialInfo<'a> { InitialInfoExt::new_with_data_source(data_source, text, default_para_level).base } } /// Extended version of InitialInfo (not public API). #[derive(PartialEq, Debug)] struct InitialInfoExt<'text> { /// The base InitialInfo for the text, recording its paragraphs and bidi classes. base: InitialInfo<'text>, /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that /// requires no further bidi processing (i.e. 
there are no RTL characters or bidi /// control codes present), and whether any bidi isolation controls are present. flags: Vec, } #[derive(PartialEq, Debug)] struct ParagraphInfoFlags { is_pure_ltr: bool, has_isolate_controls: bool, } impl<'text> InitialInfoExt<'text> { /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`] /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`] /// instead (enabled with tbe default `hardcoded-data` Cargo feature) /// /// /// /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong /// character is found before the matching PDI. If no strong character is found, the class will /// remain FSI, and it's up to later stages to treat these as LRI when needed. #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a str, default_para_level: Option, ) -> InitialInfoExt<'a> { let mut paragraphs = Vec::::new(); let mut flags = Vec::::new(); let (original_classes, _, _, _) = compute_initial_info( data_source, text, default_para_level, Some((&mut paragraphs, &mut flags)), ); InitialInfoExt { base: InitialInfo { text, original_classes, paragraphs, }, flags, } } } /// Implementation of initial-info computation for both BidiInfo and ParagraphBidiInfo. /// To treat the text as (potentially) multiple paragraphs, the caller should pass the /// pair of optional outparam arrays to receive the ParagraphInfo and pure-ltr flags /// for each paragraph. Passing None for split_paragraphs will ignore any paragraph- /// separator characters in the text, treating it just as a single paragraph. /// Returns the array of BidiClass values for each code unit of the text, along with /// the embedding level and pure-ltr flag for the *last* (or only) paragraph. 
fn compute_initial_info<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( data_source: &D, text: &'a T, default_para_level: Option, mut split_paragraphs: Option<(&mut Vec, &mut Vec)>, ) -> (Vec, Level, bool, bool) { let mut original_classes = Vec::with_capacity(text.len()); // The stack contains the starting code unit index for each nested isolate we're inside. #[cfg(feature = "smallvec")] let mut isolate_stack = SmallVec::<[usize; 8]>::new(); #[cfg(not(feature = "smallvec"))] let mut isolate_stack = Vec::new(); debug_assert!( if let Some((ref paragraphs, ref flags)) = split_paragraphs { paragraphs.is_empty() && flags.is_empty() } else { true } ); let mut para_start = 0; let mut para_level = default_para_level; // Per-paragraph flag: can subsequent processing be skipped? Set to false if any // RTL characters or bidi control characters are encountered in the paragraph. let mut is_pure_ltr = true; // Set to true if any bidi isolation controls are present in the paragraph. let mut has_isolate_controls = false; #[cfg(feature = "flame_it")] flame::start("compute_initial_info(): iter text.char_indices()"); for (i, c) in text.char_indices() { let class = data_source.bidi_class(c); #[cfg(feature = "flame_it")] flame::start("original_classes.extend()"); let len = T::char_len(c); original_classes.extend(repeat(class).take(len)); #[cfg(feature = "flame_it")] flame::end("original_classes.extend()"); match class { B => { if let Some((ref mut paragraphs, ref mut flags)) = split_paragraphs { // P1. Split the text into separate paragraphs. The paragraph separator is kept // with the previous paragraph. let para_end = i + len; paragraphs.push(ParagraphInfo { range: para_start..para_end, // P3. If no character is found in p2, set the paragraph level to zero. level: para_level.unwrap_or(LTR_LEVEL), }); flags.push(ParagraphInfoFlags { is_pure_ltr, has_isolate_controls, }); // Reset state for the start of the next paragraph. 
para_start = para_end; // TODO: Support defaulting to direction of previous paragraph // // para_level = default_para_level; is_pure_ltr = true; has_isolate_controls = false; isolate_stack.clear(); } } L | R | AL => { if class != L { is_pure_ltr = false; } match isolate_stack.last() { Some(&start) => { if original_classes[start] == FSI { // X5c. If the first strong character between FSI and its matching // PDI is R or AL, treat it as RLI. Otherwise, treat it as LRI. for j in 0..T::char_len(chars::FSI) { original_classes[start + j] = if class == L { LRI } else { RLI }; } } } None => { if para_level.is_none() { // P2. Find the first character of type L, AL, or R, while skipping // any characters between an isolate initiator and its matching // PDI. para_level = Some(if class != L { RTL_LEVEL } else { LTR_LEVEL }); } } } } AN | LRE | RLE | LRO | RLO => { is_pure_ltr = false; } RLI | LRI | FSI => { is_pure_ltr = false; has_isolate_controls = true; isolate_stack.push(i); } PDI => { isolate_stack.pop(); } _ => {} } } if let Some((paragraphs, flags)) = split_paragraphs { if para_start < text.len() { paragraphs.push(ParagraphInfo { range: para_start..text.len(), level: para_level.unwrap_or(LTR_LEVEL), }); flags.push(ParagraphInfoFlags { is_pure_ltr, has_isolate_controls, }); } debug_assert_eq!(paragraphs.len(), flags.len()); } debug_assert_eq!(original_classes.len(), text.len()); #[cfg(feature = "flame_it")] flame::end("compute_initial_info(): iter text.char_indices()"); ( original_classes, para_level.unwrap_or(LTR_LEVEL), is_pure_ltr, has_isolate_controls, ) } /// Bidi information of the text. /// /// The `original_classes` and `levels` vectors are indexed by byte offsets into the text. If a /// character is multiple bytes wide, then its class and level will appear multiple times in these /// vectors. 
// TODO: Impl `struct StringProperty { values: Vec }` and use instead of Vec #[derive(Debug, PartialEq)] pub struct BidiInfo<'text> { /// The text pub text: &'text str, /// The BidiClass of the character at each byte in the text. pub original_classes: Vec, /// The directional embedding level of each byte in the text. pub levels: Vec, /// The boundaries and paragraph embedding level of each paragraph within the text. /// /// TODO: Use SmallVec or similar to avoid overhead when there are only one or two paragraphs? /// Or just don't include the first paragraph, which always starts at 0? pub paragraphs: Vec, } impl<'text> BidiInfo<'text> { /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph. /// /// /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this. /// /// TODO: In early steps, check for special cases that allow later steps to be skipped. like /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison. /// /// TODO: Support auto-RTL base direction #[cfg_attr(feature = "flame_it", flamer::flame)] #[cfg(feature = "hardcoded-data")] #[inline] pub fn new(text: &str, default_para_level: Option) -> BidiInfo<'_> { Self::new_with_data_source(&HardcodedBidiData, text, default_para_level) } /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph, with a custom [`BidiDataSource`] /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`] /// instead (enabled with tbe default `hardcoded-data` Cargo feature). /// /// TODO: In early steps, check for special cases that allow later steps to be skipped. like /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison. 
/// /// TODO: Support auto-RTL base direction #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a str, default_para_level: Option, ) -> BidiInfo<'a> { let InitialInfoExt { base, flags, .. } = InitialInfoExt::new_with_data_source(data_source, text, default_para_level); let mut levels = Vec::::with_capacity(text.len()); let mut processing_classes = base.original_classes.clone(); for (para, flags) in base.paragraphs.iter().zip(flags.iter()) { let text = &text[para.range.clone()]; let original_classes = &base.original_classes[para.range.clone()]; compute_bidi_info_for_para( data_source, para, flags.is_pure_ltr, flags.has_isolate_controls, text, original_classes, &mut processing_classes, &mut levels, ); } BidiInfo { text, original_classes: base.original_classes, paragraphs: base.paragraphs, levels, } } /// Produce the levels for this paragraph as needed for reordering, one level per *byte* /// in the paragraph. The returned vector includes bytes that are not included /// in the `line`, but will not adjust them. /// /// This runs [Rule L1], you can run /// [Rule L2] by calling [`Self::reorder_visual()`]. /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead /// to avoid non-byte indices. /// /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`]. 
/// /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2 #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range) -> Vec { assert!(line.start <= self.levels.len()); assert!(line.end <= self.levels.len()); let mut levels = self.levels.clone(); let line_classes = &self.original_classes[line.clone()]; let line_levels = &mut levels[line.clone()]; reorder_levels( line_classes, line_levels, self.text.subrange(line), para.level, ); levels } /// Produce the levels for this paragraph as needed for reordering, one level per *character* /// in the paragraph. The returned vector includes characters that are not included /// in the `line`, but will not adjust them. /// /// This runs [Rule L1], you can run /// [Rule L2] by calling [`Self::reorder_visual()`]. /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead /// to avoid non-byte indices. /// /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`]. /// /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2 #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reordered_levels_per_char( &self, para: &ParagraphInfo, line: Range, ) -> Vec { let levels = self.reordered_levels(para, line); self.text.char_indices().map(|(i, _)| levels[i]).collect() } /// Re-order a line based on resolved levels and return the line in display order. /// /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring. 
/// /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4 #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reorder_line(&self, para: &ParagraphInfo, line: Range) -> Cow<'text, str> { if !level::has_rtl(&self.levels[line.clone()]) { return self.text[line].into(); } let (levels, runs) = self.visual_runs(para, line.clone()); reorder_line(self.text, line, levels, runs) } /// Reorders pre-calculated levels of a sequence of characters. /// /// NOTE: This is a convenience method that does not use a `Paragraph` object. It is /// intended to be used when an application has determined the levels of the objects (character sequences) /// and just needs to have them reordered. /// /// the index map will result in `indexMap[visualIndex]==logicalIndex`. /// /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have /// information about the actual text. /// /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level` /// is for a single code point. 
/// /// /// # # Example /// ``` /// use unicode_bidi::BidiInfo; /// use unicode_bidi::Level; /// /// let l0 = Level::from(0); /// let l1 = Level::from(1); /// let l2 = Level::from(2); /// /// let levels = vec![l0, l0, l0, l0]; /// let index_map = BidiInfo::reorder_visual(&levels); /// assert_eq!(levels.len(), index_map.len()); /// assert_eq!(index_map, [0, 1, 2, 3]); /// /// let levels: Vec = vec![l0, l0, l0, l1, l1, l1, l2, l2]; /// let index_map = BidiInfo::reorder_visual(&levels); /// assert_eq!(levels.len(), index_map.len()); /// assert_eq!(index_map, [0, 1, 2, 6, 7, 5, 4, 3]); /// ``` #[cfg_attr(feature = "flame_it", flamer::flame)] #[inline] pub fn reorder_visual(levels: &[Level]) -> Vec { reorder_visual(levels) } /// Find the level runs within a line and return them in visual order. /// /// `line` is a range of bytes indices within `levels`. /// /// The first return value is a vector of levels used by the reordering algorithm, /// i.e. the result of [Rule L1]. The second return value is a vector of level runs, /// the result of [Rule L2], showing the visual order that each level run (a run of text with the /// same level) should be displayed. Within each run, the display order can be checked /// against the Level vector. /// /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring), /// as that should be handled by the engine using this API. /// /// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by /// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead /// of producing a level map, since one may wish to deal with the fact that this is operating on /// byte rather than character indices. 
/// /// /// /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4 #[cfg_attr(feature = "flame_it", flamer::flame)] #[inline] pub fn visual_runs( &self, para: &ParagraphInfo, line: Range, ) -> (Vec, Vec) { let levels = self.reordered_levels(para, line.clone()); visual_runs_for_line(levels, &line) } /// If processed text has any computed RTL levels /// /// This information is usually used to skip re-ordering of text when no RTL level is present #[inline] pub fn has_rtl(&self) -> bool { level::has_rtl(&self.levels) } } /// Bidi information of text treated as a single paragraph. /// /// The `original_classes` and `levels` vectors are indexed by byte offsets into the text. If a /// character is multiple bytes wide, then its class and level will appear multiple times in these /// vectors. #[derive(Debug, PartialEq)] pub struct ParagraphBidiInfo<'text> { /// The text pub text: &'text str, /// The BidiClass of the character at each byte in the text. pub original_classes: Vec, /// The directional embedding level of each byte in the text. pub levels: Vec, /// The paragraph embedding level. pub paragraph_level: Level, /// Whether the paragraph is purely LTR. pub is_pure_ltr: bool, } impl<'text> ParagraphBidiInfo<'text> { /// Determine the bidi embedding level. /// /// /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this. /// /// TODO: In early steps, check for special cases that allow later steps to be skipped. like /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison. 
/// /// TODO: Support auto-RTL base direction #[cfg_attr(feature = "flame_it", flamer::flame)] #[cfg(feature = "hardcoded-data")] #[inline] pub fn new(text: &str, default_para_level: Option) -> ParagraphBidiInfo<'_> { Self::new_with_data_source(&HardcodedBidiData, text, default_para_level) } /// Determine the bidi embedding level, with a custom [`BidiDataSource`] /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`] /// instead (enabled with tbe default `hardcoded-data` Cargo feature). /// /// (This is the single-paragraph equivalent of BidiInfo::new_with_data_source, /// and should be kept in sync with it. #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a str, default_para_level: Option, ) -> ParagraphBidiInfo<'a> { // Here we could create a ParagraphInitialInfo struct to parallel the one // used by BidiInfo, but there doesn't seem any compelling reason for it. let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) = compute_initial_info(data_source, text, default_para_level, None); let mut levels = Vec::::with_capacity(text.len()); let mut processing_classes = original_classes.clone(); let para_info = ParagraphInfo { range: Range { start: 0, end: text.len(), }, level: paragraph_level, }; compute_bidi_info_for_para( data_source, ¶_info, is_pure_ltr, has_isolate_controls, text, &original_classes, &mut processing_classes, &mut levels, ); ParagraphBidiInfo { text, original_classes, levels, paragraph_level, is_pure_ltr, } } /// Produce the levels for this paragraph as needed for reordering, one level per *byte* /// in the paragraph. The returned vector includes bytes that are not included /// in the `line`, but will not adjust them. /// /// See BidiInfo::reordered_levels for details. /// /// (This should be kept in sync with BidiInfo::reordered_levels.) 
#[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reordered_levels(&self, line: Range) -> Vec { assert!(line.start <= self.levels.len()); assert!(line.end <= self.levels.len()); let mut levels = self.levels.clone(); let line_classes = &self.original_classes[line.clone()]; let line_levels = &mut levels[line.clone()]; reorder_levels( line_classes, line_levels, self.text.subrange(line), self.paragraph_level, ); levels } /// Produce the levels for this paragraph as needed for reordering, one level per *character* /// in the paragraph. The returned vector includes characters that are not included /// in the `line`, but will not adjust them. /// /// See BidiInfo::reordered_levels_per_char for details. /// /// (This should be kept in sync with BidiInfo::reordered_levels_per_char.) #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reordered_levels_per_char(&self, line: Range) -> Vec { let levels = self.reordered_levels(line); self.text.char_indices().map(|(i, _)| levels[i]).collect() } /// Re-order a line based on resolved levels and return the line in display order. /// /// See BidiInfo::reorder_line for details. /// /// (This should be kept in sync with BidiInfo::reorder_line.) #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reorder_line(&self, line: Range) -> Cow<'text, str> { if !level::has_rtl(&self.levels[line.clone()]) { return self.text[line].into(); } let (levels, runs) = self.visual_runs(line.clone()); reorder_line(self.text, line, levels, runs) } /// Reorders pre-calculated levels of a sequence of characters. /// /// See BidiInfo::reorder_visual for details. #[cfg_attr(feature = "flame_it", flamer::flame)] #[inline] pub fn reorder_visual(levels: &[Level]) -> Vec { reorder_visual(levels) } /// Find the level runs within a line and return them in visual order. /// /// `line` is a range of bytes indices within `levels`. /// /// See BidiInfo::visual_runs for details. /// /// (This should be kept in sync with BidiInfo::visual_runs.) 
#[cfg_attr(feature = "flame_it", flamer::flame)] #[inline] pub fn visual_runs(&self, line: Range) -> (Vec, Vec) { let levels = self.reordered_levels(line.clone()); visual_runs_for_line(levels, &line) } /// If processed text has any computed RTL levels /// /// This information is usually used to skip re-ordering of text when no RTL level is present #[inline] pub fn has_rtl(&self) -> bool { !self.is_pure_ltr } /// Return the paragraph's Direction (Ltr, Rtl, or Mixed) based on its levels. #[inline] pub fn direction(&self) -> Direction { para_direction(&self.levels) } } /// Return a line of the text in display order based on resolved levels. /// /// `text` the full text passed to the `BidiInfo` or `ParagraphBidiInfo` for analysis /// `line` a range of byte indices within `text` corresponding to one line /// `levels` array of `Level` values, with `line`'s levels reordered into visual order /// `runs` array of `LevelRun`s in visual order /// /// (`levels` and `runs` are the result of calling `BidiInfo::visual_runs()` or /// `ParagraphBidiInfo::visual_runs()` for the line of interest.) /// /// Returns: the reordered text of the line. /// /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring. /// /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4 fn reorder_line( text: &str, line: Range, levels: Vec, runs: Vec, ) -> Cow<'_, str> { // If all isolating run sequences are LTR, no reordering is needed if runs.iter().all(|run| levels[run.start].is_ltr()) { return text[line].into(); } let mut result = String::with_capacity(line.len()); for run in runs { if levels[run.start].is_rtl() { result.extend(text[run].chars().rev()); } else { result.push_str(&text[run]); } } result.into() } /// Find the level runs within a line and return them in visual order. /// /// `line` is a range of code-unit indices within `levels`. 
/// /// The first return value is a vector of levels used by the reordering algorithm, /// i.e. the result of [Rule L1]. The second return value is a vector of level runs, /// the result of [Rule L2], showing the visual order that each level run (a run of text with the /// same level) should be displayed. Within each run, the display order can be checked /// against the Level vector. /// /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring), /// as that should be handled by the engine using this API. /// /// Conceptually, this is the same as running [`reordered_levels()`] followed by /// [`reorder_visual()`], however it returns the result as a list of level runs instead /// of producing a level map, since one may wish to deal with the fact that this is operating on /// byte rather than character indices. /// /// /// /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4 fn visual_runs_for_line(levels: Vec, line: &Range) -> (Vec, Vec) { // Find consecutive level runs. let mut runs = Vec::new(); let mut start = line.start; let mut run_level = levels[start]; let mut min_level = run_level; let mut max_level = run_level; for (i, &new_level) in levels.iter().enumerate().take(line.end).skip(start + 1) { if new_level != run_level { // End of the previous run, start of a new one. runs.push(start..i); start = i; run_level = new_level; min_level = cmp::min(run_level, min_level); max_level = cmp::max(run_level, max_level); } } runs.push(start..line.end); let run_count = runs.len(); // Re-order the odd runs. // // Stop at the lowest *odd* level. min_level = min_level.new_lowest_ge_rtl().expect("Level error"); // This loop goes through contiguous chunks of level runs that have a level // ≥ max_level and reverses their contents, reducing max_level by 1 each time. 
while max_level >= min_level { // Look for the start of a sequence of consecutive runs of max_level or higher. let mut seq_start = 0; while seq_start < run_count { if levels[runs[seq_start].start] < max_level { seq_start += 1; continue; } // Found the start of a sequence. Now find the end. let mut seq_end = seq_start + 1; while seq_end < run_count { if levels[runs[seq_end].start] < max_level { break; } seq_end += 1; } // Reverse the runs within this sequence. runs[seq_start..seq_end].reverse(); seq_start = seq_end; } max_level .lower(1) .expect("Lowering embedding level below zero"); } (levels, runs) } /// Reorders pre-calculated levels of a sequence of characters. /// /// NOTE: This is a convenience method that does not use a `Paragraph` object. It is /// intended to be used when an application has determined the levels of the objects (character sequences) /// and just needs to have them reordered. /// /// the index map will result in `indexMap[visualIndex]==logicalIndex`. /// /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have /// information about the actual text. /// /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level` /// is for a single code point. 
fn reorder_visual(levels: &[Level]) -> Vec<usize> {
    // Gets the next range of characters after start_index with a level greater
    // than or equal to `max`
    fn next_range(levels: &[level::Level], mut start_index: usize, max: Level) -> Range<usize> {
        if levels.is_empty() || start_index >= levels.len() {
            return start_index..start_index;
        }
        while let Some(l) = levels.get(start_index) {
            if *l >= max {
                break;
            }
            start_index += 1;
        }

        if levels.get(start_index).is_none() {
            // If at the end of the array, adding one will
            // produce an out-of-range end element
            return start_index..start_index;
        }

        let mut end_index = start_index + 1;
        while let Some(l) = levels.get(end_index) {
            if *l < max {
                return start_index..end_index;
            }
            end_index += 1;
        }

        start_index..end_index
    }

    // This implementation is similar to the L2 implementation in `visual_runs()`
    // but it cannot benefit from a precalculated LevelRun vector so needs to be different.

    if levels.is_empty() {
        return vec![];
    }

    // Get the min and max levels
    let (mut min, mut max) = levels
        .iter()
        .fold((levels[0], levels[0]), |(min, max), &l| {
            (cmp::min(min, l), cmp::max(max, l))
        });

    // Initialize an index map
    let mut result: Vec<usize> = (0..levels.len()).collect();

    if min == max && min.is_ltr() {
        // Everything is LTR and at the same level, do nothing
        return result;
    }

    // Stop at the lowest *odd* level, since everything below that
    // is LTR and does not need further reordering
    min = min.new_lowest_ge_rtl().expect("Level error");

    // For each max level, take all contiguous chunks of
    // levels ≥ max and reverse them
    //
    // We can do this check with the original levels instead of checking reorderings because all
    // prior reorderings will have been for contiguous chunks of levels >= max, which will
    // be a subset of these chunks anyway.
    while min <= max {
        let mut range = 0..0;
        loop {
            range = next_range(levels, range.end, max);
            result[range.clone()].reverse();

            if range.end >= levels.len() {
                break;
            }
        }

        max.lower(1).expect("Level error");
    }

    result
}

/// The core of BidiInfo initialization, factored out into a function that both
/// the utf-8 and utf-16 versions of BidiInfo can use.
fn compute_bidi_info_for_para<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>(
    data_source: &D,
    para: &ParagraphInfo,
    is_pure_ltr: bool,
    has_isolate_controls: bool,
    text: &'a T,
    original_classes: &[BidiClass],
    processing_classes: &mut [BidiClass],
    levels: &mut Vec<Level>,
) {
    let new_len = levels.len() + para.range.len();
    levels.resize(new_len, para.level);
    // A purely-LTR paragraph at the base LTR level needs no further resolution.
    if para.level == LTR_LEVEL && is_pure_ltr {
        return;
    }

    let processing_classes = &mut processing_classes[para.range.clone()];
    let levels = &mut levels[para.range.clone()];

    let mut level_runs = LevelRunVec::new();

    // Rules X1-X9: resolve explicit embedding/override/isolate levels.
    explicit::compute(
        text,
        para.level,
        original_classes,
        levels,
        processing_classes,
        &mut level_runs,
    );

    // Rule X10: determine the isolating run sequences.
    let mut sequences = prepare::IsolatingRunSequenceVec::new();
    prepare::isolating_run_sequences(
        para.level,
        original_classes,
        levels,
        level_runs,
        has_isolate_controls,
        &mut sequences,
    );
    for sequence in &sequences {
        // Rules W1-W7, then N0-N2.
        implicit::resolve_weak(text, sequence, processing_classes);
        implicit::resolve_neutral(
            text,
            data_source,
            sequence,
            levels,
            original_classes,
            processing_classes,
        );
    }
    // Rules I1-I2.
    implicit::resolve_levels(processing_classes, levels);

    assign_levels_to_removed_chars(para.level, original_classes, levels);
}

/// Produce the levels for this paragraph as needed for reordering, one level per *code unit*
/// in the paragraph. The returned vector includes code units that are not included
/// in the `line`, but will not adjust them.
/// /// This runs [Rule L1] /// /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1 fn reorder_levels<'a, T: TextSource<'a> + ?Sized>( line_classes: &[BidiClass], line_levels: &mut [Level], line_text: &'a T, para_level: Level, ) { // Reset some whitespace chars to paragraph level. // let mut reset_from: Option = Some(0); let mut reset_to: Option = None; let mut prev_level = para_level; for (i, c) in line_text.char_indices() { match line_classes[i] { // Segment separator, Paragraph separator B | S => { assert_eq!(reset_to, None); reset_to = Some(i + T::char_len(c)); if reset_from.is_none() { reset_from = Some(i); } } // Whitespace, isolate formatting WS | FSI | LRI | RLI | PDI => { if reset_from.is_none() { reset_from = Some(i); } } // // same as above + set the level RLE | LRE | RLO | LRO | PDF | BN => { if reset_from.is_none() { reset_from = Some(i); } // also set the level to previous line_levels[i] = prev_level; } _ => { reset_from = None; } } if let (Some(from), Some(to)) = (reset_from, reset_to) { for level in &mut line_levels[from..to] { *level = para_level; } reset_from = None; reset_to = None; } prev_level = line_levels[i]; } if let Some(from) = reset_from { for level in &mut line_levels[from..] { *level = para_level; } } } /// Contains a reference of `BidiInfo` and one of its `paragraphs`. /// And it supports all operation in the `Paragraph` that needs also its /// `BidiInfo` such as `direction`. #[derive(Debug)] pub struct Paragraph<'a, 'text> { pub info: &'a BidiInfo<'text>, pub para: &'a ParagraphInfo, } impl<'a, 'text> Paragraph<'a, 'text> { #[inline] pub fn new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text> { Paragraph { info, para } } /// Returns if the paragraph is Left direction, right direction or mixed. #[inline] pub fn direction(&self) -> Direction { para_direction(&self.info.levels[self.para.range.clone()]) } /// Returns the `Level` of a certain character in the paragraph. 
#[inline] pub fn level_at(&self, pos: usize) -> Level { let actual_position = self.para.range.start + pos; self.info.levels[actual_position] } } /// Return the directionality of the paragraph (Left, Right or Mixed) from its levels. #[cfg_attr(feature = "flame_it", flamer::flame)] fn para_direction(levels: &[Level]) -> Direction { let mut ltr = false; let mut rtl = false; for level in levels { if level.is_ltr() { ltr = true; if rtl { return Direction::Mixed; } } if level.is_rtl() { rtl = true; if ltr { return Direction::Mixed; } } } if ltr { return Direction::Ltr; } Direction::Rtl } /// Assign levels to characters removed by rule X9. /// /// The levels assigned to these characters are not specified by the algorithm. This function /// assigns each one the level of the previous character, to avoid breaking level runs. #[cfg_attr(feature = "flame_it", flamer::flame)] fn assign_levels_to_removed_chars(para_level: Level, classes: &[BidiClass], levels: &mut [Level]) { for i in 0..levels.len() { if prepare::removed_by_x9(classes[i]) { levels[i] = if i > 0 { levels[i - 1] } else { para_level }; } } } /// Get the base direction of the text provided according to the Unicode Bidirectional Algorithm. /// /// See rules P2 and P3. /// /// The base direction is derived from the first character in the string with bidi character type /// L, R, or AL. If the first such character has type L, Direction::Ltr is returned. If the first /// such character has type R or AL, Direction::Rtl is returned. /// /// If the string does not contain any character of these types (outside of embedded isolate runs), /// then Direction::Mixed is returned (but should be considered as meaning "neutral" or "unknown", /// not in fact mixed directions). /// /// This is a lightweight function for use when only the base direction is needed and no further /// bidi processing of the text is needed. /// /// If the text contains paragraph separators, this function considers only the first paragraph. 
#[cfg(feature = "hardcoded-data")] #[inline] pub fn get_base_direction<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction { get_base_direction_with_data_source(&HardcodedBidiData, text) } /// Get the base direction of the text provided according to the Unicode Bidirectional Algorithm, /// considering the full text if the first paragraph is all-neutral. /// /// This is the same as get_base_direction except that it does not stop at the first block /// separator, but just resets the embedding level and continues to look for a strongly- /// directional character. So the result will be the base direction of the first paragraph /// that is not purely neutral characters. #[cfg(feature = "hardcoded-data")] #[inline] pub fn get_base_direction_full<'a, T: TextSource<'a> + ?Sized>(text: &'a T) -> Direction { get_base_direction_full_with_data_source(&HardcodedBidiData, text) } #[inline] pub fn get_base_direction_with_data_source<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( data_source: &D, text: &'a T, ) -> Direction { get_base_direction_impl(data_source, text, false) } #[inline] pub fn get_base_direction_full_with_data_source< 'a, D: BidiDataSource, T: TextSource<'a> + ?Sized, >( data_source: &D, text: &'a T, ) -> Direction { get_base_direction_impl(data_source, text, true) } fn get_base_direction_impl<'a, D: BidiDataSource, T: TextSource<'a> + ?Sized>( data_source: &D, text: &'a T, use_full_text: bool, ) -> Direction { let mut isolate_level = 0; for c in text.chars() { match data_source.bidi_class(c) { LRI | RLI | FSI => isolate_level += 1, PDI if isolate_level > 0 => isolate_level -= 1, L if isolate_level == 0 => return Direction::Ltr, R | AL if isolate_level == 0 => return Direction::Rtl, B if !use_full_text => break, B if use_full_text => isolate_level = 0, _ => (), } } // If no strong char was found, return Mixed. 
Normally this will be treated as Ltr by callers // (see rule P3), but we don't map this to Ltr here so that a caller that wants to apply other // heuristics to an all-neutral paragraph can tell the difference. Direction::Mixed } /// Implementation of TextSource for UTF-8 text (a string slice). impl<'text> TextSource<'text> for str { type CharIter = core::str::Chars<'text>; type CharIndexIter = core::str::CharIndices<'text>; type IndexLenIter = Utf8IndexLenIter<'text>; #[inline] fn len(&self) -> usize { (self as &str).len() } #[inline] fn char_at(&self, index: usize) -> Option<(char, usize)> { if let Some(slice) = self.get(index..) { if let Some(ch) = slice.chars().next() { return Some((ch, ch.len_utf8())); } } None } #[inline] fn subrange(&self, range: Range) -> &Self { &(self as &str)[range] } #[inline] fn chars(&'text self) -> Self::CharIter { (self as &str).chars() } #[inline] fn char_indices(&'text self) -> Self::CharIndexIter { (self as &str).char_indices() } #[inline] fn indices_lengths(&'text self) -> Self::IndexLenIter { Utf8IndexLenIter::new(self) } #[inline] fn char_len(ch: char) -> usize { ch.len_utf8() } } /// Iterator over (UTF-8) string slices returning (index, char_len) tuple. 
#[derive(Debug)] pub struct Utf8IndexLenIter<'text> { iter: CharIndices<'text>, } impl<'text> Utf8IndexLenIter<'text> { #[inline] pub fn new(text: &'text str) -> Self { Utf8IndexLenIter { iter: text.char_indices(), } } } impl Iterator for Utf8IndexLenIter<'_> { type Item = (usize, usize); #[inline] fn next(&mut self) -> Option { if let Some((pos, ch)) = self.iter.next() { return Some((pos, ch.len_utf8())); } None } } #[cfg(test)] fn to_utf16(s: &str) -> Vec { s.encode_utf16().collect() } #[cfg(test)] #[cfg(feature = "hardcoded-data")] mod tests { use super::*; use utf16::{ BidiInfo as BidiInfoU16, InitialInfo as InitialInfoU16, Paragraph as ParagraphU16, ParagraphBidiInfo as ParagraphBidiInfoU16, }; #[test] fn test_utf16_text_source() { let text: &[u16] = &[0x41, 0xD801, 0xDC01, 0x20, 0xD800, 0x20, 0xDFFF, 0x20, 0xDC00, 0xD800]; assert_eq!(text.char_at(0), Some(('A', 1))); assert_eq!(text.char_at(1), Some(('\u{10401}', 2))); assert_eq!(text.char_at(2), None); assert_eq!(text.char_at(3), Some((' ', 1))); assert_eq!(text.char_at(4), Some((char::REPLACEMENT_CHARACTER, 1))); assert_eq!(text.char_at(5), Some((' ', 1))); assert_eq!(text.char_at(6), Some((char::REPLACEMENT_CHARACTER, 1))); assert_eq!(text.char_at(7), Some((' ', 1))); assert_eq!(text.char_at(8), Some((char::REPLACEMENT_CHARACTER, 1))); assert_eq!(text.char_at(9), Some((char::REPLACEMENT_CHARACTER, 1))); assert_eq!(text.char_at(10), None); } #[test] fn test_utf16_char_iter() { let text: &[u16] = &[0x41, 0xD801, 0xDC01, 0x20, 0xD800, 0x20, 0xDFFF, 0x20, 0xDC00, 0xD800]; assert_eq!(text.len(), 10); assert_eq!(text.chars().count(), 9); let mut chars = text.chars(); assert_eq!(chars.next(), Some('A')); assert_eq!(chars.next(), Some('\u{10401}')); assert_eq!(chars.next(), Some(' ')); assert_eq!(chars.next(), Some('\u{FFFD}')); assert_eq!(chars.next(), Some(' ')); assert_eq!(chars.next(), Some('\u{FFFD}')); assert_eq!(chars.next(), Some(' ')); assert_eq!(chars.next(), Some('\u{FFFD}')); assert_eq!(chars.next(), 
Some('\u{FFFD}')); assert_eq!(chars.next(), None); } #[test] fn test_initial_text_info() { let tests = vec![ ( // text "a1", // expected bidi classes per utf-8 byte vec![L, EN], // expected paragraph-info for utf-8 vec![ParagraphInfo { range: 0..2, level: LTR_LEVEL, }], // expected bidi classes per utf-16 code unit vec![L, EN], // expected paragraph-info for utf-16 vec![ParagraphInfo { range: 0..2, level: LTR_LEVEL, }], ), ( // Arabic, space, Hebrew "\u{0639} \u{05D0}", vec![AL, AL, WS, R, R], vec![ParagraphInfo { range: 0..5, level: RTL_LEVEL, }], vec![AL, WS, R], vec![ParagraphInfo { range: 0..3, level: RTL_LEVEL, }], ), ( // SMP characters from Kharoshthi, Cuneiform, Adlam: "\u{10A00}\u{12000}\u{1E900}", vec![R, R, R, R, L, L, L, L, R, R, R, R], vec![ParagraphInfo { range: 0..12, level: RTL_LEVEL, }], vec![R, R, L, L, R, R], vec![ParagraphInfo { range: 0..6, level: RTL_LEVEL, }], ), ( "a\u{2029}b", vec![L, B, B, B, L], vec![ ParagraphInfo { range: 0..4, level: LTR_LEVEL, }, ParagraphInfo { range: 4..5, level: LTR_LEVEL, }, ], vec![L, B, L], vec![ ParagraphInfo { range: 0..2, level: LTR_LEVEL, }, ParagraphInfo { range: 2..3, level: LTR_LEVEL, }, ], ), ( "\u{2068}א\u{2069}a", // U+2068 FSI, U+2069 PDI vec![RLI, RLI, RLI, R, R, PDI, PDI, PDI, L], vec![ParagraphInfo { range: 0..9, level: LTR_LEVEL, }], vec![RLI, R, PDI, L], vec![ParagraphInfo { range: 0..4, level: LTR_LEVEL, }], ), ]; for t in tests { assert_eq!( InitialInfo::new(t.0, None), InitialInfo { text: t.0, original_classes: t.1, paragraphs: t.2, } ); let text = &to_utf16(t.0); assert_eq!( InitialInfoU16::new(text, None), InitialInfoU16 { text, original_classes: t.3, paragraphs: t.4, } ); } } #[test] #[cfg(feature = "hardcoded-data")] fn test_process_text() { let tests = vec![ ( // text "", // base level Some(RTL_LEVEL), // levels Level::vec(&[]), // original_classes vec![], // paragraphs vec![], // levels_u16 Level::vec(&[]), // original_classes_u16 vec![], // paragraphs_u16 vec![], ), ( // text "abc123", 
// base level Some(LTR_LEVEL), // levels Level::vec(&[0, 0, 0, 0, 0, 0]), // original_classes vec![L, L, L, EN, EN, EN], // paragraphs vec![ParagraphInfo { range: 0..6, level: LTR_LEVEL, }], // levels_u16 Level::vec(&[0, 0, 0, 0, 0, 0]), // original_classes_u16 vec![L, L, L, EN, EN, EN], // paragraphs_u16 vec![ParagraphInfo { range: 0..6, level: LTR_LEVEL, }], ), ( "abc \u{05D0}\u{05D1}\u{05D2}", Some(LTR_LEVEL), Level::vec(&[0, 0, 0, 0, 1, 1, 1, 1, 1, 1]), vec![L, L, L, WS, R, R, R, R, R, R], vec![ParagraphInfo { range: 0..10, level: LTR_LEVEL, }], Level::vec(&[0, 0, 0, 0, 1, 1, 1]), vec![L, L, L, WS, R, R, R], vec![ParagraphInfo { range: 0..7, level: LTR_LEVEL, }], ), ( "abc \u{05D0}\u{05D1}\u{05D2}", Some(RTL_LEVEL), Level::vec(&[2, 2, 2, 1, 1, 1, 1, 1, 1, 1]), vec![L, L, L, WS, R, R, R, R, R, R], vec![ParagraphInfo { range: 0..10, level: RTL_LEVEL, }], Level::vec(&[2, 2, 2, 1, 1, 1, 1]), vec![L, L, L, WS, R, R, R], vec![ParagraphInfo { range: 0..7, level: RTL_LEVEL, }], ), ( "\u{05D0}\u{05D1}\u{05D2} abc", Some(LTR_LEVEL), Level::vec(&[1, 1, 1, 1, 1, 1, 0, 0, 0, 0]), vec![R, R, R, R, R, R, WS, L, L, L], vec![ParagraphInfo { range: 0..10, level: LTR_LEVEL, }], Level::vec(&[1, 1, 1, 0, 0, 0, 0]), vec![R, R, R, WS, L, L, L], vec![ParagraphInfo { range: 0..7, level: LTR_LEVEL, }], ), ( "\u{05D0}\u{05D1}\u{05D2} abc", None, Level::vec(&[1, 1, 1, 1, 1, 1, 1, 2, 2, 2]), vec![R, R, R, R, R, R, WS, L, L, L], vec![ParagraphInfo { range: 0..10, level: RTL_LEVEL, }], Level::vec(&[1, 1, 1, 1, 2, 2, 2]), vec![R, R, R, WS, L, L, L], vec![ParagraphInfo { range: 0..7, level: RTL_LEVEL, }], ), ( "\u{063A}2\u{0638} \u{05D0}2\u{05D2}", Some(LTR_LEVEL), Level::vec(&[1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1]), vec![AL, AL, EN, AL, AL, WS, R, R, EN, R, R], vec![ParagraphInfo { range: 0..11, level: LTR_LEVEL, }], Level::vec(&[1, 2, 1, 1, 1, 2, 1]), vec![AL, EN, AL, WS, R, EN, R], vec![ParagraphInfo { range: 0..7, level: LTR_LEVEL, }], ), ( "a א.\nג", None, Level::vec(&[0, 0, 1, 1, 0, 0, 1, 
1]), vec![L, WS, R, R, CS, B, R, R], vec![ ParagraphInfo { range: 0..6, level: LTR_LEVEL, }, ParagraphInfo { range: 6..8, level: RTL_LEVEL, }, ], Level::vec(&[0, 0, 1, 0, 0, 1]), vec![L, WS, R, CS, B, R], vec![ ParagraphInfo { range: 0..5, level: LTR_LEVEL, }, ParagraphInfo { range: 5..6, level: RTL_LEVEL, }, ], ), // BidiTest:69635 (AL ET EN) ( "\u{060B}\u{20CF}\u{06F9}", None, Level::vec(&[1, 1, 1, 1, 1, 2, 2]), vec![AL, AL, ET, ET, ET, EN, EN], vec![ParagraphInfo { range: 0..7, level: RTL_LEVEL, }], Level::vec(&[1, 1, 2]), vec![AL, ET, EN], vec![ParagraphInfo { range: 0..3, level: RTL_LEVEL, }], ), ]; for t in tests { assert_eq!( BidiInfo::new(t.0, t.1), BidiInfo { text: t.0, levels: t.2.clone(), original_classes: t.3.clone(), paragraphs: t.4.clone(), } ); // If it was empty, also test that ParagraphBidiInfo handles it safely. if t.4.len() == 0 { assert_eq!( ParagraphBidiInfo::new(t.0, t.1), ParagraphBidiInfo { text: t.0, original_classes: t.3.clone(), levels: t.2.clone(), paragraph_level: RTL_LEVEL, is_pure_ltr: true, } ) } // If it was a single paragraph, also test ParagraphBidiInfo. if t.4.len() == 1 { assert_eq!( ParagraphBidiInfo::new(t.0, t.1), ParagraphBidiInfo { text: t.0, original_classes: t.3, levels: t.2.clone(), paragraph_level: t.4[0].level, is_pure_ltr: !level::has_rtl(&t.2), } ) } let text = &to_utf16(t.0); assert_eq!( BidiInfoU16::new(text, t.1), BidiInfoU16 { text, levels: t.5.clone(), original_classes: t.6.clone(), paragraphs: t.7.clone(), } ); if t.7.len() == 1 { assert_eq!( ParagraphBidiInfoU16::new(text, t.1), ParagraphBidiInfoU16 { text: text, original_classes: t.6.clone(), levels: t.5.clone(), paragraph_level: t.7[0].level, is_pure_ltr: !level::has_rtl(&t.5), } ) } } } #[test] #[cfg(feature = "hardcoded-data")] fn test_paragraph_bidi_info() { // Passing text that includes a paragraph break to the ParagraphBidiInfo API: // this is a misuse of the API by the client, but our behavior is safe & // consistent. 
The embedded paragraph break acts like a separator (tab) would. let tests = vec![ ( "a א.\nג", None, // utf-8 results: vec![L, WS, R, R, CS, B, R, R], Level::vec(&[0, 0, 1, 1, 1, 1, 1, 1]), // utf-16 results: vec![L, WS, R, CS, B, R], Level::vec(&[0, 0, 1, 1, 1, 1]), // paragraph level; is_pure_ltr LTR_LEVEL, false, ), ( "\u{5d1} a.\nb.", None, // utf-8 results: vec![R, R, WS, L, CS, B, L, CS], Level::vec(&[1, 1, 1, 2, 2, 2, 2, 1]), // utf-16 results: vec![R, WS, L, CS, B, L, CS], Level::vec(&[1, 1, 2, 2, 2, 2, 1]), // paragraph level; is_pure_ltr RTL_LEVEL, false, ), ( "a א.\tג", None, // utf-8 results: vec![L, WS, R, R, CS, S, R, R], Level::vec(&[0, 0, 1, 1, 1, 1, 1, 1]), // utf-16 results: vec![L, WS, R, CS, S, R], Level::vec(&[0, 0, 1, 1, 1, 1]), // paragraph level; is_pure_ltr LTR_LEVEL, false, ), ( "\u{5d1} a.\tb.", None, // utf-8 results: vec![R, R, WS, L, CS, S, L, CS], Level::vec(&[1, 1, 1, 2, 2, 2, 2, 1]), // utf-16 results: vec![R, WS, L, CS, S, L, CS], Level::vec(&[1, 1, 2, 2, 2, 2, 1]), // paragraph level; is_pure_ltr RTL_LEVEL, false, ), ]; for t in tests { assert_eq!( ParagraphBidiInfo::new(t.0, t.1), ParagraphBidiInfo { text: t.0, original_classes: t.2, levels: t.3, paragraph_level: t.6, is_pure_ltr: t.7, } ); let text = &to_utf16(t.0); assert_eq!( ParagraphBidiInfoU16::new(text, t.1), ParagraphBidiInfoU16 { text: text, original_classes: t.4, levels: t.5, paragraph_level: t.6, is_pure_ltr: t.7, } ); } } #[test] #[cfg(feature = "hardcoded-data")] fn test_bidi_info_has_rtl() { let tests = vec![ // ASCII only ("123", None, false), ("123", Some(LTR_LEVEL), false), ("123", Some(RTL_LEVEL), false), ("abc", None, false), ("abc", Some(LTR_LEVEL), false), ("abc", Some(RTL_LEVEL), false), ("abc 123", None, false), ("abc\n123", None, false), // With Hebrew ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true), ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", Some(LTR_LEVEL), true), ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}", Some(RTL_LEVEL), true), ("abc 
\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true), ("abc\n\u{05D0}\u{05D1}\u{05BC}\u{05D2}", None, true), ("\u{05D0}\u{05D1}\u{05BC}\u{05D2} abc", None, true), ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}\nabc", None, true), ("\u{05D0}\u{05D1}\u{05BC}\u{05D2} 123", None, true), ("\u{05D0}\u{05D1}\u{05BC}\u{05D2}\n123", None, true), ]; for t in tests { assert_eq!(BidiInfo::new(t.0, t.1).has_rtl(), t.2); assert_eq!(BidiInfoU16::new(&to_utf16(t.0), t.1).has_rtl(), t.2); } } #[cfg(feature = "hardcoded-data")] fn reorder_paras(text: &str) -> Vec> { let bidi_info = BidiInfo::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reorder_line(para, para.range.clone())) .collect() } #[cfg(feature = "hardcoded-data")] fn reorder_paras_u16(text: &[u16]) -> Vec> { let bidi_info = BidiInfoU16::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reorder_line(para, para.range.clone())) .collect() } #[test] #[cfg(feature = "hardcoded-data")] fn test_reorder_line() { let tests = vec![ // Bidi_Class: L L L B L L L B L L L ("abc\ndef\nghi", vec!["abc\n", "def\n", "ghi"]), // Bidi_Class: L L EN B L L EN B L L EN ("ab1\nde2\ngh3", vec!["ab1\n", "de2\n", "gh3"]), // Bidi_Class: L L L B AL AL AL ("abc\nابج", vec!["abc\n", "جبا"]), // Bidi_Class: AL AL AL B L L L ( "\u{0627}\u{0628}\u{062C}\nabc", vec!["\n\u{062C}\u{0628}\u{0627}", "abc"], ), ("1.-2", vec!["1.-2"]), ("1-.2", vec!["1-.2"]), ("abc אבג", vec!["abc גבא"]), // Numbers being weak LTR characters, cannot reorder strong RTL ("123 \u{05D0}\u{05D1}\u{05D2}", vec!["גבא 123"]), ("abc\u{202A}def", vec!["abc\u{202A}def"]), ( "abc\u{202A}def\u{202C}ghi", vec!["abc\u{202A}def\u{202C}ghi"], ), ( "abc\u{2066}def\u{2069}ghi", vec!["abc\u{2066}def\u{2069}ghi"], ), // Testing for RLE Character ("\u{202B}abc אבג\u{202C}", vec!["\u{202b}גבא abc\u{202c}"]), // Testing neutral characters ("\u{05D0}בג? 
אבג", vec!["גבא ?גבא"]), // Testing neutral characters with special case ("A אבג?", vec!["A גבא?"]), // Testing neutral characters with Implicit RTL Marker ("A אבג?\u{200F}", vec!["A \u{200F}?גבא"]), ("\u{05D0}בג abc", vec!["abc גבא"]), ("abc\u{2067}.-\u{2069}ghi", vec!["abc\u{2067}-.\u{2069}ghi"]), ( "Hello, \u{2068}\u{202E}world\u{202C}\u{2069}!", vec!["Hello, \u{2068}\u{202E}\u{202C}dlrow\u{2069}!"], ), // With mirrorable characters in RTL run ("\u{05D0}(ב)ג.", vec![".ג)ב(א"]), // With mirrorable characters on level boundary ("\u{05D0}ב(גד[&ef].)gh", vec!["gh).]ef&[דג(בא"]), ]; for t in tests { assert_eq!(reorder_paras(t.0), t.1); let expect_utf16 = t.1.iter().map(|v| to_utf16(v)).collect::>(); assert_eq!(reorder_paras_u16(&to_utf16(t.0)), expect_utf16); } } fn reordered_levels_for_paras(text: &str) -> Vec> { let bidi_info = BidiInfo::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reordered_levels(para, para.range.clone())) .collect() } fn reordered_levels_per_char_for_paras(text: &str) -> Vec> { let bidi_info = BidiInfo::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone())) .collect() } fn reordered_levels_for_paras_u16(text: &[u16]) -> Vec> { let bidi_info = BidiInfoU16::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reordered_levels(para, para.range.clone())) .collect() } fn reordered_levels_per_char_for_paras_u16(text: &[u16]) -> Vec> { let bidi_info = BidiInfoU16::new(text, None); bidi_info .paragraphs .iter() .map(|para| bidi_info.reordered_levels_per_char(para, para.range.clone())) .collect() } #[test] #[cfg(feature = "hardcoded-data")] fn test_reordered_levels() { let tests = vec![ // BidiTest:946 (LRI PDI) ( "\u{2067}\u{2069}", vec![Level::vec(&[0, 0, 0, 0, 0, 0])], vec![Level::vec(&[0, 0])], vec![Level::vec(&[0, 0])], ), // BidiTest:69635 (AL ET EN) ( "\u{060B}\u{20CF}\u{06F9}", vec![Level::vec(&[1, 1, 1, 1, 1, 2, 2])], vec![Level::vec(&[1, 
1, 2])], vec![Level::vec(&[1, 1, 2])], ), ]; for t in tests { assert_eq!(reordered_levels_for_paras(t.0), t.1); assert_eq!(reordered_levels_per_char_for_paras(t.0), t.2); let text = &to_utf16(t.0); assert_eq!(reordered_levels_for_paras_u16(text), t.3); assert_eq!(reordered_levels_per_char_for_paras_u16(text), t.2); } let tests = vec![ // BidiTest:291284 (AN RLI PDF R) ( "\u{0605}\u{2067}\u{202C}\u{0590}", vec![&["2", "2", "0", "0", "0", "x", "x", "x", "1", "1"]], vec![&["2", "0", "x", "1"]], vec![&["2", "0", "x", "1"]], ), ]; for t in tests { assert_eq!(reordered_levels_for_paras(t.0), t.1); assert_eq!(reordered_levels_per_char_for_paras(t.0), t.2); let text = &to_utf16(t.0); assert_eq!(reordered_levels_for_paras_u16(text), t.3); assert_eq!(reordered_levels_per_char_for_paras_u16(text), t.2); } let text = "aa טֶ"; let bidi_info = BidiInfo::new(text, None); assert_eq!( bidi_info.reordered_levels(&bidi_info.paragraphs[0], 3..7), Level::vec(&[0, 0, 0, 1, 1, 1, 1]), ); let text = &to_utf16(text); let bidi_info = BidiInfoU16::new(text, None); assert_eq!( bidi_info.reordered_levels(&bidi_info.paragraphs[0], 1..4), Level::vec(&[0, 0, 0, 1, 1]), ); } #[test] fn test_paragraph_info_len() { let text = "hello world"; let bidi_info = BidiInfo::new(text, None); assert_eq!(bidi_info.paragraphs.len(), 1); assert_eq!(bidi_info.paragraphs[0].len(), text.len()); let text2 = "How are you"; let whole_text = format!("{}\n{}", text, text2); let bidi_info = BidiInfo::new(&whole_text, None); assert_eq!(bidi_info.paragraphs.len(), 2); // The first paragraph include the paragraph separator. // TODO: investigate if the paragraph separator character // should not be part of any paragraph. 
assert_eq!(bidi_info.paragraphs[0].len(), text.len() + 1); assert_eq!(bidi_info.paragraphs[1].len(), text2.len()); let text = &to_utf16(text); let bidi_info = BidiInfoU16::new(text, None); assert_eq!(bidi_info.paragraphs.len(), 1); assert_eq!(bidi_info.paragraphs[0].len(), text.len()); let text2 = &to_utf16(text2); let whole_text = &to_utf16(&whole_text); let bidi_info = BidiInfoU16::new(&whole_text, None); assert_eq!(bidi_info.paragraphs.len(), 2); assert_eq!(bidi_info.paragraphs[0].len(), text.len() + 1); assert_eq!(bidi_info.paragraphs[1].len(), text2.len()); } #[test] fn test_direction() { let ltr_text = "hello world"; let rtl_text = "أهلا بكم"; let all_paragraphs = format!("{}\n{}\n{}{}", ltr_text, rtl_text, ltr_text, rtl_text); let bidi_info = BidiInfo::new(&all_paragraphs, None); assert_eq!(bidi_info.paragraphs.len(), 3); let p_ltr = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]); let p_rtl = Paragraph::new(&bidi_info, &bidi_info.paragraphs[1]); let p_mixed = Paragraph::new(&bidi_info, &bidi_info.paragraphs[2]); assert_eq!(p_ltr.direction(), Direction::Ltr); assert_eq!(p_rtl.direction(), Direction::Rtl); assert_eq!(p_mixed.direction(), Direction::Mixed); let all_paragraphs = &to_utf16(&all_paragraphs); let bidi_info = BidiInfoU16::new(&all_paragraphs, None); assert_eq!(bidi_info.paragraphs.len(), 3); let p_ltr = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]); let p_rtl = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[1]); let p_mixed = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[2]); assert_eq!(p_ltr.direction(), Direction::Ltr); assert_eq!(p_rtl.direction(), Direction::Rtl); assert_eq!(p_mixed.direction(), Direction::Mixed); } #[test] fn test_edge_cases_direction() { // No paragraphs for empty text. 
let empty = ""; let bidi_info = BidiInfo::new(empty, Option::from(RTL_LEVEL)); assert_eq!(bidi_info.paragraphs.len(), 0); let empty = &to_utf16(empty); let bidi_info = BidiInfoU16::new(empty, Option::from(RTL_LEVEL)); assert_eq!(bidi_info.paragraphs.len(), 0); let tests = vec![ // The paragraph separator will take the value of the default direction // which is left to right. ("\n", None, Direction::Ltr), // The paragraph separator will take the value of the given initial direction // which is left to right. ("\n", Option::from(LTR_LEVEL), Direction::Ltr), // The paragraph separator will take the value of the given initial direction // which is right to left. ("\n", Option::from(RTL_LEVEL), Direction::Rtl), ]; for t in tests { let bidi_info = BidiInfo::new(t.0, t.1); assert_eq!(bidi_info.paragraphs.len(), 1); let p = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]); assert_eq!(p.direction(), t.2); let text = &to_utf16(t.0); let bidi_info = BidiInfoU16::new(text, t.1); let p = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]); assert_eq!(p.direction(), t.2); } } #[test] fn test_level_at() { let ltr_text = "hello world"; let rtl_text = "أهلا بكم"; let all_paragraphs = format!("{}\n{}\n{}{}", ltr_text, rtl_text, ltr_text, rtl_text); let bidi_info = BidiInfo::new(&all_paragraphs, None); assert_eq!(bidi_info.paragraphs.len(), 3); let p_ltr = Paragraph::new(&bidi_info, &bidi_info.paragraphs[0]); let p_rtl = Paragraph::new(&bidi_info, &bidi_info.paragraphs[1]); let p_mixed = Paragraph::new(&bidi_info, &bidi_info.paragraphs[2]); assert_eq!(p_ltr.level_at(0), LTR_LEVEL); assert_eq!(p_rtl.level_at(0), RTL_LEVEL); assert_eq!(p_mixed.level_at(0), LTR_LEVEL); assert_eq!(p_mixed.info.levels.len(), 54); assert_eq!(p_mixed.para.range.start, 28); assert_eq!(p_mixed.level_at(ltr_text.len()), RTL_LEVEL); let all_paragraphs = &to_utf16(&all_paragraphs); let bidi_info = BidiInfoU16::new(&all_paragraphs, None); assert_eq!(bidi_info.paragraphs.len(), 3); let p_ltr = 
ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[0]); let p_rtl = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[1]); let p_mixed = ParagraphU16::new(&bidi_info, &bidi_info.paragraphs[2]); assert_eq!(p_ltr.level_at(0), LTR_LEVEL); assert_eq!(p_rtl.level_at(0), RTL_LEVEL); assert_eq!(p_mixed.level_at(0), LTR_LEVEL); assert_eq!(p_mixed.info.levels.len(), 40); assert_eq!(p_mixed.para.range.start, 21); assert_eq!(p_mixed.level_at(ltr_text.len()), RTL_LEVEL); } #[test] fn test_get_base_direction() { let tests = vec![ ("", Direction::Mixed), // return Mixed if no strong character found ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed), ("3.14\npi", Direction::Mixed), // only first paragraph is considered ("[123 'abc']", Direction::Ltr), ("[123 '\u{0628}' abc", Direction::Rtl), ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl), // embedded isolate is ignored ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed), ]; for t in tests { assert_eq!(get_base_direction(t.0), t.1); let text = &to_utf16(t.0); assert_eq!(get_base_direction(text.as_slice()), t.1); } } #[test] fn test_get_base_direction_full() { let tests = vec![ ("", Direction::Mixed), // return Mixed if no strong character found ("123[]-+\u{2019}\u{2060}\u{00bf}?", Direction::Mixed), ("3.14\npi", Direction::Ltr), // direction taken from the second paragraph ("3.14\n\u{05D0}", Direction::Rtl), // direction taken from the second paragraph ("[123 'abc']", Direction::Ltr), ("[123 '\u{0628}' abc", Direction::Rtl), ("[123 '\u{2066}abc\u{2069}'\u{0628}]", Direction::Rtl), // embedded isolate is ignored ("[123 '\u{2066}abc\u{2068}'\u{0628}]", Direction::Mixed), ("[123 '\u{2066}abc\u{2068}'\n\u{0628}]", Direction::Rtl), // \n resets embedding level ]; for t in tests { assert_eq!(get_base_direction_full(t.0), t.1); let text = &to_utf16(t.0); assert_eq!(get_base_direction_full(text.as_slice()), t.1); } } } #[cfg(all(feature = "serde", feature = "hardcoded-data", test))] mod serde_tests { use super::*; 
use serde_test::{assert_tokens, Token}; #[test] fn test_levels() { let text = "abc אבג"; let bidi_info = BidiInfo::new(text, None); let levels = bidi_info.levels; assert_eq!(text.as_bytes().len(), 10); assert_eq!(levels.len(), 10); assert_tokens( &levels, &[ Token::Seq { len: Some(10) }, Token::NewtypeStruct { name: "Level" }, Token::U8(0), Token::NewtypeStruct { name: "Level" }, Token::U8(0), Token::NewtypeStruct { name: "Level" }, Token::U8(0), Token::NewtypeStruct { name: "Level" }, Token::U8(0), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::NewtypeStruct { name: "Level" }, Token::U8(1), Token::SeqEnd, ], ); } } unicode-bidi-0.3.17/src/prepare.rs000064400000000000000000000437731046102023000150600ustar 00000000000000// Copyright 2015 The Servo Project Developers. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. //! 3.3.3 Preparations for Implicit Processing //! //! use alloc::vec::Vec; use core::cmp::max; use core::ops::Range; #[cfg(feature = "smallvec")] use smallvec::{smallvec, SmallVec}; use super::level::Level; use super::BidiClass::{self, *}; /// A maximal substring of characters with the same embedding level. /// /// Represented as a range of byte indices. pub type LevelRun = Range; #[cfg(feature = "smallvec")] pub type LevelRunVec = SmallVec<[LevelRun; 8]>; #[cfg(not(feature = "smallvec"))] pub type LevelRunVec = Vec; /// Output of `isolating_run_sequences` (steps X9-X10) #[derive(Debug, PartialEq)] pub struct IsolatingRunSequence { pub runs: Vec, pub sos: BidiClass, // Start-of-sequence type. 
pub eos: BidiClass, // End-of-sequence type. } #[cfg(feature = "smallvec")] pub type IsolatingRunSequenceVec = SmallVec<[IsolatingRunSequence; 8]>; #[cfg(not(feature = "smallvec"))] pub type IsolatingRunSequenceVec = Vec; /// Compute the set of isolating run sequences. /// /// An isolating run sequence is a maximal sequence of level runs such that for all level runs /// except the last one in the sequence, the last character of the run is an isolate initiator /// whose matching PDI is the first character of the next level run in the sequence. /// /// Note: This function does *not* return the sequences in order by their first characters. #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn isolating_run_sequences( para_level: Level, original_classes: &[BidiClass], levels: &[Level], runs: LevelRunVec, has_isolate_controls: bool, isolating_run_sequences: &mut IsolatingRunSequenceVec, ) { // Per http://www.unicode.org/reports/tr9/#BD13: // "In the absence of isolate initiators, each isolating run sequence in a paragraph // consists of exactly one level run, and each level run constitutes a separate // isolating run sequence." // We can take a simplified path to handle this case. if !has_isolate_controls { isolating_run_sequences.reserve_exact(runs.len()); for run in runs { // Determine the `sos` and `eos` class for the sequence. // let run_levels = &levels[run.clone()]; let run_classes = &original_classes[run.clone()]; let seq_level = run_levels[run_classes .iter() .position(|c| not_removed_by_x9(c)) .unwrap_or(0)]; let end_level = run_levels[run_classes .iter() .rposition(|c| not_removed_by_x9(c)) .unwrap_or(run.end - run.start - 1)]; // Get the level of the last non-removed char before the run. let pred_level = match original_classes[..run.start] .iter() .rposition(not_removed_by_x9) { Some(idx) => levels[idx], None => para_level, }; // Get the level of the next non-removed char after the run. let succ_level = match original_classes[run.end..] 
.iter() .position(not_removed_by_x9) { Some(idx) => levels[run.end + idx], None => para_level, }; isolating_run_sequences.push(IsolatingRunSequence { runs: vec![run], sos: max(seq_level, pred_level).bidi_class(), eos: max(end_level, succ_level).bidi_class(), }); } return; } // Compute the set of isolating run sequences. // let mut sequences = Vec::with_capacity(runs.len()); // When we encounter an isolate initiator, we push the current sequence onto the // stack so we can resume it after the matching PDI. #[cfg(feature = "smallvec")] let mut stack: SmallVec<[Vec>; 8]> = smallvec![vec![]]; #[cfg(not(feature = "smallvec"))] let mut stack = vec![vec![]]; for run in runs { assert!(!run.is_empty()); assert!(!stack.is_empty()); let start_class = original_classes[run.start]; // > In rule X10, [..] skip over any BNs when [..]. // > Do the same when determining if the last character of the sequence is an isolate initiator. // // let end_class = original_classes[run.start..run.end] .iter() .copied() .rev() .find(not_removed_by_x9) .unwrap_or(start_class); let mut sequence = if start_class == PDI && stack.len() > 1 { // Continue a previous sequence interrupted by an isolate. stack.pop().unwrap() } else { // Start a new sequence. Vec::new() }; sequence.push(run); if matches!(end_class, RLI | LRI | FSI) { // Resume this sequence after the isolate. stack.push(sequence); } else { // This sequence is finished. sequences.push(sequence); } } // Pop any remaining sequences off the stack. sequences.extend(stack.into_iter().rev().filter(|seq| !seq.is_empty())); // Determine the `sos` and `eos` class for each sequence. 
// for sequence in sequences { assert!(!sequence.is_empty()); let start_of_seq = sequence[0].start; let runs_len = sequence.len(); let end_of_seq = sequence[runs_len - 1].end; let mut result = IsolatingRunSequence { runs: sequence, sos: L, eos: L, }; // > (not counting characters removed by X9) let seq_level = levels[result .iter_forwards_from(start_of_seq, 0) .find(|i| not_removed_by_x9(&original_classes[*i])) .unwrap_or(start_of_seq)]; // XXXManishearth the spec talks of a start and end level, // but for a given IRS the two should be equivalent, yes? let end_level = levels[result .iter_backwards_from(end_of_seq, runs_len - 1) .find(|i| not_removed_by_x9(&original_classes[*i])) .unwrap_or(end_of_seq - 1)]; #[cfg(test)] for idx in result.runs.clone().into_iter().flatten() { if not_removed_by_x9(&original_classes[idx]) { assert_eq!(seq_level, levels[idx]); } } // Get the level of the last non-removed char before the runs. let pred_level = match original_classes[..start_of_seq] .iter() .rposition(not_removed_by_x9) { Some(idx) => levels[idx], None => para_level, }; // Get the last non-removed character to check if it is an isolate initiator. // The spec calls for an unmatched one, but matched isolate initiators // will never be at the end of a level run (otherwise there would be more to the run). // We unwrap_or(BN) because BN marks removed classes and it won't matter for the check. let last_non_removed = original_classes[..end_of_seq] .iter() .copied() .rev() .find(not_removed_by_x9) .unwrap_or(BN); // Get the level of the next non-removed char after the runs. let succ_level = if matches!(last_non_removed, RLI | LRI | FSI) { para_level } else { match original_classes[end_of_seq..] 
.iter() .position(not_removed_by_x9) { Some(idx) => levels[end_of_seq + idx], None => para_level, } }; result.sos = max(seq_level, pred_level).bidi_class(); result.eos = max(end_level, succ_level).bidi_class(); isolating_run_sequences.push(result); } } impl IsolatingRunSequence { /// Given a text-relative position `pos` and an index of the level run it is in, /// produce an iterator of all characters after and pos (`pos..`) that are in this /// run sequence pub(crate) fn iter_forwards_from( &self, pos: usize, level_run_index: usize, ) -> impl Iterator + '_ { let runs = &self.runs[level_run_index..]; // Check that it is in range // (we can't use contains() since we want an inclusive range) #[cfg(feature = "std")] debug_assert!(runs[0].start <= pos && pos <= runs[0].end); (pos..runs[0].end).chain(runs[1..].iter().flat_map(Clone::clone)) } /// Given a text-relative position `pos` and an index of the level run it is in, /// produce an iterator of all characters before and excludingpos (`..pos`) that are in this /// run sequence pub(crate) fn iter_backwards_from( &self, pos: usize, level_run_index: usize, ) -> impl Iterator + '_ { let prev_runs = &self.runs[..level_run_index]; let current = &self.runs[level_run_index]; // Check that it is in range // (we can't use contains() since we want an inclusive range) #[cfg(feature = "std")] debug_assert!(current.start <= pos && pos <= current.end); (current.start..pos) .rev() .chain(prev_runs.iter().rev().flat_map(Clone::clone)) } } /// Finds the level runs in a paragraph. /// /// /// /// This is only used by tests; normally level runs are identified during explicit::compute. 
#[cfg(test)] fn level_runs(levels: &[Level], original_classes: &[BidiClass]) -> Vec { assert_eq!(levels.len(), original_classes.len()); let mut runs = Vec::new(); if levels.is_empty() { return runs; } let mut current_run_level = levels[0]; let mut current_run_start = 0; for i in 1..levels.len() { if !removed_by_x9(original_classes[i]) && levels[i] != current_run_level { // End the last run and start a new one. runs.push(current_run_start..i); current_run_level = levels[i]; current_run_start = i; } } runs.push(current_run_start..levels.len()); runs } /// Should this character be ignored in steps after X9? /// /// pub fn removed_by_x9(class: BidiClass) -> bool { matches!(class, RLE | LRE | RLO | LRO | PDF | BN) } // For use as a predicate for `position` / `rposition` pub fn not_removed_by_x9(class: &BidiClass) -> bool { !removed_by_x9(*class) } #[cfg(test)] mod tests { use super::*; #[test] fn test_level_runs() { assert_eq!(level_runs(&Level::vec(&[]), &[]), &[]); assert_eq!( level_runs(&Level::vec(&[0, 0, 0, 1, 1, 2, 0, 0]), &[L; 8]), &[0..3, 3..5, 5..6, 6..8] ); } // From #[rustfmt::skip] #[test] fn test_isolating_run_sequences() { // == Example 1 == // text1·RLE·text2·PDF·RLE·text3·PDF·text4 // index 0 1 2 3 4 5 6 7 let classes = &[L, RLE, L, PDF, RLE, L, PDF, L]; let levels = &[0, 1, 1, 1, 1, 1, 1, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); isolating_run_sequences( para_level, classes, &Level::vec(levels), level_runs(&Level::vec(levels), classes).into(), false, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), vec![vec![0..2], vec![2..7], vec![7..8]] ); // == Example 2 == // text1·RLI·text2·PDI·RLI·text3·PDI·text4 // index 0 1 2 3 4 5 6 7 let classes = &[L, RLI, L, PDI, RLI, L, PDI, L]; let levels = &[0, 0, 1, 0, 0, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); 
isolating_run_sequences( para_level, classes, &Level::vec(levels), level_runs(&Level::vec(levels), classes).into(), true, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), vec![vec![0..2, 3..5, 6..8], vec![2..3], vec![5..6]] ); // == Example 3 == // text1·RLI·text2·LRI·text3·RLE·text4·PDF·text5·PDI·text6·PDI·text7 // index 0 1 2 3 4 5 6 7 8 9 10 11 12 let classes = &[L, RLI, L, LRI, L, RLE, L, PDF, L, PDI, L, PDI, L]; let levels = &[0, 0, 1, 1, 2, 3, 3, 3, 2, 1, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); isolating_run_sequences( para_level, classes, &Level::vec(levels), level_runs(&Level::vec(levels), classes).into(), true, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); assert_eq!( sequences.iter().map(|s| s.runs.clone()).collect::>(), vec![vec![0..2, 11..13], vec![2..4, 9..11], vec![4..6], vec![6..8], vec![8..9]] ); } // From #[rustfmt::skip] #[test] fn test_isolating_run_sequences_sos_and_eos() { // == Example 1 == // text1·RLE·text2·LRE·text3·PDF·text4·PDF·RLE·text5·PDF·text6 // index 0 1 2 3 4 5 6 7 8 9 10 11 let classes = &[L, RLE, L, LRE, L, PDF, L, PDF, RLE, L, PDF, L]; let levels = &[0, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); isolating_run_sequences( para_level, classes, &Level::vec(levels), level_runs(&Level::vec(levels), classes).into(), false, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); // text1 assert_eq!( &sequences[0], &IsolatingRunSequence { runs: vec![0..2], sos: L, eos: R, } ); // text2 assert_eq!( &sequences[1], &IsolatingRunSequence { runs: vec![2..4], sos: R, eos: L, } ); // text3 assert_eq!( &sequences[2], &IsolatingRunSequence { runs: vec![4..6], sos: L, eos: L, } ); // text4 text5 assert_eq!( &sequences[3], &IsolatingRunSequence { runs: 
vec![6..11], sos: L, eos: R, } ); // text6 assert_eq!( &sequences[4], &IsolatingRunSequence { runs: vec![11..12], sos: R, eos: L, } ); // == Example 2 == // text1·RLI·text2·LRI·text3·PDI·text4·PDI·RLI·text5·PDI·text6 // index 0 1 2 3 4 5 6 7 8 9 10 11 let classes = &[L, RLI, L, LRI, L, PDI, L, PDI, RLI, L, PDI, L]; let levels = &[0, 0, 1, 1, 2, 1, 1, 0, 0, 1, 0, 0]; let para_level = Level::ltr(); let mut sequences = IsolatingRunSequenceVec::new(); isolating_run_sequences( para_level, classes, &Level::vec(levels), level_runs(&Level::vec(levels), classes).into(), true, &mut sequences); sequences.sort_by(|a, b| a.runs[0].clone().cmp(b.runs[0].clone())); // text1·RLI·PDI·RLI·PDI·text6 assert_eq!( &sequences[0], &IsolatingRunSequence { runs: vec![0..2, 7..9, 10..12], sos: L, eos: L, } ); // text2·LRI·PDI·text4 assert_eq!( &sequences[1], &IsolatingRunSequence { runs: vec![2..4, 5..7], sos: R, eos: R, } ); // text3 assert_eq!( &sequences[2], &IsolatingRunSequence { runs: vec![4..5], sos: L, eos: L, } ); // text5 assert_eq!( &sequences[3], &IsolatingRunSequence { runs: vec![9..10], sos: R, eos: R, } ); } #[test] fn test_removed_by_x9() { let rem_classes = &[RLE, LRE, RLO, LRO, PDF, BN]; let not_classes = &[L, RLI, AL, LRI, PDI]; for x in rem_classes { assert_eq!(removed_by_x9(*x), true); } for x in not_classes { assert_eq!(removed_by_x9(*x), false); } } #[test] fn test_not_removed_by_x9() { let non_x9_classes = &[L, R, AL, EN, ES, ET, AN, CS, NSM, B, S, WS, ON, LRI, RLI, FSI, PDI]; for x in non_x9_classes { assert_eq!(not_removed_by_x9(&x), true); } } } unicode-bidi-0.3.17/src/utf16.rs000064400000000000000000000720451046102023000143610ustar 00000000000000// Copyright 2023 The Mozilla Foundation. See the // COPYRIGHT file at the top-level directory of this distribution. // // Licensed under the Apache License, Version 2.0 or the MIT license // , at your // option. This file may not be copied, modified, or distributed // except according to those terms. 
use super::TextSource; use alloc::borrow::Cow; use alloc::vec::Vec; use core::char; use core::ops::Range; use crate::{ compute_bidi_info_for_para, compute_initial_info, level, para_direction, reorder_levels, reorder_visual, visual_runs_for_line, }; use crate::{ BidiClass, BidiDataSource, Direction, Level, LevelRun, ParagraphInfo, ParagraphInfoFlags, }; #[cfg(feature = "hardcoded-data")] use crate::HardcodedBidiData; /// Initial bidi information of the text (UTF-16 version). /// /// Contains the text paragraphs and `BidiClass` of its characters. #[derive(PartialEq, Debug)] pub struct InitialInfo<'text> { /// The text pub text: &'text [u16], /// The BidiClass of the character at each code unit in the text. /// If a character is multiple code units, its class will appear multiple times in the vector. pub original_classes: Vec, /// The boundaries and level of each paragraph within the text. pub paragraphs: Vec, } impl<'text> InitialInfo<'text> { /// Find the paragraphs and BidiClasses in a string of text. /// /// /// /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong /// character is found before the matching PDI. If no strong character is found, the class will /// remain FSI, and it's up to later stages to treat these as LRI when needed. /// /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this. #[cfg_attr(feature = "flame_it", flamer::flame)] #[cfg(feature = "hardcoded-data")] pub fn new(text: &[u16], default_para_level: Option) -> InitialInfo<'_> { Self::new_with_data_source(&HardcodedBidiData, text, default_para_level) } /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`] /// for Bidi data. 
If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`] /// instead (enabled with tbe default `hardcoded-data` Cargo feature) /// /// /// /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong /// character is found before the matching PDI. If no strong character is found, the class will /// remain FSI, and it's up to later stages to treat these as LRI when needed. #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a [u16], default_para_level: Option, ) -> InitialInfo<'a> { InitialInfoExt::new_with_data_source(data_source, text, default_para_level).base } } /// Extended version of InitialInfo (not public API). #[derive(PartialEq, Debug)] struct InitialInfoExt<'text> { /// The base InitialInfo for the text, recording its paragraphs and bidi classes. base: InitialInfo<'text>, /// Parallel to base.paragraphs, records whether each paragraph is "pure LTR" that /// requires no further bidi processing (i.e. there are no RTL characters or bidi /// control codes present). flags: Vec, } impl<'text> InitialInfoExt<'text> { /// Find the paragraphs and BidiClasses in a string of text, with a custom [`BidiDataSource`] /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`InitialInfo::new()`] /// instead (enabled with tbe default `hardcoded-data` Cargo feature) /// /// /// /// Also sets the class for each First Strong Isolate initiator (FSI) to LRI or RLI if a strong /// character is found before the matching PDI. If no strong character is found, the class will /// remain FSI, and it's up to later stages to treat these as LRI when needed. 
#[cfg_attr(feature = "flame_it", flamer::flame)] pub fn new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a [u16], default_para_level: Option, ) -> InitialInfoExt<'a> { let mut paragraphs = Vec::::new(); let mut flags = Vec::::new(); let (original_classes, _, _, _) = compute_initial_info( data_source, text, default_para_level, Some((&mut paragraphs, &mut flags)), ); InitialInfoExt { base: InitialInfo { text, original_classes, paragraphs, }, flags, } } } /// Bidi information of the text (UTF-16 version). /// /// The `original_classes` and `levels` vectors are indexed by code unit offsets into the text. If a /// character is multiple code units wide, then its class and level will appear multiple times in these /// vectors. // TODO: Impl `struct StringProperty { values: Vec }` and use instead of Vec #[derive(Debug, PartialEq)] pub struct BidiInfo<'text> { /// The text pub text: &'text [u16], /// The BidiClass of the character at each byte in the text. pub original_classes: Vec, /// The directional embedding level of each byte in the text. pub levels: Vec, /// The boundaries and paragraph embedding level of each paragraph within the text. /// /// TODO: Use SmallVec or similar to avoid overhead when there are only one or two paragraphs? /// Or just don't include the first paragraph, which always starts at 0? pub paragraphs: Vec, } impl<'text> BidiInfo<'text> { /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph. /// /// /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this. /// /// TODO: In early steps, check for special cases that allow later steps to be skipped. like /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison. 
/// /// TODO: Support auto-RTL base direction #[cfg_attr(feature = "flame_it", flamer::flame)] #[cfg(feature = "hardcoded-data")] #[inline] pub fn new(text: &[u16], default_para_level: Option) -> BidiInfo<'_> { Self::new_with_data_source(&HardcodedBidiData, text, default_para_level) } /// Split the text into paragraphs and determine the bidi embedding levels for each paragraph, with a custom [`BidiDataSource`] /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`] /// instead (enabled with tbe default `hardcoded-data` Cargo feature). /// /// TODO: In early steps, check for special cases that allow later steps to be skipped. like /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison. /// /// TODO: Support auto-RTL base direction #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a [u16], default_para_level: Option, ) -> BidiInfo<'a> { let InitialInfoExt { base, flags, .. } = InitialInfoExt::new_with_data_source(data_source, text, default_para_level); let mut levels = Vec::::with_capacity(text.len()); let mut processing_classes = base.original_classes.clone(); for (para, flags) in base.paragraphs.iter().zip(flags.iter()) { let text = &text[para.range.clone()]; let original_classes = &base.original_classes[para.range.clone()]; compute_bidi_info_for_para( data_source, para, flags.is_pure_ltr, flags.has_isolate_controls, text, original_classes, &mut processing_classes, &mut levels, ); } BidiInfo { text, original_classes: base.original_classes, paragraphs: base.paragraphs, levels, } } /// Produce the levels for this paragraph as needed for reordering, one level per *byte* /// in the paragraph. The returned vector includes bytes that are not included /// in the `line`, but will not adjust them. /// /// This runs [Rule L1], you can run /// [Rule L2] by calling [`Self::reorder_visual()`]. 
/// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead /// to avoid non-byte indices. /// /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`]. /// /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2 #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reordered_levels(&self, para: &ParagraphInfo, line: Range) -> Vec { assert!(line.start <= self.levels.len()); assert!(line.end <= self.levels.len()); let mut levels = self.levels.clone(); let line_classes = &self.original_classes[line.clone()]; let line_levels = &mut levels[line.clone()]; let line_str: &[u16] = &self.text[line.clone()]; reorder_levels(line_classes, line_levels, line_str, para.level); levels } /// Produce the levels for this paragraph as needed for reordering, one level per *character* /// in the paragraph. The returned vector includes characters that are not included /// in the `line`, but will not adjust them. /// /// This runs [Rule L1], you can run /// [Rule L2] by calling [`Self::reorder_visual()`]. /// If doing so, you may prefer to use [`Self::reordered_levels_per_char()`] instead /// to avoid non-byte indices. /// /// For an all-in-one reordering solution, consider using [`Self::reorder_visual()`]. /// /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2 #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reordered_levels_per_char( &self, para: &ParagraphInfo, line: Range, ) -> Vec { let levels = self.reordered_levels(para, line); self.text.char_indices().map(|(i, _)| levels[i]).collect() } /// Re-order a line based on resolved levels and return the line in display order. /// /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring. 
/// /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4 #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reorder_line(&self, para: &ParagraphInfo, line: Range) -> Cow<'text, [u16]> { if !level::has_rtl(&self.levels[line.clone()]) { return self.text[line].into(); } let (levels, runs) = self.visual_runs(para, line.clone()); reorder_line(self.text, line, levels, runs) } /// Reorders pre-calculated levels of a sequence of characters. /// /// NOTE: This is a convenience method that does not use a `Paragraph` object. It is /// intended to be used when an application has determined the levels of the objects (character sequences) /// and just needs to have them reordered. /// /// the index map will result in `indexMap[visualIndex]==logicalIndex`. /// /// This only runs [Rule L2](http://www.unicode.org/reports/tr9/#L2) as it does not have /// information about the actual text. /// /// Furthermore, if `levels` is an array that is aligned with code units, bytes within a codepoint may be /// reversed. You may need to fix up the map to deal with this. Alternatively, only pass in arrays where each `Level` /// is for a single code point. 
/// /// /// # # Example /// ``` /// use unicode_bidi::BidiInfo; /// use unicode_bidi::Level; /// /// let l0 = Level::from(0); /// let l1 = Level::from(1); /// let l2 = Level::from(2); /// /// let levels = vec![l0, l0, l0, l0]; /// let index_map = BidiInfo::reorder_visual(&levels); /// assert_eq!(levels.len(), index_map.len()); /// assert_eq!(index_map, [0, 1, 2, 3]); /// /// let levels: Vec = vec![l0, l0, l0, l1, l1, l1, l2, l2]; /// let index_map = BidiInfo::reorder_visual(&levels); /// assert_eq!(levels.len(), index_map.len()); /// assert_eq!(index_map, [0, 1, 2, 6, 7, 5, 4, 3]); /// ``` #[cfg_attr(feature = "flame_it", flamer::flame)] #[inline] pub fn reorder_visual(levels: &[Level]) -> Vec { reorder_visual(levels) } /// Find the level runs within a line and return them in visual order. /// /// `line` is a range of bytes indices within `levels`. /// /// The first return value is a vector of levels used by the reordering algorithm, /// i.e. the result of [Rule L1]. The second return value is a vector of level runs, /// the result of [Rule L2], showing the visual order that each level run (a run of text with the /// same level) should be displayed. Within each run, the display order can be checked /// against the Level vector. /// /// This does not handle [Rule L3] (combining characters) or [Rule L4] (mirroring), /// as that should be handled by the engine using this API. /// /// Conceptually, this is the same as running [`Self::reordered_levels()`] followed by /// [`Self::reorder_visual()`], however it returns the result as a list of level runs instead /// of producing a level map, since one may wish to deal with the fact that this is operating on /// byte rather than character indices. 
/// /// /// /// [Rule L1]: https://www.unicode.org/reports/tr9/#L1 /// [Rule L2]: https://www.unicode.org/reports/tr9/#L2 /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4 #[cfg_attr(feature = "flame_it", flamer::flame)] #[inline] pub fn visual_runs( &self, para: &ParagraphInfo, line: Range, ) -> (Vec, Vec) { let levels = self.reordered_levels(para, line.clone()); visual_runs_for_line(levels, &line) } /// If processed text has any computed RTL levels /// /// This information is usually used to skip re-ordering of text when no RTL level is present #[inline] pub fn has_rtl(&self) -> bool { level::has_rtl(&self.levels) } } /// Bidi information of text treated as a single paragraph. /// /// The `original_classes` and `levels` vectors are indexed by code unit offsets into the text. If a /// character is multiple code units wide, then its class and level will appear multiple times in these /// vectors. #[derive(Debug, PartialEq)] pub struct ParagraphBidiInfo<'text> { /// The text pub text: &'text [u16], /// The BidiClass of the character at each byte in the text. pub original_classes: Vec, /// The directional embedding level of each byte in the text. pub levels: Vec, /// The paragraph embedding level. pub paragraph_level: Level, /// Whether the paragraph is purely LTR. pub is_pure_ltr: bool, } impl<'text> ParagraphBidiInfo<'text> { /// Determine the bidi embedding level. /// /// /// The `hardcoded-data` Cargo feature (enabled by default) must be enabled to use this. /// /// TODO: In early steps, check for special cases that allow later steps to be skipped. like /// text that is entirely LTR. See the `nsBidi` class from Gecko for comparison. 
/// /// TODO: Support auto-RTL base direction #[cfg_attr(feature = "flame_it", flamer::flame)] #[cfg(feature = "hardcoded-data")] #[inline] pub fn new(text: &[u16], default_para_level: Option) -> ParagraphBidiInfo<'_> { Self::new_with_data_source(&HardcodedBidiData, text, default_para_level) } /// Determine the bidi embedding level, with a custom [`BidiDataSource`] /// for Bidi data. If you just wish to use the hardcoded Bidi data, please use [`BidiInfo::new()`] /// instead (enabled with tbe default `hardcoded-data` Cargo feature). /// /// (This is the single-paragraph equivalent of BidiInfo::new_with_data_source, /// and should be kept in sync with it. #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn new_with_data_source<'a, D: BidiDataSource>( data_source: &D, text: &'a [u16], default_para_level: Option, ) -> ParagraphBidiInfo<'a> { // Here we could create a ParagraphInitialInfo struct to parallel the one // used by BidiInfo, but there doesn't seem any compelling reason for it. let (original_classes, paragraph_level, is_pure_ltr, has_isolate_controls) = compute_initial_info(data_source, text, default_para_level, None); let mut levels = Vec::::with_capacity(text.len()); let mut processing_classes = original_classes.clone(); let para_info = ParagraphInfo { range: Range { start: 0, end: text.len(), }, level: paragraph_level, }; compute_bidi_info_for_para( data_source, ¶_info, is_pure_ltr, has_isolate_controls, text, &original_classes, &mut processing_classes, &mut levels, ); ParagraphBidiInfo { text, original_classes, levels, paragraph_level, is_pure_ltr, } } /// Produce the levels for this paragraph as needed for reordering, one level per *code unit* /// in the paragraph. The returned vector includes code units that are not included /// in the `line`, but will not adjust them. /// /// See BidiInfo::reordered_levels for details. /// /// (This should be kept in sync with BidiInfo::reordered_levels.) 
#[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reordered_levels(&self, line: Range) -> Vec { assert!(line.start <= self.levels.len()); assert!(line.end <= self.levels.len()); let mut levels = self.levels.clone(); let line_classes = &self.original_classes[line.clone()]; let line_levels = &mut levels[line.clone()]; reorder_levels( line_classes, line_levels, self.text.subrange(line), self.paragraph_level, ); levels } /// Produce the levels for this paragraph as needed for reordering, one level per *character* /// in the paragraph. The returned vector includes characters that are not included /// in the `line`, but will not adjust them. /// /// See BidiInfo::reordered_levels_per_char for details. /// /// (This should be kept in sync with BidiInfo::reordered_levels_per_char.) #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reordered_levels_per_char(&self, line: Range) -> Vec { let levels = self.reordered_levels(line); self.text.char_indices().map(|(i, _)| levels[i]).collect() } /// Re-order a line based on resolved levels and return the line in display order. /// /// See BidiInfo::reorder_line for details. /// /// (This should be kept in sync with BidiInfo::reorder_line.) #[cfg_attr(feature = "flame_it", flamer::flame)] pub fn reorder_line(&self, line: Range) -> Cow<'text, [u16]> { if !level::has_rtl(&self.levels[line.clone()]) { return self.text[line].into(); } let (levels, runs) = self.visual_runs(line.clone()); reorder_line(self.text, line, levels, runs) } /// Reorders pre-calculated levels of a sequence of characters. /// /// See BidiInfo::reorder_visual for details. #[cfg_attr(feature = "flame_it", flamer::flame)] #[inline] pub fn reorder_visual(levels: &[Level]) -> Vec { reorder_visual(levels) } /// Find the level runs within a line and return them in visual order. /// /// `line` is a range of code-unit indices within `levels`. /// /// See `BidiInfo::visual_runs` for details. /// /// (This should be kept in sync with BidiInfo::visual_runs.) 
#[cfg_attr(feature = "flame_it", flamer::flame)] #[inline] pub fn visual_runs(&self, line: Range) -> (Vec, Vec) { let levels = self.reordered_levels(line.clone()); visual_runs_for_line(levels, &line) } /// If processed text has any computed RTL levels /// /// This information is usually used to skip re-ordering of text when no RTL level is present #[inline] pub fn has_rtl(&self) -> bool { !self.is_pure_ltr } /// Return the paragraph's Direction (Ltr, Rtl, or Mixed) based on its levels. #[inline] pub fn direction(&self) -> Direction { para_direction(&self.levels) } } /// Return a line of the text in display order based on resolved levels. /// /// `text` the full text passed to the `BidiInfo` or `ParagraphBidiInfo` for analysis /// `line` a range of byte indices within `text` corresponding to one line /// `levels` array of `Level` values, with `line`'s levels reordered into visual order /// `runs` array of `LevelRun`s in visual order /// /// (`levels` and `runs` are the result of calling `BidiInfo::visual_runs()` or /// `ParagraphBidiInfo::visual_runs()` for the line of interest.) /// /// Returns: the reordered text of the line. /// /// This does not apply [Rule L3] or [Rule L4] around combining characters or mirroring. /// /// [Rule L3]: https://www.unicode.org/reports/tr9/#L3 /// [Rule L4]: https://www.unicode.org/reports/tr9/#L4 fn reorder_line( text: &[u16], line: Range, levels: Vec, runs: Vec, ) -> Cow<'_, [u16]> { // If all isolating run sequences are LTR, no reordering is needed if runs.iter().all(|run| levels[run.start].is_ltr()) { return text[line].into(); } let mut result = Vec::::with_capacity(line.len()); for run in runs { if levels[run.start].is_rtl() { let mut buf = [0; 2]; for c in text[run].chars().rev() { result.extend(c.encode_utf16(&mut buf).iter()); } } else { result.extend(text[run].iter()); } } result.into() } /// Contains a reference of `BidiInfo` and one of its `paragraphs`. 
/// And it supports all operation in the `Paragraph` that needs also its /// `BidiInfo` such as `direction`. #[derive(Debug)] pub struct Paragraph<'a, 'text> { pub info: &'a BidiInfo<'text>, pub para: &'a ParagraphInfo, } impl<'a, 'text> Paragraph<'a, 'text> { #[inline] pub fn new(info: &'a BidiInfo<'text>, para: &'a ParagraphInfo) -> Paragraph<'a, 'text> { Paragraph { info, para } } /// Returns if the paragraph is Left direction, right direction or mixed. #[inline] pub fn direction(&self) -> Direction { para_direction(&self.info.levels[self.para.range.clone()]) } /// Returns the `Level` of a certain character in the paragraph. #[inline] pub fn level_at(&self, pos: usize) -> Level { let actual_position = self.para.range.start + pos; self.info.levels[actual_position] } } /// Implementation of TextSource for UTF-16 text in a [u16] array. /// Note that there could be unpaired surrogates present! // Convenience functions to check whether a UTF16 code unit is a surrogate. #[inline] fn is_high_surrogate(code: u16) -> bool { (code & 0xFC00) == 0xD800 } #[inline] fn is_low_surrogate(code: u16) -> bool { (code & 0xFC00) == 0xDC00 } impl<'text> TextSource<'text> for [u16] { type CharIter = Utf16CharIter<'text>; type CharIndexIter = Utf16CharIndexIter<'text>; type IndexLenIter = Utf16IndexLenIter<'text>; #[inline] fn len(&self) -> usize { (self as &[u16]).len() } fn char_at(&self, index: usize) -> Option<(char, usize)> { if index >= self.len() { return None; } // Get the indicated code unit and try simply converting it to a char; // this will fail if it is half of a surrogate pair. let c = self[index]; if let Some(ch) = char::from_u32(c.into()) { return Some((ch, 1)); } // If it's a low surrogate, and was immediately preceded by a high surrogate, // then we're in the middle of a (valid) character, and should return None. 
if is_low_surrogate(c) && index > 0 && is_high_surrogate(self[index - 1]) { return None; } // Otherwise, try to decode, returning REPLACEMENT_CHARACTER for errors. if let Some(ch) = char::decode_utf16(self[index..].iter().cloned()).next() { if let Ok(ch) = ch { // This must be a surrogate pair, otherwise char::from_u32() above should // have succeeded! debug_assert!(ch.len_utf16() == 2, "BMP should have already been handled"); return Some((ch, ch.len_utf16())); } } else { debug_assert!( false, "Why did decode_utf16 return None when we're not at the end?" ); return None; } // Failed to decode UTF-16: we must have encountered an unpaired surrogate. // Return REPLACEMENT_CHARACTER (not None), to continue processing the following text // and keep indexing correct. Some((char::REPLACEMENT_CHARACTER, 1)) } #[inline] fn subrange(&self, range: Range) -> &Self { &(self as &[u16])[range] } #[inline] fn chars(&'text self) -> Self::CharIter { Utf16CharIter::new(self) } #[inline] fn char_indices(&'text self) -> Self::CharIndexIter { Utf16CharIndexIter::new(self) } #[inline] fn indices_lengths(&'text self) -> Self::IndexLenIter { Utf16IndexLenIter::new(self) } #[inline] fn char_len(ch: char) -> usize { ch.len_utf16() } } /// Iterator over UTF-16 text in a [u16] slice, returning (index, char_len) tuple. #[derive(Debug)] pub struct Utf16IndexLenIter<'text> { text: &'text [u16], cur_pos: usize, } impl<'text> Utf16IndexLenIter<'text> { #[inline] pub fn new(text: &'text [u16]) -> Self { Utf16IndexLenIter { text, cur_pos: 0 } } } impl Iterator for Utf16IndexLenIter<'_> { type Item = (usize, usize); #[inline] fn next(&mut self) -> Option { if let Some((_, char_len)) = self.text.char_at(self.cur_pos) { let result = (self.cur_pos, char_len); self.cur_pos += char_len; return Some(result); } None } } /// Iterator over UTF-16 text in a [u16] slice, returning (index, char) tuple. 
#[derive(Debug)] pub struct Utf16CharIndexIter<'text> { text: &'text [u16], cur_pos: usize, } impl<'text> Utf16CharIndexIter<'text> { pub fn new(text: &'text [u16]) -> Self { Utf16CharIndexIter { text, cur_pos: 0 } } } impl Iterator for Utf16CharIndexIter<'_> { type Item = (usize, char); fn next(&mut self) -> Option { if let Some((ch, char_len)) = self.text.char_at(self.cur_pos) { let result = (self.cur_pos, ch); self.cur_pos += char_len; return Some(result); } None } } /// Iterator over UTF-16 text in a [u16] slice, returning Unicode chars. /// (Unlike the other iterators above, this also supports reverse iteration.) #[derive(Debug)] pub struct Utf16CharIter<'text> { text: &'text [u16], cur_pos: usize, end_pos: usize, } impl<'text> Utf16CharIter<'text> { pub fn new(text: &'text [u16]) -> Self { Utf16CharIter { text, cur_pos: 0, end_pos: text.len(), } } } impl Iterator for Utf16CharIter<'_> { type Item = char; fn next(&mut self) -> Option { if let Some((ch, char_len)) = self.text.char_at(self.cur_pos) { self.cur_pos += char_len; return Some(ch); } None } } impl DoubleEndedIterator for Utf16CharIter<'_> { fn next_back(&mut self) -> Option { if self.end_pos <= self.cur_pos { return None; } self.end_pos -= 1; if let Some(ch) = char::from_u32(self.text[self.end_pos] as u32) { return Some(ch); } if self.end_pos > self.cur_pos { if let Some((ch, char_len)) = self.text.char_at(self.end_pos - 1) { if char_len == 2 { self.end_pos -= 1; return Some(ch); } } } Some(char::REPLACEMENT_CHARACTER) } }