regex-syntax-0.8.2/.cargo_vcs_info.json0000644000000001520000000000100134770ustar { "git": { "sha1": "1dbeee73b9fcde708502d3d5f799b198fe3a6cf5" }, "path_in_vcs": "regex-syntax" }regex-syntax-0.8.2/Cargo.toml0000644000000026200000000000100114770ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.65" name = "regex-syntax" version = "0.8.2" authors = [ "The Rust Project Developers", "Andrew Gallant ", ] description = "A regular expression parser." documentation = "https://docs.rs/regex-syntax" readme = "README.md" license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" [package.metadata.docs.rs] all-features = true rustdoc-args = [ "--cfg", "docsrs", ] [dependencies.arbitrary] version = "1.3.0" features = ["derive"] optional = true [features] arbitrary = ["dep:arbitrary"] default = [ "std", "unicode", ] std = [] unicode = [ "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", ] unicode-age = [] unicode-bool = [] unicode-case = [] unicode-gencat = [] unicode-perl = [] unicode-script = [] unicode-segment = [] regex-syntax-0.8.2/Cargo.toml.orig0000644000000026660000000000100124500ustar [package] name = "regex-syntax" version = "0.8.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" documentation = "https://docs.rs/regex-syntax" description = "A regular expression parser." 
workspace = ".." edition = "2021" rust-version = "1.65" # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features [features] default = ["std", "unicode"] std = [] arbitrary = ["dep:arbitrary"] unicode = [ "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", ] unicode-age = [] unicode-bool = [] unicode-case = [] unicode-gencat = [] unicode-perl = [] unicode-script = [] unicode-segment = [] [dependencies] arbitrary = { version = "1.3.0", features = ["derive"], optional = true } [package.metadata.docs.rs] # We want to document all features. all-features = true # Since this crate's feature setup is pretty complicated, it is worth opting # into a nightly unstable option to show the features that need to be enabled # for public API items. To do that, we set 'docsrs', and when that's enabled, # we enable the 'doc_auto_cfg' feature. # # To test this locally, run: # # RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features rustdoc-args = ["--cfg", "docsrs"] regex-syntax-0.8.2/Cargo.toml.orig000064400000000000000000000026661046102023000151720ustar 00000000000000[package] name = "regex-syntax" version = "0.8.2" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-syntax" documentation = "https://docs.rs/regex-syntax" description = "A regular expression parser." workspace = ".." 
edition = "2021" rust-version = "1.65" # Features are documented in the "Crate features" section of the crate docs: # https://docs.rs/regex-syntax/*/#crate-features [features] default = ["std", "unicode"] std = [] arbitrary = ["dep:arbitrary"] unicode = [ "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", ] unicode-age = [] unicode-bool = [] unicode-case = [] unicode-gencat = [] unicode-perl = [] unicode-script = [] unicode-segment = [] [dependencies] arbitrary = { version = "1.3.0", features = ["derive"], optional = true } [package.metadata.docs.rs] # We want to document all features. all-features = true # Since this crate's feature setup is pretty complicated, it is worth opting # into a nightly unstable option to show the features that need to be enabled # for public API items. To do that, we set 'docsrs', and when that's enabled, # we enable the 'doc_auto_cfg' feature. # # To test this locally, run: # # RUSTDOCFLAGS="--cfg docsrs" cargo +nightly doc --all-features rustdoc-args = ["--cfg", "docsrs"] regex-syntax-0.8.2/LICENSE-APACHE000064400000000000000000000251371046102023000142250ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. regex-syntax-0.8.2/LICENSE-MIT000064400000000000000000000020571046102023000137310ustar 00000000000000Copyright (c) 2014 The Rust Project Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. regex-syntax-0.8.2/README.md000064400000000000000000000074011046102023000135520ustar 00000000000000regex-syntax ============ This crate provides a robust regular expression parser. 
[![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![Crates.io](https://img.shields.io/crates/v/regex-syntax.svg)](https://crates.io/crates/regex-syntax) ### Documentation https://docs.rs/regex-syntax ### Overview There are two primary types exported by this crate: `Ast` and `Hir`. The former is a faithful abstract syntax of a regular expression, and can convert regular expressions back to their concrete syntax while mostly preserving its original form. The latter type is a high level intermediate representation of a regular expression that is amenable to analysis and compilation into byte codes or automata. An `Hir` achieves this by drastically simplifying the syntactic structure of the regular expression. While an `Hir` can be converted back to its equivalent concrete syntax, the result is unlikely to resemble the original concrete syntax that produced the `Hir`. ### Example This example shows how to parse a pattern string into its HIR: ```rust use regex_syntax::{hir::Hir, parse}; let hir = parse("a|b").unwrap(); assert_eq!(hir, Hir::alternation(vec![ Hir::literal("a".as_bytes()), Hir::literal("b".as_bytes()), ])); ``` ### Safety This crate has no `unsafe` code and sets `forbid(unsafe_code)`. While it's possible this crate could use `unsafe` code in the future, the standard for doing so is extremely high. In general, most code in this crate is not performance critical, since it tends to be dwarfed by the time it takes to compile a regular expression into an automaton. Therefore, there is little need for extreme optimization, and therefore, use of `unsafe`. The standard for using `unsafe` in this crate is extremely high because this crate is intended to be reasonably safe to use with user supplied regular expressions. Therefore, while there may be bugs in the regex parser itself, they should _never_ result in memory unsafety unless there is either a bug in the compiler or the standard library. 
(Since `regex-syntax` has zero dependencies.) ### Crate features By default, this crate bundles a fairly large amount of Unicode data tables (a source size of ~750KB). Because of their large size, one can disable some or all of these data tables. If a regular expression attempts to use Unicode data that is not available, then an error will occur when translating the `Ast` to the `Hir`. The full set of features one can disable are [in the "Crate features" section of the documentation](https://docs.rs/regex-syntax/*/#crate-features). ### Testing Simply running `cargo test` will give you very good coverage. However, because of the large number of features exposed by this crate, a `test` script is included in this directory which will test several feature combinations. This is the same script that is run in CI. ### Motivation The primary purpose of this crate is to provide the parser used by `regex`. Specifically, this crate is treated as an implementation detail of the `regex`, and is primarily developed for the needs of `regex`. Since this crate is an implementation detail of `regex`, it may experience breaking change releases at a different cadence from `regex`. This is only possible because this crate is _not_ a public dependency of `regex`. Another consequence of this de-coupling is that there is no direct way to compile a `regex::Regex` from a `regex_syntax::hir::Hir`. Instead, one must first convert the `Hir` to a string (via its `std::fmt::Display`) and then compile that via `Regex::new`. While this does repeat some work, compilation typically takes much longer than parsing. Stated differently, the coupling between `regex` and `regex-syntax` exists only at the level of the concrete syntax. regex-syntax-0.8.2/src/ast/mod.rs000064400000000000000000001676421046102023000150140ustar 00000000000000/*! Defines an abstract syntax for regular expressions. 
*/ use core::cmp::Ordering; use alloc::{boxed::Box, string::String, vec, vec::Vec}; pub use crate::ast::visitor::{visit, Visitor}; pub mod parse; pub mod print; mod visitor; /// An error that occurred while parsing a regular expression into an abstract /// syntax tree. /// /// Note that not all ASTs represents a valid regular expression. For example, /// an AST is constructed without error for `\p{Quux}`, but `Quux` is not a /// valid Unicode property name. That particular error is reported when /// translating an AST to the high-level intermediate representation (`HIR`). #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Error { /// The kind of error. kind: ErrorKind, /// The original pattern that the parser generated the error from. Every /// span in an error is a valid range into this string. pattern: String, /// The span of this error. span: Span, } impl Error { /// Return the type of this error. pub fn kind(&self) -> &ErrorKind { &self.kind } /// The original pattern string in which this error occurred. /// /// Every span reported by this error is reported in terms of this string. pub fn pattern(&self) -> &str { &self.pattern } /// Return the span at which this error occurred. pub fn span(&self) -> &Span { &self.span } /// Return an auxiliary span. This span exists only for some errors that /// benefit from being able to point to two locations in the original /// regular expression. For example, "duplicate" errors will have the /// main error position set to the duplicate occurrence while its /// auxiliary span will be set to the initial occurrence. pub fn auxiliary_span(&self) -> Option<&Span> { use self::ErrorKind::*; match self.kind { FlagDuplicate { ref original } => Some(original), FlagRepeatedNegation { ref original, .. } => Some(original), GroupNameDuplicate { ref original, .. } => Some(original), _ => None, } } } /// The type of an error that occurred while building an AST. 
/// /// This error type is marked as `non_exhaustive`. This means that adding a /// new variant is not considered a breaking change. #[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ErrorKind { /// The capturing group limit was exceeded. /// /// Note that this represents a limit on the total number of capturing /// groups in a regex and not necessarily the number of nested capturing /// groups. That is, the nest limit can be low and it is still possible for /// this error to occur. CaptureLimitExceeded, /// An invalid escape sequence was found in a character class set. ClassEscapeInvalid, /// An invalid character class range was found. An invalid range is any /// range where the start is greater than the end. ClassRangeInvalid, /// An invalid range boundary was found in a character class. Range /// boundaries must be a single literal codepoint, but this error indicates /// that something else was found, such as a nested class. ClassRangeLiteral, /// An opening `[` was found with no corresponding closing `]`. ClassUnclosed, /// Note that this error variant is no longer used. Namely, a decimal /// number can only appear as a repetition quantifier. When the number /// in a repetition quantifier is empty, then it gets its own specialized /// error, `RepetitionCountDecimalEmpty`. DecimalEmpty, /// An invalid decimal number was given where one was expected. DecimalInvalid, /// A bracketed hex literal was empty. EscapeHexEmpty, /// A bracketed hex literal did not correspond to a Unicode scalar value. EscapeHexInvalid, /// An invalid hexadecimal digit was found. EscapeHexInvalidDigit, /// EOF was found before an escape sequence was completed. EscapeUnexpectedEof, /// An unrecognized escape sequence. EscapeUnrecognized, /// A dangling negation was used when setting flags, e.g., `i-`. FlagDanglingNegation, /// A flag was used twice, e.g., `i-i`. 
FlagDuplicate { /// The position of the original flag. The error position /// points to the duplicate flag. original: Span, }, /// The negation operator was used twice, e.g., `-i-s`. FlagRepeatedNegation { /// The position of the original negation operator. The error position /// points to the duplicate negation operator. original: Span, }, /// Expected a flag but got EOF, e.g., `(?`. FlagUnexpectedEof, /// Unrecognized flag, e.g., `a`. FlagUnrecognized, /// A duplicate capture name was found. GroupNameDuplicate { /// The position of the initial occurrence of the capture name. The /// error position itself points to the duplicate occurrence. original: Span, }, /// A capture group name is empty, e.g., `(?P<>abc)`. GroupNameEmpty, /// An invalid character was seen for a capture group name. This includes /// errors where the first character is a digit (even though subsequent /// characters are allowed to be digits). GroupNameInvalid, /// A closing `>` could not be found for a capture group name. GroupNameUnexpectedEof, /// An unclosed group, e.g., `(ab`. /// /// The span of this error corresponds to the unclosed parenthesis. GroupUnclosed, /// An unopened group, e.g., `ab)`. GroupUnopened, /// The nest limit was exceeded. The limit stored here is the limit /// configured in the parser. NestLimitExceeded(u32), /// The range provided in a counted repetition operator is invalid. The /// range is invalid if the start is greater than the end. RepetitionCountInvalid, /// An opening `{` was not followed by a valid decimal value. /// For example, `x{}` or `x{]}` would fail. RepetitionCountDecimalEmpty, /// An opening `{` was found with no corresponding closing `}`. RepetitionCountUnclosed, /// A repetition operator was applied to a missing sub-expression. This /// occurs, for example, in the regex consisting of just a `*` or even /// `(?i)*`. It is, however, possible to create a repetition operating on /// an empty sub-expression. For example, `()*` is still considered valid. 
RepetitionMissing, /// The special word boundary syntax, `\b{something}`, was used, but /// either EOF without `}` was seen, or an invalid character in the /// braces was seen. SpecialWordBoundaryUnclosed, /// The special word boundary syntax, `\b{something}`, was used, but /// `something` was not recognized as a valid word boundary kind. SpecialWordBoundaryUnrecognized, /// The syntax `\b{` was observed, but afterwards the end of the pattern /// was observed without being able to tell whether it was meant to be a /// bounded repetition on the `\b` or the beginning of a special word /// boundary assertion. SpecialWordOrRepetitionUnexpectedEof, /// The Unicode class is not valid. This typically occurs when a `\p` is /// followed by something other than a `{`. UnicodeClassInvalid, /// When octal support is disabled, this error is produced when an octal /// escape is used. The octal escape is assumed to be an invocation of /// a backreference, which is the common case. UnsupportedBackreference, /// When syntax similar to PCRE's look-around is used, this error is /// returned. Some example syntaxes that are rejected include, but are /// not necessarily limited to, `(?=re)`, `(?!re)`, `(?<=re)` and /// `(?) 
-> core::fmt::Result { crate::error::Formatter::from(self).fmt(f) } } impl core::fmt::Display for ErrorKind { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::ErrorKind::*; match *self { CaptureLimitExceeded => write!( f, "exceeded the maximum number of \ capturing groups ({})", u32::MAX ), ClassEscapeInvalid => { write!(f, "invalid escape sequence found in character class") } ClassRangeInvalid => write!( f, "invalid character class range, \ the start must be <= the end" ), ClassRangeLiteral => { write!(f, "invalid range boundary, must be a literal") } ClassUnclosed => write!(f, "unclosed character class"), DecimalEmpty => write!(f, "decimal literal empty"), DecimalInvalid => write!(f, "decimal literal invalid"), EscapeHexEmpty => write!(f, "hexadecimal literal empty"), EscapeHexInvalid => { write!(f, "hexadecimal literal is not a Unicode scalar value") } EscapeHexInvalidDigit => write!(f, "invalid hexadecimal digit"), EscapeUnexpectedEof => write!( f, "incomplete escape sequence, \ reached end of pattern prematurely" ), EscapeUnrecognized => write!(f, "unrecognized escape sequence"), FlagDanglingNegation => { write!(f, "dangling flag negation operator") } FlagDuplicate { .. } => write!(f, "duplicate flag"), FlagRepeatedNegation { .. } => { write!(f, "flag negation operator repeated") } FlagUnexpectedEof => { write!(f, "expected flag but got end of regex") } FlagUnrecognized => write!(f, "unrecognized flag"), GroupNameDuplicate { .. 
} => { write!(f, "duplicate capture group name") } GroupNameEmpty => write!(f, "empty capture group name"), GroupNameInvalid => write!(f, "invalid capture group character"), GroupNameUnexpectedEof => write!(f, "unclosed capture group name"), GroupUnclosed => write!(f, "unclosed group"), GroupUnopened => write!(f, "unopened group"), NestLimitExceeded(limit) => write!( f, "exceed the maximum number of \ nested parentheses/brackets ({})", limit ), RepetitionCountInvalid => write!( f, "invalid repetition count range, \ the start must be <= the end" ), RepetitionCountDecimalEmpty => { write!(f, "repetition quantifier expects a valid decimal") } RepetitionCountUnclosed => { write!(f, "unclosed counted repetition") } RepetitionMissing => { write!(f, "repetition operator missing expression") } SpecialWordBoundaryUnclosed => { write!( f, "special word boundary assertion is either \ unclosed or contains an invalid character", ) } SpecialWordBoundaryUnrecognized => { write!( f, "unrecognized special word boundary assertion, \ valid choices are: start, end, start-half \ or end-half", ) } SpecialWordOrRepetitionUnexpectedEof => { write!( f, "found either the beginning of a special word \ boundary or a bounded repetition on a \\b with \ an opening brace, but no closing brace", ) } UnicodeClassInvalid => { write!(f, "invalid Unicode character class") } UnsupportedBackreference => { write!(f, "backreferences are not supported") } UnsupportedLookAround => write!( f, "look-around, including look-ahead and look-behind, \ is not supported" ), } } } /// Span represents the position information of a single AST item. /// /// All span positions are absolute byte offsets that can be used on the /// original regular expression that was parsed. #[derive(Clone, Copy, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Span { /// The start byte offset. pub start: Position, /// The end byte offset. 
pub end: Position, } impl core::fmt::Debug for Span { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "Span({:?}, {:?})", self.start, self.end) } } impl Ord for Span { fn cmp(&self, other: &Span) -> Ordering { (&self.start, &self.end).cmp(&(&other.start, &other.end)) } } impl PartialOrd for Span { fn partial_cmp(&self, other: &Span) -> Option { Some(self.cmp(other)) } } /// A single position in a regular expression. /// /// A position encodes one half of a span, and include the byte offset, line /// number and column number. #[derive(Clone, Copy, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Position { /// The absolute offset of this position, starting at `0` from the /// beginning of the regular expression pattern string. pub offset: usize, /// The line number, starting at `1`. pub line: usize, /// The approximate column number, starting at `1`. pub column: usize, } impl core::fmt::Debug for Position { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Position(o: {:?}, l: {:?}, c: {:?})", self.offset, self.line, self.column ) } } impl Ord for Position { fn cmp(&self, other: &Position) -> Ordering { self.offset.cmp(&other.offset) } } impl PartialOrd for Position { fn partial_cmp(&self, other: &Position) -> Option { Some(self.cmp(other)) } } impl Span { /// Create a new span with the given positions. pub fn new(start: Position, end: Position) -> Span { Span { start, end } } /// Create a new span using the given position as the start and end. pub fn splat(pos: Position) -> Span { Span::new(pos, pos) } /// Create a new span by replacing the starting the position with the one /// given. pub fn with_start(self, pos: Position) -> Span { Span { start: pos, ..self } } /// Create a new span by replacing the ending the position with the one /// given. 
pub fn with_end(self, pos: Position) -> Span { Span { end: pos, ..self } } /// Returns true if and only if this span occurs on a single line. pub fn is_one_line(&self) -> bool { self.start.line == self.end.line } /// Returns true if and only if this span is empty. That is, it points to /// a single position in the concrete syntax of a regular expression. pub fn is_empty(&self) -> bool { self.start.offset == self.end.offset } } impl Position { /// Create a new position with the given information. /// /// `offset` is the absolute offset of the position, starting at `0` from /// the beginning of the regular expression pattern string. /// /// `line` is the line number, starting at `1`. /// /// `column` is the approximate column number, starting at `1`. pub fn new(offset: usize, line: usize, column: usize) -> Position { Position { offset, line, column } } } /// An abstract syntax tree for a singular expression along with comments /// found. /// /// Comments are not stored in the tree itself to avoid complexity. Each /// comment contains a span of precisely where it occurred in the original /// regular expression. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct WithComments { /// The actual ast. pub ast: Ast, /// All comments found in the original regular expression. pub comments: Vec, } /// A comment from a regular expression with an associated span. /// /// A regular expression can only contain comments when the `x` flag is /// enabled. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Comment { /// The span of this comment, including the beginning `#` and ending `\n`. pub span: Span, /// The comment text, starting with the first character following the `#` /// and ending with the last character preceding the `\n`. pub comment: String, } /// An abstract syntax tree for a single regular expression. 
/// /// An `Ast`'s `fmt::Display` implementation uses constant stack space and heap /// space proportional to the size of the `Ast`. /// /// This type defines its own destructor that uses constant stack space and /// heap space proportional to the size of the `Ast`. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum Ast { /// An empty regex that matches everything. Empty(Box), /// A set of flags, e.g., `(?is)`. Flags(Box), /// A single character literal, which includes escape sequences. Literal(Box), /// The "any character" class. Dot(Box), /// A single zero-width assertion. Assertion(Box), /// A single Unicode character class, e.g., `\pL` or `\p{Greek}`. ClassUnicode(Box), /// A single perl character class, e.g., `\d` or `\W`. ClassPerl(Box), /// A single bracketed character class set, which may contain zero or more /// character ranges and/or zero or more nested classes. e.g., /// `[a-zA-Z\pL]`. ClassBracketed(Box), /// A repetition operator applied to an arbitrary regular expression. Repetition(Box), /// A grouped regular expression. Group(Box), /// An alternation of regular expressions. Alternation(Box), /// A concatenation of regular expressions. Concat(Box), } impl Ast { /// Create an "empty" AST item. pub fn empty(span: Span) -> Ast { Ast::Empty(Box::new(span)) } /// Create a "flags" AST item. pub fn flags(e: SetFlags) -> Ast { Ast::Flags(Box::new(e)) } /// Create a "literal" AST item. pub fn literal(e: Literal) -> Ast { Ast::Literal(Box::new(e)) } /// Create a "dot" AST item. pub fn dot(span: Span) -> Ast { Ast::Dot(Box::new(span)) } /// Create a "assertion" AST item. pub fn assertion(e: Assertion) -> Ast { Ast::Assertion(Box::new(e)) } /// Create a "Unicode class" AST item. pub fn class_unicode(e: ClassUnicode) -> Ast { Ast::ClassUnicode(Box::new(e)) } /// Create a "Perl class" AST item. 
pub fn class_perl(e: ClassPerl) -> Ast { Ast::ClassPerl(Box::new(e)) } /// Create a "bracketed class" AST item. pub fn class_bracketed(e: ClassBracketed) -> Ast { Ast::ClassBracketed(Box::new(e)) } /// Create a "repetition" AST item. pub fn repetition(e: Repetition) -> Ast { Ast::Repetition(Box::new(e)) } /// Create a "group" AST item. pub fn group(e: Group) -> Ast { Ast::Group(Box::new(e)) } /// Create a "alternation" AST item. pub fn alternation(e: Alternation) -> Ast { Ast::Alternation(Box::new(e)) } /// Create a "concat" AST item. pub fn concat(e: Concat) -> Ast { Ast::Concat(Box::new(e)) } /// Return the span of this abstract syntax tree. pub fn span(&self) -> &Span { match *self { Ast::Empty(ref span) => span, Ast::Flags(ref x) => &x.span, Ast::Literal(ref x) => &x.span, Ast::Dot(ref span) => span, Ast::Assertion(ref x) => &x.span, Ast::ClassUnicode(ref x) => &x.span, Ast::ClassPerl(ref x) => &x.span, Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, Ast::Concat(ref x) => &x.span, } } /// Return true if and only if this Ast is empty. pub fn is_empty(&self) -> bool { match *self { Ast::Empty(_) => true, _ => false, } } /// Returns true if and only if this AST has any (including possibly empty) /// subexpressions. fn has_subexprs(&self) -> bool { match *self { Ast::Empty(_) | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) | Ast::ClassUnicode(_) | Ast::ClassPerl(_) => false, Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) | Ast::Concat(_) => true, } } } /// Print a display representation of this Ast. /// /// This does not preserve any of the original whitespace formatting that may /// have originally been present in the concrete syntax from which this Ast /// was generated. /// /// This implementation uses constant stack space and heap space proportional /// to the size of the `Ast`. 
impl core::fmt::Display for Ast { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use crate::ast::print::Printer; Printer::new().print(self, f) } } /// An alternation of regular expressions. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Alternation { /// The span of this alternation. pub span: Span, /// The alternate regular expressions. pub asts: Vec, } impl Alternation { /// Return this alternation as an AST. /// /// If this alternation contains zero ASTs, then `Ast::empty` is returned. /// If this alternation contains exactly 1 AST, then the corresponding AST /// is returned. Otherwise, `Ast::alternation` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), _ => Ast::alternation(self), } } } /// A concatenation of regular expressions. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Concat { /// The span of this concatenation. pub span: Span, /// The concatenation regular expressions. pub asts: Vec, } impl Concat { /// Return this concatenation as an AST. /// /// If this alternation contains zero ASTs, then `Ast::empty` is returned. /// If this alternation contains exactly 1 AST, then the corresponding AST /// is returned. Otherwise, `Ast::concat` is returned. pub fn into_ast(mut self) -> Ast { match self.asts.len() { 0 => Ast::empty(self.span), 1 => self.asts.pop().unwrap(), _ => Ast::concat(self), } } } /// A single literal expression. /// /// A literal corresponds to a single Unicode scalar value. Literals may be /// represented in their literal form, e.g., `a` or in their escaped form, /// e.g., `\x61`. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Literal { /// The span of this literal. pub span: Span, /// The kind of this literal. 
pub kind: LiteralKind, /// The Unicode scalar value corresponding to this literal. pub c: char, } impl Literal { /// If this literal was written as a `\x` hex escape, then this returns /// the corresponding byte value. Otherwise, this returns `None`. pub fn byte(&self) -> Option { match self.kind { LiteralKind::HexFixed(HexLiteralKind::X) => { u8::try_from(self.c).ok() } _ => None, } } } /// The kind of a single literal expression. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum LiteralKind { /// The literal is written verbatim, e.g., `a` or `☃`. Verbatim, /// The literal is written as an escape because it is otherwise a special /// regex meta character, e.g., `\*` or `\[`. Meta, /// The literal is written as an escape despite the fact that the escape is /// unnecessary, e.g., `\%` or `\/`. Superfluous, /// The literal is written as an octal escape, e.g., `\141`. Octal, /// The literal is written as a hex code with a fixed number of digits /// depending on the type of the escape, e.g., `\x61` or or `\u0061` or /// `\U00000061`. HexFixed(HexLiteralKind), /// The literal is written as a hex code with a bracketed number of /// digits. The only restriction is that the bracketed hex code must refer /// to a valid Unicode scalar value. HexBrace(HexLiteralKind), /// The literal is written as a specially recognized escape, e.g., `\f` /// or `\n`. Special(SpecialLiteralKind), } /// The type of a special literal. /// /// A special literal is a special escape sequence recognized by the regex /// parser, e.g., `\f` or `\n`. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum SpecialLiteralKind { /// Bell, spelled `\a` (`\x07`). Bell, /// Form feed, spelled `\f` (`\x0C`). FormFeed, /// Tab, spelled `\t` (`\x09`). Tab, /// Line feed, spelled `\n` (`\x0A`). LineFeed, /// Carriage return, spelled `\r` (`\x0D`). 
CarriageReturn, /// Vertical tab, spelled `\v` (`\x0B`). VerticalTab, /// Space, spelled `\ ` (`\x20`). Note that this can only appear when /// parsing in verbose mode. Space, } /// The type of a Unicode hex literal. /// /// Note that all variants behave the same when used with brackets. They only /// differ when used without brackets in the number of hex digits that must /// follow. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum HexLiteralKind { /// A `\x` prefix. When used without brackets, this form is limited to /// two digits. X, /// A `\u` prefix. When used without brackets, this form is limited to /// four digits. UnicodeShort, /// A `\U` prefix. When used without brackets, this form is limited to /// eight digits. UnicodeLong, } impl HexLiteralKind { /// The number of digits that must be used with this literal form when /// used without brackets. When used with brackets, there is no /// restriction on the number of digits. pub fn digits(&self) -> u32 { match *self { HexLiteralKind::X => 2, HexLiteralKind::UnicodeShort => 4, HexLiteralKind::UnicodeLong => 8, } } } /// A Perl character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassPerl { /// The span of this class. pub span: Span, /// The kind of Perl class. pub kind: ClassPerlKind, /// Whether the class is negated or not. e.g., `\d` is not negated but /// `\D` is. pub negated: bool, } /// The available Perl character classes. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassPerlKind { /// Decimal numbers. Digit, /// Whitespace. Space, /// Word characters. Word, } /// An ASCII character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassAscii { /// The span of this class. pub span: Span, /// The kind of ASCII class. 
pub kind: ClassAsciiKind, /// Whether the class is negated or not. e.g., `[[:alpha:]]` is not negated /// but `[[:^alpha:]]` is. pub negated: bool, } /// The available ASCII character classes. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassAsciiKind { /// `[0-9A-Za-z]` Alnum, /// `[A-Za-z]` Alpha, /// `[\x00-\x7F]` Ascii, /// `[ \t]` Blank, /// `[\x00-\x1F\x7F]` Cntrl, /// `[0-9]` Digit, /// `[!-~]` Graph, /// `[a-z]` Lower, /// `[ -~]` Print, /// `[!-/:-@\[-`{-~]` Punct, /// `[\t\n\v\f\r ]` Space, /// `[A-Z]` Upper, /// `[0-9A-Za-z_]` Word, /// `[0-9A-Fa-f]` Xdigit, } impl ClassAsciiKind { /// Return the corresponding ClassAsciiKind variant for the given name. /// /// The name given should correspond to the lowercase version of the /// variant name. e.g., `cntrl` is the name for `ClassAsciiKind::Cntrl`. /// /// If no variant with the corresponding name exists, then `None` is /// returned. pub fn from_name(name: &str) -> Option { use self::ClassAsciiKind::*; match name { "alnum" => Some(Alnum), "alpha" => Some(Alpha), "ascii" => Some(Ascii), "blank" => Some(Blank), "cntrl" => Some(Cntrl), "digit" => Some(Digit), "graph" => Some(Graph), "lower" => Some(Lower), "print" => Some(Print), "punct" => Some(Punct), "space" => Some(Space), "upper" => Some(Upper), "word" => Some(Word), "xdigit" => Some(Xdigit), _ => None, } } } /// A Unicode character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassUnicode { /// The span of this class. pub span: Span, /// Whether this class is negated or not. /// /// Note: be careful when using this attribute. This specifically refers /// to whether the class is written as `\p` or `\P`, where the latter /// is `negated = true`. 
However, it also possible to write something like /// `\P{scx!=Katakana}` which is actually equivalent to /// `\p{scx=Katakana}` and is therefore not actually negated even though /// `negated = true` here. To test whether this class is truly negated /// or not, use the `is_negated` method. pub negated: bool, /// The kind of Unicode class. pub kind: ClassUnicodeKind, } impl ClassUnicode { /// Returns true if this class has been negated. /// /// Note that this takes the Unicode op into account, if it's present. /// e.g., `is_negated` for `\P{scx!=Katakana}` will return `false`. pub fn is_negated(&self) -> bool { match self.kind { ClassUnicodeKind::NamedValue { op: ClassUnicodeOpKind::NotEqual, .. } => !self.negated, _ => self.negated, } } } /// The available forms of Unicode character classes. #[derive(Clone, Debug, Eq, PartialEq)] pub enum ClassUnicodeKind { /// A one letter abbreviated class, e.g., `\pN`. OneLetter(char), /// A binary property, general category or script. The string may be /// empty. Named(String), /// A property name and an associated value. NamedValue { /// The type of Unicode op used to associate `name` with `value`. op: ClassUnicodeOpKind, /// The property name (which may be empty). name: String, /// The property value (which may be empty). value: String, }, } #[cfg(feature = "arbitrary")] impl arbitrary::Arbitrary<'_> for ClassUnicodeKind { fn arbitrary( u: &mut arbitrary::Unstructured, ) -> arbitrary::Result { #[cfg(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", ))] { use alloc::string::ToString; use super::unicode_tables::{ property_names::PROPERTY_NAMES, property_values::PROPERTY_VALUES, }; match u.choose_index(3)? 
{ 0 => { let all = PROPERTY_VALUES .iter() .flat_map(|e| e.1.iter()) .filter(|(name, _)| name.len() == 1) .count(); let idx = u.choose_index(all)?; let value = PROPERTY_VALUES .iter() .flat_map(|e| e.1.iter()) .take(idx + 1) .last() .unwrap() .0 .chars() .next() .unwrap(); Ok(ClassUnicodeKind::OneLetter(value)) } 1 => { let all = PROPERTY_VALUES .iter() .map(|e| e.1.len()) .sum::() + PROPERTY_NAMES.len(); let idx = u.choose_index(all)?; let name = PROPERTY_VALUES .iter() .flat_map(|e| e.1.iter()) .chain(PROPERTY_NAMES) .map(|(_, e)| e) .take(idx + 1) .last() .unwrap(); Ok(ClassUnicodeKind::Named(name.to_string())) } 2 => { let all = PROPERTY_VALUES .iter() .map(|e| e.1.len()) .sum::(); let idx = u.choose_index(all)?; let (prop, value) = PROPERTY_VALUES .iter() .flat_map(|e| { e.1.iter().map(|(_, value)| (e.0, value)) }) .take(idx + 1) .last() .unwrap(); Ok(ClassUnicodeKind::NamedValue { op: u.arbitrary()?, name: prop.to_string(), value: value.to_string(), }) } _ => unreachable!("index chosen is impossible"), } } #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", )))] { match u.choose_index(3)? 
{ 0 => Ok(ClassUnicodeKind::OneLetter(u.arbitrary()?)), 1 => Ok(ClassUnicodeKind::Named(u.arbitrary()?)), 2 => Ok(ClassUnicodeKind::NamedValue { op: u.arbitrary()?, name: u.arbitrary()?, value: u.arbitrary()?, }), _ => unreachable!("index chosen is impossible"), } } } fn size_hint(depth: usize) -> (usize, Option) { #[cfg(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", ))] { arbitrary::size_hint::and_all(&[ usize::size_hint(depth), usize::size_hint(depth), arbitrary::size_hint::or( (0, Some(0)), ClassUnicodeOpKind::size_hint(depth), ), ]) } #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", )))] { arbitrary::size_hint::and( usize::size_hint(depth), arbitrary::size_hint::or_all(&[ char::size_hint(depth), String::size_hint(depth), arbitrary::size_hint::and_all(&[ String::size_hint(depth), String::size_hint(depth), ClassUnicodeOpKind::size_hint(depth), ]), ]), ) } } } /// The type of op used in a Unicode character class. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassUnicodeOpKind { /// A property set to a specific value, e.g., `\p{scx=Katakana}`. Equal, /// A property set to a specific value using a colon, e.g., /// `\p{scx:Katakana}`. Colon, /// A property that isn't a particular value, e.g., `\p{scx!=Katakana}`. NotEqual, } impl ClassUnicodeOpKind { /// Whether the op is an equality op or not. pub fn is_equal(&self) -> bool { match *self { ClassUnicodeOpKind::Equal | ClassUnicodeOpKind::Colon => true, _ => false, } } } /// A bracketed character class, e.g., `[a-z0-9]`. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassBracketed { /// The span of this class. 
pub span: Span, /// Whether this class is negated or not. e.g., `[a]` is not negated but /// `[^a]` is. pub negated: bool, /// The type of this set. A set is either a normal union of things, e.g., /// `[abc]` or a result of applying set operations, e.g., `[\pL--c]`. pub kind: ClassSet, } /// A character class set. /// /// This type corresponds to the internal structure of a bracketed character /// class. That is, every bracketed character is one of two types: a union of /// items (literals, ranges, other bracketed classes) or a tree of binary set /// operations. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassSet { /// An item, which can be a single literal, range, nested character class /// or a union of items. Item(ClassSetItem), /// A single binary operation (i.e., &&, -- or ~~). BinaryOp(ClassSetBinaryOp), } impl ClassSet { /// Build a set from a union. pub fn union(ast: ClassSetUnion) -> ClassSet { ClassSet::Item(ClassSetItem::Union(ast)) } /// Return the span of this character class set. pub fn span(&self) -> &Span { match *self { ClassSet::Item(ref x) => x.span(), ClassSet::BinaryOp(ref x) => &x.span, } } /// Return true if and only if this class set is empty. fn is_empty(&self) -> bool { match *self { ClassSet::Item(ClassSetItem::Empty(_)) => true, _ => false, } } } /// A single component of a character class set. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassSetItem { /// An empty item. /// /// Note that a bracketed character class cannot contain a single empty /// item. Empty items can appear when using one of the binary operators. /// For example, `[&&]` is the intersection of two empty classes. Empty(Span), /// A single literal. Literal(Literal), /// A range between two literals. Range(ClassSetRange), /// An ASCII character class, e.g., `[:alnum:]` or `[:punct:]`. 
Ascii(ClassAscii), /// A Unicode character class, e.g., `\pL` or `\p{Greek}`. Unicode(ClassUnicode), /// A perl character class, e.g., `\d` or `\W`. Perl(ClassPerl), /// A bracketed character class set, which may contain zero or more /// character ranges and/or zero or more nested classes. e.g., /// `[a-zA-Z\pL]`. Bracketed(Box), /// A union of items. Union(ClassSetUnion), } impl ClassSetItem { /// Return the span of this character class set item. pub fn span(&self) -> &Span { match *self { ClassSetItem::Empty(ref span) => span, ClassSetItem::Literal(ref x) => &x.span, ClassSetItem::Range(ref x) => &x.span, ClassSetItem::Ascii(ref x) => &x.span, ClassSetItem::Perl(ref x) => &x.span, ClassSetItem::Unicode(ref x) => &x.span, ClassSetItem::Bracketed(ref x) => &x.span, ClassSetItem::Union(ref x) => &x.span, } } } /// A single character class range in a set. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassSetRange { /// The span of this range. pub span: Span, /// The start of this range. pub start: Literal, /// The end of this range. pub end: Literal, } impl ClassSetRange { /// Returns true if and only if this character class range is valid. /// /// The only case where a range is invalid is if its start is greater than /// its end. pub fn is_valid(&self) -> bool { self.start.c <= self.end.c } } /// A union of items inside a character class set. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassSetUnion { /// The span of the items in this operation. e.g., the `a-z0-9` in /// `[^a-z0-9]` pub span: Span, /// The sequence of items that make up this union. pub items: Vec, } impl ClassSetUnion { /// Push a new item in this union. /// /// The ending position of this union's span is updated to the ending /// position of the span of the item given. 
If the union is empty, then /// the starting position of this union is set to the starting position /// of this item. /// /// In other words, if you only use this method to add items to a union /// and you set the spans on each item correctly, then you should never /// need to adjust the span of the union directly. pub fn push(&mut self, item: ClassSetItem) { if self.items.is_empty() { self.span.start = item.span().start; } self.span.end = item.span().end; self.items.push(item); } /// Return this union as a character class set item. /// /// If this union contains zero items, then an empty union is /// returned. If this concatenation contains exactly 1 item, then the /// corresponding item is returned. Otherwise, ClassSetItem::Union is /// returned. pub fn into_item(mut self) -> ClassSetItem { match self.items.len() { 0 => ClassSetItem::Empty(self.span), 1 => self.items.pop().unwrap(), _ => ClassSetItem::Union(self), } } } /// A Unicode character class set operation. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct ClassSetBinaryOp { /// The span of this operation. e.g., the `a-z--[h-p]` in `[a-z--h-p]`. pub span: Span, /// The type of this set operation. pub kind: ClassSetBinaryOpKind, /// The left hand side of the operation. pub lhs: Box, /// The right hand side of the operation. pub rhs: Box, } /// The type of a Unicode character class set operation. /// /// Note that this doesn't explicitly represent union since there is no /// explicit union operator. Concatenation inside a character class corresponds /// to the union operation. #[derive(Clone, Copy, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum ClassSetBinaryOpKind { /// The intersection of two sets, e.g., `\pN&&[a-z]`. Intersection, /// The difference of two sets, e.g., `\pN--[0-9]`. Difference, /// The symmetric difference of two sets. 
The symmetric difference is the /// set of elements belonging to one but not both sets. /// e.g., `[\pL~~[:ascii:]]`. SymmetricDifference, } /// A single zero-width assertion. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Assertion { /// The span of this assertion. pub span: Span, /// The assertion kind, e.g., `\b` or `^`. pub kind: AssertionKind, } /// An assertion kind. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum AssertionKind { /// `^` StartLine, /// `$` EndLine, /// `\A` StartText, /// `\z` EndText, /// `\b` WordBoundary, /// `\B` NotWordBoundary, /// `\b{start}` WordBoundaryStart, /// `\b{end}` WordBoundaryEnd, /// `\<` (alias for `\b{start}`) WordBoundaryStartAngle, /// `\>` (alias for `\b{end}`) WordBoundaryEndAngle, /// `\b{start-half}` WordBoundaryStartHalf, /// `\b{end-half}` WordBoundaryEndHalf, } /// A repetition operation applied to a regular expression. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Repetition { /// The span of this operation. pub span: Span, /// The actual operation. pub op: RepetitionOp, /// Whether this operation was applied greedily or not. pub greedy: bool, /// The regular expression under repetition. pub ast: Box, } /// The repetition operator itself. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct RepetitionOp { /// The span of this operator. This includes things like `+`, `*?` and /// `{m,n}`. pub span: Span, /// The type of operation. pub kind: RepetitionKind, } /// The kind of a repetition operator. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum RepetitionKind { /// `?` ZeroOrOne, /// `*` ZeroOrMore, /// `+` OneOrMore, /// `{m,n}` Range(RepetitionRange), } /// A range repetition operator. 
#[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum RepetitionRange { /// `{m}` Exactly(u32), /// `{m,}` AtLeast(u32), /// `{m,n}` Bounded(u32, u32), } impl RepetitionRange { /// Returns true if and only if this repetition range is valid. /// /// The only case where a repetition range is invalid is if it is bounded /// and its start is greater than its end. pub fn is_valid(&self) -> bool { match *self { RepetitionRange::Bounded(s, e) if s > e => false, _ => true, } } } /// A grouped regular expression. /// /// This includes both capturing and non-capturing groups. This does **not** /// include flag-only groups like `(?is)`, but does contain any group that /// contains a sub-expression, e.g., `(a)`, `(?Pa)`, `(?:a)` and /// `(?is:a)`. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Group { /// The span of this group. pub span: Span, /// The kind of this group. pub kind: GroupKind, /// The regular expression in this group. pub ast: Box, } impl Group { /// If this group is non-capturing, then this returns the (possibly empty) /// set of flags. Otherwise, `None` is returned. pub fn flags(&self) -> Option<&Flags> { match self.kind { GroupKind::NonCapturing(ref flags) => Some(flags), _ => None, } } /// Returns true if and only if this group is capturing. pub fn is_capturing(&self) -> bool { match self.kind { GroupKind::CaptureIndex(_) | GroupKind::CaptureName { .. } => true, GroupKind::NonCapturing(_) => false, } } /// Returns the capture index of this group, if this is a capturing group. /// /// This returns a capture index precisely when `is_capturing` is `true`. pub fn capture_index(&self) -> Option { match self.kind { GroupKind::CaptureIndex(i) => Some(i), GroupKind::CaptureName { ref name, .. } => Some(name.index), GroupKind::NonCapturing(_) => None, } } } /// The kind of a group. 
#[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum GroupKind { /// `(a)` CaptureIndex(u32), /// `(?a)` or `(?Pa)` CaptureName { /// True if the `?P<` syntax is used and false if the `?<` syntax is used. starts_with_p: bool, /// The capture name. name: CaptureName, }, /// `(?:a)` and `(?i:a)` NonCapturing(Flags), } /// A capture name. /// /// This corresponds to the name itself between the angle brackets in, e.g., /// `(?Pexpr)`. #[derive(Clone, Debug, Eq, PartialEq)] pub struct CaptureName { /// The span of this capture name. pub span: Span, /// The capture name. pub name: String, /// The capture index. pub index: u32, } #[cfg(feature = "arbitrary")] impl arbitrary::Arbitrary<'_> for CaptureName { fn arbitrary( u: &mut arbitrary::Unstructured, ) -> arbitrary::Result { let len = u.arbitrary_len::()?; if len == 0 { return Err(arbitrary::Error::NotEnoughData); } let mut name: String = String::new(); for _ in 0..len { let ch: char = u.arbitrary()?; let cp = u32::from(ch); let ascii_letter_offset = u8::try_from(cp % 26).unwrap(); let ascii_letter = b'a' + ascii_letter_offset; name.push(char::from(ascii_letter)); } Ok(CaptureName { span: u.arbitrary()?, name, index: u.arbitrary()? }) } fn size_hint(depth: usize) -> (usize, Option) { arbitrary::size_hint::and_all(&[ Span::size_hint(depth), usize::size_hint(depth), u32::size_hint(depth), ]) } } /// A group of flags that is not applied to a particular regular expression. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct SetFlags { /// The span of these flags, including the grouping parentheses. pub span: Span, /// The actual sequence of flags. pub flags: Flags, } /// A group of flags. /// /// This corresponds only to the sequence of flags themselves, e.g., `is-u`. 
#[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct Flags { /// The span of this group of flags. pub span: Span, /// A sequence of flag items. Each item is either a flag or a negation /// operator. pub items: Vec, } impl Flags { /// Add the given item to this sequence of flags. /// /// If the item was added successfully, then `None` is returned. If the /// given item is a duplicate, then `Some(i)` is returned, where /// `items[i].kind == item.kind`. pub fn add_item(&mut self, item: FlagsItem) -> Option { for (i, x) in self.items.iter().enumerate() { if x.kind == item.kind { return Some(i); } } self.items.push(item); None } /// Returns the state of the given flag in this set. /// /// If the given flag is in the set but is negated, then `Some(false)` is /// returned. /// /// If the given flag is in the set and is not negated, then `Some(true)` /// is returned. /// /// Otherwise, `None` is returned. pub fn flag_state(&self, flag: Flag) -> Option { let mut negated = false; for x in &self.items { match x.kind { FlagsItemKind::Negation => { negated = true; } FlagsItemKind::Flag(ref xflag) if xflag == &flag => { return Some(!negated); } _ => {} } } None } } /// A single item in a group of flags. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub struct FlagsItem { /// The span of this item. pub span: Span, /// The kind of this item. pub kind: FlagsItemKind, } /// The kind of an item in a group of flags. #[derive(Clone, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum FlagsItemKind { /// A negation operator applied to all subsequent flags in the enclosing /// group. Negation, /// A single flag in a group. Flag(Flag), } impl FlagsItemKind { /// Returns true if and only if this item is a negation operator. 
pub fn is_negation(&self) -> bool { match *self { FlagsItemKind::Negation => true, _ => false, } } } /// A single flag. #[derive(Clone, Copy, Debug, Eq, PartialEq)] #[cfg_attr(feature = "arbitrary", derive(arbitrary::Arbitrary))] pub enum Flag { /// `i` CaseInsensitive, /// `m` MultiLine, /// `s` DotMatchesNewLine, /// `U` SwapGreed, /// `u` Unicode, /// `R` CRLF, /// `x` IgnoreWhitespace, } /// A custom `Drop` impl is used for `Ast` such that it uses constant stack /// space but heap space proportional to the depth of the `Ast`. impl Drop for Ast { fn drop(&mut self) { use core::mem; match *self { Ast::Empty(_) | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) | Ast::ClassUnicode(_) | Ast::ClassPerl(_) // Bracketed classes are recursive, they get their own Drop impl. | Ast::ClassBracketed(_) => return, Ast::Repetition(ref x) if !x.ast.has_subexprs() => return, Ast::Group(ref x) if !x.ast.has_subexprs() => return, Ast::Alternation(ref x) if x.asts.is_empty() => return, Ast::Concat(ref x) if x.asts.is_empty() => return, _ => {} } let empty_span = || Span::splat(Position::new(0, 0, 0)); let empty_ast = || Ast::empty(empty_span()); let mut stack = vec![mem::replace(self, empty_ast())]; while let Some(mut ast) = stack.pop() { match ast { Ast::Empty(_) | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) | Ast::ClassUnicode(_) | Ast::ClassPerl(_) // Bracketed classes are recursive, so they get their own Drop // impl. | Ast::ClassBracketed(_) => {} Ast::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } Ast::Group(ref mut x) => { stack.push(mem::replace(&mut x.ast, empty_ast())); } Ast::Alternation(ref mut x) => { stack.extend(x.asts.drain(..)); } Ast::Concat(ref mut x) => { stack.extend(x.asts.drain(..)); } } } } } /// A custom `Drop` impl is used for `ClassSet` such that it uses constant /// stack space but heap space proportional to the depth of the `ClassSet`. 
impl Drop for ClassSet { fn drop(&mut self) { use core::mem; match *self { ClassSet::Item(ref item) => match *item { ClassSetItem::Empty(_) | ClassSetItem::Literal(_) | ClassSetItem::Range(_) | ClassSetItem::Ascii(_) | ClassSetItem::Unicode(_) | ClassSetItem::Perl(_) => return, ClassSetItem::Bracketed(ref x) => { if x.kind.is_empty() { return; } } ClassSetItem::Union(ref x) => { if x.items.is_empty() { return; } } }, ClassSet::BinaryOp(ref op) => { if op.lhs.is_empty() && op.rhs.is_empty() { return; } } } let empty_span = || Span::splat(Position::new(0, 0, 0)); let empty_set = || ClassSet::Item(ClassSetItem::Empty(empty_span())); let mut stack = vec![mem::replace(self, empty_set())]; while let Some(mut set) = stack.pop() { match set { ClassSet::Item(ref mut item) => match *item { ClassSetItem::Empty(_) | ClassSetItem::Literal(_) | ClassSetItem::Range(_) | ClassSetItem::Ascii(_) | ClassSetItem::Unicode(_) | ClassSetItem::Perl(_) => {} ClassSetItem::Bracketed(ref mut x) => { stack.push(mem::replace(&mut x.kind, empty_set())); } ClassSetItem::Union(ref mut x) => { stack.extend(x.items.drain(..).map(ClassSet::Item)); } }, ClassSet::BinaryOp(ref mut op) => { stack.push(mem::replace(&mut op.lhs, empty_set())); stack.push(mem::replace(&mut op.rhs, empty_set())); } } } } } #[cfg(test)] mod tests { use super::*; // We use a thread with an explicit stack size to test that our destructor // for Ast can handle arbitrarily sized expressions in constant stack // space. In case we run on a platform without threads (WASM?), we limit // this test to Windows/Unix. 
#[test] #[cfg(any(unix, windows))] fn no_stack_overflow_on_drop() { use std::thread; let run = || { let span = || Span::splat(Position::new(0, 0, 0)); let mut ast = Ast::empty(span()); for i in 0..200 { ast = Ast::group(Group { span: span(), kind: GroupKind::CaptureIndex(i), ast: Box::new(ast), }); } assert!(!ast.is_empty()); }; // We run our test on a thread with a small stack size so we can // force the issue more easily. // // NOTE(2023-03-21): It turns out that some platforms (like FreeBSD) // will just barf with very small stack sizes. So we bump this up a bit // to give more room to breath. When I did this, I confirmed that if // I remove the custom `Drop` impl for `Ast`, then this test does // indeed still fail with a stack overflow. (At the time of writing, I // had to bump it all the way up to 32K before the test would pass even // without the custom `Drop` impl. So 16K seems like a safe number // here.) // // See: https://github.com/rust-lang/regex/issues/967 thread::Builder::new() .stack_size(16 << 10) .spawn(run) .unwrap() .join() .unwrap(); } // This tests that our `Ast` has a reasonable size. This isn't a hard rule // and it can be increased if given a good enough reason. But this test // exists because the size of `Ast` was at one point over 200 bytes on a // 64-bit target. Wow. #[test] fn ast_size() { let max = 2 * core::mem::size_of::(); let size = core::mem::size_of::(); assert!( size <= max, "Ast size of {} bytes is bigger than suggested max {}", size, max ); } } regex-syntax-0.8.2/src/ast/parse.rs000064400000000000000000006540731046102023000153460ustar 00000000000000/*! This module provides a regular expression parser. 
*/ use core::{ borrow::Borrow, cell::{Cell, RefCell}, mem, }; use alloc::{ boxed::Box, string::{String, ToString}, vec, vec::Vec, }; use crate::{ ast::{self, Ast, Position, Span}, either::Either, is_escapeable_character, is_meta_character, }; type Result = core::result::Result; /// A primitive is an expression with no sub-expressions. This includes /// literals, assertions and non-set character classes. This representation /// is used as intermediate state in the parser. /// /// This does not include ASCII character classes, since they can only appear /// within a set character class. #[derive(Clone, Debug, Eq, PartialEq)] enum Primitive { Literal(ast::Literal), Assertion(ast::Assertion), Dot(Span), Perl(ast::ClassPerl), Unicode(ast::ClassUnicode), } impl Primitive { /// Return the span of this primitive. fn span(&self) -> &Span { match *self { Primitive::Literal(ref x) => &x.span, Primitive::Assertion(ref x) => &x.span, Primitive::Dot(ref span) => span, Primitive::Perl(ref x) => &x.span, Primitive::Unicode(ref x) => &x.span, } } /// Convert this primitive into a proper AST. fn into_ast(self) -> Ast { match self { Primitive::Literal(lit) => Ast::literal(lit), Primitive::Assertion(assert) => Ast::assertion(assert), Primitive::Dot(span) => Ast::dot(span), Primitive::Perl(cls) => Ast::class_perl(cls), Primitive::Unicode(cls) => Ast::class_unicode(cls), } } /// Convert this primitive into an item in a character class. /// /// If this primitive is not a legal item (i.e., an assertion or a dot), /// then return an error. fn into_class_set_item>( self, p: &ParserI<'_, P>, ) -> Result { use self::Primitive::*; use crate::ast::ClassSetItem; match self { Literal(lit) => Ok(ClassSetItem::Literal(lit)), Perl(cls) => Ok(ClassSetItem::Perl(cls)), Unicode(cls) => Ok(ClassSetItem::Unicode(cls)), x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)), } } /// Convert this primitive into a literal in a character class. 
In /// particular, literals are the only valid items that can appear in /// ranges. /// /// If this primitive is not a legal item (i.e., a class, assertion or a /// dot), then return an error. fn into_class_literal>( self, p: &ParserI<'_, P>, ) -> Result { use self::Primitive::*; match self { Literal(lit) => Ok(lit), x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)), } } } /// Returns true if the given character is a hexadecimal digit. fn is_hex(c: char) -> bool { ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F') } /// Returns true if the given character is a valid in a capture group name. /// /// If `first` is true, then `c` is treated as the first character in the /// group name (which must be alphabetic or underscore). fn is_capture_char(c: char, first: bool) -> bool { if first { c == '_' || c.is_alphabetic() } else { c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric() } } /// A builder for a regular expression parser. /// /// This builder permits modifying configuration options for the parser. #[derive(Clone, Debug)] pub struct ParserBuilder { ignore_whitespace: bool, nest_limit: u32, octal: bool, } impl Default for ParserBuilder { fn default() -> ParserBuilder { ParserBuilder::new() } } impl ParserBuilder { /// Create a new parser builder with a default configuration. pub fn new() -> ParserBuilder { ParserBuilder { ignore_whitespace: false, nest_limit: 250, octal: false, } } /// Build a parser from this configuration with the given pattern. 
pub fn build(&self) -> Parser { Parser { pos: Cell::new(Position { offset: 0, line: 1, column: 1 }), capture_index: Cell::new(0), nest_limit: self.nest_limit, octal: self.octal, initial_ignore_whitespace: self.ignore_whitespace, ignore_whitespace: Cell::new(self.ignore_whitespace), comments: RefCell::new(vec![]), stack_group: RefCell::new(vec![]), stack_class: RefCell::new(vec![]), capture_names: RefCell::new(vec![]), scratch: RefCell::new(String::new()), } } /// Set the nesting limit for this parser. /// /// The nesting limit controls how deep the abstract syntax tree is allowed /// to be. If the AST exceeds the given limit (e.g., with too many nested /// groups), then an error is returned by the parser. /// /// The purpose of this limit is to act as a heuristic to prevent stack /// overflow for consumers that do structural induction on an `Ast` using /// explicit recursion. While this crate never does this (instead using /// constant stack space and moving the call stack to the heap), other /// crates may. /// /// This limit is not checked until the entire AST is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since this parser /// implementation will limit itself to heap space proportional to the /// length of the pattern string. /// /// Note that a nest limit of `0` will return a nest limit error for most /// patterns but not all. For example, a nest limit of `0` permits `a` but /// not `ab`, since `ab` requires a concatenation, which results in a nest /// depth of `1`. In general, a nest limit is not something that manifests /// in an obvious way in the concrete syntax, therefore, it should not be /// used in a granular way. pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { self.nest_limit = limit; self } /// Whether to support octal syntax or not. 
/// /// Octal syntax is a little-known way of uttering Unicode codepoints in /// a regular expression. For example, `a`, `\x61`, `\u0061` and /// `\141` are all equivalent regular expressions, where the last example /// shows octal syntax. /// /// While supporting octal syntax isn't in and of itself a problem, it does /// make good error messages harder. That is, in PCRE based regex engines, /// syntax like `\0` invokes a backreference, which is explicitly /// unsupported in Rust's regex engine. However, many users expect it to /// be supported. Therefore, when octal support is disabled, the error /// message will explicitly mention that backreferences aren't supported. /// /// Octal syntax is disabled by default. pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { self.octal = yes; self } /// Enable verbose mode in the regular expression. /// /// When enabled, verbose mode permits insignificant whitespace in many /// places in the regular expression, as well as comments. Comments are /// started using `#` and continue until the end of the line. /// /// By default, this is disabled. It may be selectively enabled in the /// regular expression by using the `x` flag regardless of this setting. pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { self.ignore_whitespace = yes; self } } /// A regular expression parser. /// /// This parses a string representation of a regular expression into an /// abstract syntax tree. The size of the tree is proportional to the length /// of the regular expression pattern. /// /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. #[derive(Clone, Debug)] pub struct Parser { /// The current position of the parser. pos: Cell, /// The current capture index. capture_index: Cell, /// The maximum number of open parens/brackets allowed. If the parser /// exceeds this number, then an error is returned. nest_limit: u32, /// Whether to support octal syntax or not. 
When `false`, the parser will /// return an error helpfully pointing out that backreferences are not /// supported. octal: bool, /// The initial setting for `ignore_whitespace` as provided by /// `ParserBuilder`. It is used when resetting the parser's state. initial_ignore_whitespace: bool, /// Whether whitespace should be ignored. When enabled, comments are /// also permitted. ignore_whitespace: Cell, /// A list of comments, in order of appearance. comments: RefCell>, /// A stack of grouped sub-expressions, including alternations. stack_group: RefCell>, /// A stack of nested character classes. This is only non-empty when /// parsing a class. stack_class: RefCell>, /// A sorted sequence of capture names. This is used to detect duplicate /// capture names and report an error if one is detected. capture_names: RefCell>, /// A scratch buffer used in various places. Mostly this is used to /// accumulate relevant characters from parts of a pattern. scratch: RefCell, } /// ParserI is the internal parser implementation. /// /// We use this separate type so that we can carry the provided pattern string /// along with us. In particular, a `Parser` internal state is not tied to any /// one pattern, but `ParserI` is. /// /// This type also lets us use `ParserI<&Parser>` in production code while /// retaining the convenience of `ParserI` for tests, which sometimes /// work against the internal interface of the parser. #[derive(Clone, Debug)] struct ParserI<'s, P> { /// The parser state/configuration. parser: P, /// The full regular expression provided by the user. pattern: &'s str, } /// GroupState represents a single stack frame while parsing nested groups /// and alternations. Each frame records the state up to an opening parenthesis /// or a alternating bracket `|`. #[derive(Clone, Debug)] enum GroupState { /// This state is pushed whenever an opening group is found. Group { /// The concatenation immediately preceding the opening group. 
concat: ast::Concat, /// The group that has been opened. Its sub-AST is always empty. group: ast::Group, /// Whether this group has the `x` flag enabled or not. ignore_whitespace: bool, }, /// This state is pushed whenever a new alternation branch is found. If /// an alternation branch is found and this state is at the top of the /// stack, then this state should be modified to include the new /// alternation. Alternation(ast::Alternation), } /// ClassState represents a single stack frame while parsing character classes. /// Each frame records the state up to an intersection, difference, symmetric /// difference or nested class. /// /// Note that a parser's character class stack is only non-empty when parsing /// a character class. In all other cases, it is empty. #[derive(Clone, Debug)] enum ClassState { /// This state is pushed whenever an opening bracket is found. Open { /// The union of class items immediately preceding this class. union: ast::ClassSetUnion, /// The class that has been opened. Typically this just corresponds /// to the `[`, but it can also include `[^` since `^` indicates /// negation of the class. set: ast::ClassBracketed, }, /// This state is pushed when a operator is seen. When popped, the stored /// set becomes the left hand side of the operator. Op { /// The type of the operation, i.e., &&, -- or ~~. kind: ast::ClassSetBinaryOpKind, /// The left-hand side of the operator. lhs: ast::ClassSet, }, } impl Parser { /// Create a new parser with a default configuration. /// /// The parser can be run with either the `parse` or `parse_with_comments` /// methods. The parse methods return an abstract syntax tree. /// /// To set configuration options on the parser, use [`ParserBuilder`]. pub fn new() -> Parser { ParserBuilder::new().build() } /// Parse the regular expression into an abstract syntax tree. 
pub fn parse(&mut self, pattern: &str) -> Result { ParserI::new(self, pattern).parse() } /// Parse the regular expression and return an abstract syntax tree with /// all of the comments found in the pattern. pub fn parse_with_comments( &mut self, pattern: &str, ) -> Result { ParserI::new(self, pattern).parse_with_comments() } /// Reset the internal state of a parser. /// /// This is called at the beginning of every parse. This prevents the /// parser from running with inconsistent state (say, if a previous /// invocation returned an error and the parser is reused). fn reset(&self) { // These settings should be in line with the construction // in `ParserBuilder::build`. self.pos.set(Position { offset: 0, line: 1, column: 1 }); self.ignore_whitespace.set(self.initial_ignore_whitespace); self.comments.borrow_mut().clear(); self.stack_group.borrow_mut().clear(); self.stack_class.borrow_mut().clear(); } } impl<'s, P: Borrow> ParserI<'s, P> { /// Build an internal parser from a parser configuration and a pattern. fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> { ParserI { parser, pattern } } /// Return a reference to the parser state. fn parser(&self) -> &Parser { self.parser.borrow() } /// Return a reference to the pattern being parsed. fn pattern(&self) -> &str { self.pattern } /// Create a new error with the given span and error type. fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error { ast::Error { kind, pattern: self.pattern().to_string(), span } } /// Return the current offset of the parser. /// /// The offset starts at `0` from the beginning of the regular expression /// pattern string. fn offset(&self) -> usize { self.parser().pos.get().offset } /// Return the current line number of the parser. /// /// The line number starts at `1`. fn line(&self) -> usize { self.parser().pos.get().line } /// Return the current column of the parser. /// /// The column number starts at `1` and is reset whenever a `\n` is seen. 
fn column(&self) -> usize { self.parser().pos.get().column } /// Return the next capturing index. Each subsequent call increments the /// internal index. /// /// The span given should correspond to the location of the opening /// parenthesis. /// /// If the capture limit is exceeded, then an error is returned. fn next_capture_index(&self, span: Span) -> Result { let current = self.parser().capture_index.get(); let i = current.checked_add(1).ok_or_else(|| { self.error(span, ast::ErrorKind::CaptureLimitExceeded) })?; self.parser().capture_index.set(i); Ok(i) } /// Adds the given capture name to this parser. If this capture name has /// already been used, then an error is returned. fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> { let mut names = self.parser().capture_names.borrow_mut(); match names .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str()) { Err(i) => { names.insert(i, cap.clone()); Ok(()) } Ok(i) => Err(self.error( cap.span, ast::ErrorKind::GroupNameDuplicate { original: names[i].span }, )), } } /// Return whether the parser should ignore whitespace or not. fn ignore_whitespace(&self) -> bool { self.parser().ignore_whitespace.get() } /// Return the character at the current position of the parser. /// /// This panics if the current position does not point to a valid char. fn char(&self) -> char { self.char_at(self.offset()) } /// Return the character at the given position. /// /// This panics if the given position does not point to a valid char. fn char_at(&self, i: usize) -> char { self.pattern()[i..] .chars() .next() .unwrap_or_else(|| panic!("expected char at offset {}", i)) } /// Bump the parser to the next Unicode scalar value. /// /// If the end of the input has been reached, then `false` is returned. 
fn bump(&self) -> bool { if self.is_eof() { return false; } let Position { mut offset, mut line, mut column } = self.pos(); if self.char() == '\n' { line = line.checked_add(1).unwrap(); column = 1; } else { column = column.checked_add(1).unwrap(); } offset += self.char().len_utf8(); self.parser().pos.set(Position { offset, line, column }); self.pattern()[self.offset()..].chars().next().is_some() } /// If the substring starting at the current position of the parser has /// the given prefix, then bump the parser to the character immediately /// following the prefix and return true. Otherwise, don't bump the parser /// and return false. fn bump_if(&self, prefix: &str) -> bool { if self.pattern()[self.offset()..].starts_with(prefix) { for _ in 0..prefix.chars().count() { self.bump(); } true } else { false } } /// Returns true if and only if the parser is positioned at a look-around /// prefix. The conditions under which this returns true must always /// correspond to a regular expression that would otherwise be consider /// invalid. /// /// This should only be called immediately after parsing the opening of /// a group or a set of flags. fn is_lookaround_prefix(&self) -> bool { self.bump_if("?=") || self.bump_if("?!") || self.bump_if("?<=") || self.bump_if("? bool { if !self.bump() { return false; } self.bump_space(); !self.is_eof() } /// If the `x` flag is enabled (i.e., whitespace insensitivity with /// comments), then this will advance the parser through all whitespace /// and comments to the next non-whitespace non-comment byte. /// /// If the `x` flag is disabled, then this is a no-op. /// /// This should be used selectively throughout the parser where /// arbitrary whitespace is permitted when the `x` flag is enabled. For /// example, `{ 5 , 6}` is equivalent to `{5,6}`. 
fn bump_space(&self) { if !self.ignore_whitespace() { return; } while !self.is_eof() { if self.char().is_whitespace() { self.bump(); } else if self.char() == '#' { let start = self.pos(); let mut comment_text = String::new(); self.bump(); while !self.is_eof() { let c = self.char(); self.bump(); if c == '\n' { break; } comment_text.push(c); } let comment = ast::Comment { span: Span::new(start, self.pos()), comment: comment_text, }; self.parser().comments.borrow_mut().push(comment); } else { break; } } } /// Peek at the next character in the input without advancing the parser. /// /// If the input has been exhausted, then this returns `None`. fn peek(&self) -> Option { if self.is_eof() { return None; } self.pattern()[self.offset() + self.char().len_utf8()..].chars().next() } /// Like peek, but will ignore spaces when the parser is in whitespace /// insensitive mode. fn peek_space(&self) -> Option { if !self.ignore_whitespace() { return self.peek(); } if self.is_eof() { return None; } let mut start = self.offset() + self.char().len_utf8(); let mut in_comment = false; for (i, c) in self.pattern()[start..].char_indices() { if c.is_whitespace() { continue; } else if !in_comment && c == '#' { in_comment = true; } else if in_comment && c == '\n' { in_comment = false; } else { start += i; break; } } self.pattern()[start..].chars().next() } /// Returns true if the next call to `bump` would return false. fn is_eof(&self) -> bool { self.offset() == self.pattern().len() } /// Return the current position of the parser, which includes the offset, /// line and column. fn pos(&self) -> Position { self.parser().pos.get() } /// Create a span at the current position of the parser. Both the start /// and end of the span are set. fn span(&self) -> Span { Span::splat(self.pos()) } /// Create a span that covers the current character. 
fn span_char(&self) -> Span { let mut next = Position { offset: self.offset().checked_add(self.char().len_utf8()).unwrap(), line: self.line(), column: self.column().checked_add(1).unwrap(), }; if self.char() == '\n' { next.line += 1; next.column = 1; } Span::new(self.pos(), next) } /// Parse and push a single alternation on to the parser's internal stack. /// If the top of the stack already has an alternation, then add to that /// instead of pushing a new one. /// /// The concatenation given corresponds to a single alternation branch. /// The concatenation returned starts the next branch and is empty. /// /// This assumes the parser is currently positioned at `|` and will advance /// the parser to the character following `|`. #[inline(never)] fn push_alternate(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '|'); concat.span.end = self.pos(); self.push_or_add_alternation(concat); self.bump(); Ok(ast::Concat { span: self.span(), asts: vec![] }) } /// Pushes or adds the given branch of an alternation to the parser's /// internal stack of state. fn push_or_add_alternation(&self, concat: ast::Concat) { use self::GroupState::*; let mut stack = self.parser().stack_group.borrow_mut(); if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() { alts.asts.push(concat.into_ast()); return; } stack.push(Alternation(ast::Alternation { span: Span::new(concat.span.start, self.pos()), asts: vec![concat.into_ast()], })); } /// Parse and push a group AST (and its parent concatenation) on to the /// parser's internal stack. Return a fresh concatenation corresponding /// to the group's sub-AST. /// /// If a set of flags was found (with no group), then the concatenation /// is returned with that set of flags added. /// /// This assumes that the parser is currently positioned on the opening /// parenthesis. It advances the parser to the character at the start /// of the sub-expression (or adjoining expression). 
/// /// If there was a problem parsing the start of the group, then an error /// is returned. #[inline(never)] fn push_group(&self, mut concat: ast::Concat) -> Result { assert_eq!(self.char(), '('); match self.parse_group()? { Either::Left(set) => { let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace); if let Some(v) = ignore { self.parser().ignore_whitespace.set(v); } concat.asts.push(Ast::flags(set)); Ok(concat) } Either::Right(group) => { let old_ignore_whitespace = self.ignore_whitespace(); let new_ignore_whitespace = group .flags() .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace)) .unwrap_or(old_ignore_whitespace); self.parser().stack_group.borrow_mut().push( GroupState::Group { concat, group, ignore_whitespace: old_ignore_whitespace, }, ); self.parser().ignore_whitespace.set(new_ignore_whitespace); Ok(ast::Concat { span: self.span(), asts: vec![] }) } } } /// Pop a group AST from the parser's internal stack and set the group's /// AST to the given concatenation. Return the concatenation containing /// the group. /// /// This assumes that the parser is currently positioned on the closing /// parenthesis and advances the parser to the character following the `)`. /// /// If no such group could be popped, then an unopened group error is /// returned. 
#[inline(never)] fn pop_group(&self, mut group_concat: ast::Concat) -> Result { use self::GroupState::*; assert_eq!(self.char(), ')'); let mut stack = self.parser().stack_group.borrow_mut(); let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack .pop() { Some(Group { concat, group, ignore_whitespace }) => { (concat, group, ignore_whitespace, None) } Some(Alternation(alt)) => match stack.pop() { Some(Group { concat, group, ignore_whitespace }) => { (concat, group, ignore_whitespace, Some(alt)) } None | Some(Alternation(_)) => { return Err(self.error( self.span_char(), ast::ErrorKind::GroupUnopened, )); } }, None => { return Err(self .error(self.span_char(), ast::ErrorKind::GroupUnopened)); } }; self.parser().ignore_whitespace.set(ignore_whitespace); group_concat.span.end = self.pos(); self.bump(); group.span.end = self.pos(); match alt { Some(mut alt) => { alt.span.end = group_concat.span.end; alt.asts.push(group_concat.into_ast()); group.ast = Box::new(alt.into_ast()); } None => { group.ast = Box::new(group_concat.into_ast()); } } prior_concat.asts.push(Ast::group(group)); Ok(prior_concat) } /// Pop the last state from the parser's internal stack, if it exists, and /// add the given concatenation to it. There either must be no state or a /// single alternation item on the stack. Any other scenario produces an /// error. /// /// This assumes that the parser has advanced to the end. #[inline(never)] fn pop_group_end(&self, mut concat: ast::Concat) -> Result { concat.span.end = self.pos(); let mut stack = self.parser().stack_group.borrow_mut(); let ast = match stack.pop() { None => Ok(concat.into_ast()), Some(GroupState::Alternation(mut alt)) => { alt.span.end = self.pos(); alt.asts.push(concat.into_ast()); Ok(Ast::alternation(alt)) } Some(GroupState::Group { group, .. }) => { return Err( self.error(group.span, ast::ErrorKind::GroupUnclosed) ); } }; // If we try to pop again, there should be nothing. 
match stack.pop() { None => ast, Some(GroupState::Alternation(_)) => { // This unreachable is unfortunate. This case can't happen // because the only way we can be here is if there were two // `GroupState::Alternation`s adjacent in the parser's stack, // which we guarantee to never happen because we never push a // `GroupState::Alternation` if one is already at the top of // the stack. unreachable!() } Some(GroupState::Group { group, .. }) => { Err(self.error(group.span, ast::ErrorKind::GroupUnclosed)) } } } /// Parse the opening of a character class and push the current class /// parsing context onto the parser's stack. This assumes that the parser /// is positioned at an opening `[`. The given union should correspond to /// the union of set items built up before seeing the `[`. /// /// If there was a problem parsing the opening of the class, then an error /// is returned. Otherwise, a new union of set items for the class is /// returned (which may be populated with either a `]` or a `-`). #[inline(never)] fn push_class_open( &self, parent_union: ast::ClassSetUnion, ) -> Result { assert_eq!(self.char(), '['); let (nested_set, nested_union) = self.parse_set_class_open()?; self.parser() .stack_class .borrow_mut() .push(ClassState::Open { union: parent_union, set: nested_set }); Ok(nested_union) } /// Parse the end of a character class set and pop the character class /// parser stack. The union given corresponds to the last union built /// before seeing the closing `]`. The union returned corresponds to the /// parent character class set with the nested class added to it. /// /// This assumes that the parser is positioned at a `]` and will advance /// the parser to the byte immediately following the `]`. /// /// If the stack is empty after popping, then this returns the final /// "top-level" character class AST (where a "top-level" character class /// is one that is not nested inside any other character class). 
/// /// If there is no corresponding opening bracket on the parser's stack, /// then an error is returned. #[inline(never)] fn pop_class( &self, nested_union: ast::ClassSetUnion, ) -> Result> { assert_eq!(self.char(), ']'); let item = ast::ClassSet::Item(nested_union.into_item()); let prevset = self.pop_class_op(item); let mut stack = self.parser().stack_class.borrow_mut(); match stack.pop() { None => { // We can never observe an empty stack: // // 1) We are guaranteed to start with a non-empty stack since // the character class parser is only initiated when it sees // a `[`. // 2) If we ever observe an empty stack while popping after // seeing a `]`, then we signal the character class parser // to terminate. panic!("unexpected empty character class stack") } Some(ClassState::Op { .. }) => { // This panic is unfortunate, but this case is impossible // since we already popped the Op state if one exists above. // Namely, every push to the class parser stack is guarded by // whether an existing Op is already on the top of the stack. // If it is, the existing Op is modified. That is, the stack // can never have consecutive Op states. panic!("unexpected ClassState::Op") } Some(ClassState::Open { mut union, mut set }) => { self.bump(); set.span.end = self.pos(); set.kind = prevset; if stack.is_empty() { Ok(Either::Right(set)) } else { union.push(ast::ClassSetItem::Bracketed(Box::new(set))); Ok(Either::Left(union)) } } } } /// Return an "unclosed class" error whose span points to the most /// recently opened class. /// /// This should only be called while parsing a character class. #[inline(never)] fn unclosed_class_error(&self) -> ast::Error { for state in self.parser().stack_class.borrow().iter().rev() { if let ClassState::Open { ref set, .. } = *state { return self.error(set.span, ast::ErrorKind::ClassUnclosed); } } // We are guaranteed to have a non-empty stack with at least // one open bracket, so we should never get here. 
panic!("no open character class found") } /// Push the current set of class items on to the class parser's stack as /// the left hand side of the given operator. /// /// A fresh set union is returned, which should be used to build the right /// hand side of this operator. #[inline(never)] fn push_class_op( &self, next_kind: ast::ClassSetBinaryOpKind, next_union: ast::ClassSetUnion, ) -> ast::ClassSetUnion { let item = ast::ClassSet::Item(next_union.into_item()); let new_lhs = self.pop_class_op(item); self.parser() .stack_class .borrow_mut() .push(ClassState::Op { kind: next_kind, lhs: new_lhs }); ast::ClassSetUnion { span: self.span(), items: vec![] } } /// Pop a character class set from the character class parser stack. If the /// top of the stack is just an item (not an operation), then return the /// given set unchanged. If the top of the stack is an operation, then the /// given set will be used as the rhs of the operation on the top of the /// stack. In that case, the binary operation is returned as a set. #[inline(never)] fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet { let mut stack = self.parser().stack_class.borrow_mut(); let (kind, lhs) = match stack.pop() { Some(ClassState::Op { kind, lhs }) => (kind, lhs), Some(state @ ClassState::Open { .. }) => { stack.push(state); return rhs; } None => unreachable!(), }; let span = Span::new(lhs.span().start, rhs.span().end); ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { span, kind, lhs: Box::new(lhs), rhs: Box::new(rhs), }) } } impl<'s, P: Borrow> ParserI<'s, P> { /// Parse the regular expression into an abstract syntax tree. fn parse(&self) -> Result { self.parse_with_comments().map(|astc| astc.ast) } /// Parse the regular expression and return an abstract syntax tree with /// all of the comments found in the pattern. 
    fn parse_with_comments(&self) -> Result<ast::WithComments> {
        assert_eq!(self.offset(), 0, "parser can only be used once");
        self.parser().reset();
        let mut concat = ast::Concat { span: self.span(), asts: vec![] };
        // Main dispatch loop: each iteration consumes one syntactic element
        // (group boundary, alternation, class, repetition or primitive).
        loop {
            self.bump_space();
            if self.is_eof() {
                break;
            }
            match self.char() {
                '(' => concat = self.push_group(concat)?,
                ')' => concat = self.pop_group(concat)?,
                '|' => concat = self.push_alternate(concat)?,
                '[' => {
                    let class = self.parse_set_class()?;
                    concat.asts.push(Ast::class_bracketed(class));
                }
                '?' => {
                    concat = self.parse_uncounted_repetition(
                        concat,
                        ast::RepetitionKind::ZeroOrOne,
                    )?;
                }
                '*' => {
                    concat = self.parse_uncounted_repetition(
                        concat,
                        ast::RepetitionKind::ZeroOrMore,
                    )?;
                }
                '+' => {
                    concat = self.parse_uncounted_repetition(
                        concat,
                        ast::RepetitionKind::OneOrMore,
                    )?;
                }
                '{' => {
                    concat = self.parse_counted_repetition(concat)?;
                }
                _ => concat.asts.push(self.parse_primitive()?.into_ast()),
            }
        }
        // Close the implicit top-level group; this also reports unclosed
        // groups. Then enforce the configured nesting limit on the result.
        let ast = self.pop_group_end(concat)?;
        NestLimiter::new(self).check(&ast)?;
        Ok(ast::WithComments {
            ast,
            // Hand ownership of accumulated comments to the caller while
            // leaving the parser's buffer empty for reuse.
            comments: mem::replace(
                &mut *self.parser().comments.borrow_mut(),
                vec![],
            ),
        })
    }

    /// Parses an uncounted repetition operation. An uncounted repetition
    /// operator includes ?, * and +, but does not include the {m,n} syntax.
    /// The given `kind` should correspond to the operator observed by the
    /// caller.
    ///
    /// This assumes that the parser is currently positioned at the repetition
    /// operator and advances the parser to the first character after the
    /// operator. (Note that the operator may include a single additional `?`,
    /// which makes the operator ungreedy.)
    ///
    /// The caller should include the concatenation that is being built. The
    /// concatenation returned includes the repetition operator applied to the
    /// last expression in the given concatenation.
    #[inline(never)]
    fn parse_uncounted_repetition(
        &self,
        mut concat: ast::Concat,
        kind: ast::RepetitionKind,
    ) -> Result<ast::Concat> {
        assert!(
            self.char() == '?' || self.char() == '*' || self.char() == '+'
        );
        let op_start = self.pos();
        // The repetition applies to the last expression parsed, so pop it off
        // the concatenation. No prior expression means the operator dangles.
        let ast = match concat.asts.pop() {
            Some(ast) => ast,
            None => {
                return Err(
                    self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                )
            }
        };
        // An empty expression or a flag setting (e.g., `(?i)`) is not a valid
        // repetition operand.
        match ast {
            Ast::Empty(_) | Ast::Flags(_) => {
                return Err(
                    self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                )
            }
            _ => {}
        }
        // A trailing `?` makes the operator lazy (non-greedy).
        let mut greedy = true;
        if self.bump() && self.char() == '?' {
            greedy = false;
            self.bump();
        }
        concat.asts.push(Ast::repetition(ast::Repetition {
            span: ast.span().with_end(self.pos()),
            op: ast::RepetitionOp {
                span: Span::new(op_start, self.pos()),
                kind,
            },
            greedy,
            ast: Box::new(ast),
        }));
        Ok(concat)
    }

    /// Parses a counted repetition operation. A counted repetition operator
    /// corresponds to the {m,n} syntax, and does not include the ?, * or +
    /// operators.
    ///
    /// This assumes that the parser is currently positioned at the opening `{`
    /// and advances the parser to the first character after the operator.
    /// (Note that the operator may include a single additional `?`, which
    /// makes the operator ungreedy.)
    ///
    /// The caller should include the concatenation that is being built. The
    /// concatenation returned includes the repetition operator applied to the
    /// last expression in the given concatenation.
    #[inline(never)]
    fn parse_counted_repetition(
        &self,
        mut concat: ast::Concat,
    ) -> Result<ast::Concat> {
        assert!(self.char() == '{');

        let start = self.pos();
        // The repetition applies to the last expression parsed, so pop it off
        // the concatenation. No prior expression means the operator dangles.
        let ast = match concat.asts.pop() {
            Some(ast) => ast,
            None => {
                return Err(
                    self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                )
            }
        };
        // An empty expression or a flag setting (e.g., `(?i)`) is not a valid
        // repetition operand.
        match ast {
            Ast::Empty(_) | Ast::Flags(_) => {
                return Err(
                    self.error(self.span(), ast::ErrorKind::RepetitionMissing)
                )
            }
            _ => {}
        }
        if !self.bump_and_bump_space() {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }
        // `specialize_err` rewrites a generic "empty decimal" error into the
        // repetition-count-specific variant.
        let count_start = specialize_err(
            self.parse_decimal(),
            ast::ErrorKind::DecimalEmpty,
            ast::ErrorKind::RepetitionCountDecimalEmpty,
        )?;
        // Default to `{m}` (exact count); refined below if a `,` follows.
        let mut range = ast::RepetitionRange::Exactly(count_start);
        if self.is_eof() {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }
        if self.char() == ',' {
            if !self.bump_and_bump_space() {
                return Err(self.error(
                    Span::new(start, self.pos()),
                    ast::ErrorKind::RepetitionCountUnclosed,
                ));
            }
            if self.char() != '}' {
                // `{m,n}`: a bounded range.
                let count_end = specialize_err(
                    self.parse_decimal(),
                    ast::ErrorKind::DecimalEmpty,
                    ast::ErrorKind::RepetitionCountDecimalEmpty,
                )?;
                range = ast::RepetitionRange::Bounded(count_start, count_end);
            } else {
                // `{m,}`: no upper bound.
                range = ast::RepetitionRange::AtLeast(count_start);
            }
        }
        if self.is_eof() || self.char() != '}' {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::RepetitionCountUnclosed,
            ));
        }
        // A trailing `?` makes the operator lazy (non-greedy).
        let mut greedy = true;
        if self.bump_and_bump_space() && self.char() == '?' {
            greedy = false;
            self.bump();
        }

        let op_span = Span::new(start, self.pos());
        // Rejects ranges where m > n.
        if !range.is_valid() {
            return Err(
                self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
            );
        }
        concat.asts.push(Ast::repetition(ast::Repetition {
            span: ast.span().with_end(self.pos()),
            op: ast::RepetitionOp {
                span: op_span,
                kind: ast::RepetitionKind::Range(range),
            },
            greedy,
            ast: Box::new(ast),
        }));
        Ok(concat)
    }

    /// Parse a group (which contains a sub-expression) or a set of flags.
    ///
    /// If a group was found, then it is returned with an empty AST. If a set
    /// of flags is found, then that set is returned.
    ///
    /// The parser should be positioned at the opening parenthesis.
    ///
    /// This advances the parser to the character before the start of the
    /// sub-expression (in the case of a group) or to the closing parenthesis
    /// immediately following the set of flags.
    ///
    /// # Errors
    ///
    /// If flags are given and incorrectly specified, then a corresponding
    /// error is returned.
    ///
    /// If a capture name is given and it is incorrectly specified, then a
    /// corresponding error is returned.
    #[inline(never)]
    fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
        assert_eq!(self.char(), '(');
        let open_span = self.span_char();
        self.bump();
        self.bump_space();
        // Look-around is recognized but deliberately unsupported: report a
        // targeted error rather than misparsing it as a group.
        if self.is_lookaround_prefix() {
            return Err(self.error(
                Span::new(open_span.start, self.span().end),
                ast::ErrorKind::UnsupportedLookAround,
            ));
        }
        let inner_span = self.span();
        // Both `(?P<name>` and `(?<name>` denote a named capture; the flag
        // records which spelling was used so the AST can round-trip.
        let mut starts_with_p = true;
        if self.bump_if("?P<") || {
            starts_with_p = false;
            self.bump_if("?<")
        } {
            let capture_index = self.next_capture_index(open_span)?;
            let name = self.parse_capture_name(capture_index)?;
            Ok(Either::Right(ast::Group {
                span: open_span,
                kind: ast::GroupKind::CaptureName { starts_with_p, name },
                ast: Box::new(Ast::empty(self.span())),
            }))
        } else if self.bump_if("?") {
            // `(?...`: either a flag-setting directive `(?flags)` or a
            // non-capturing group `(?flags:...)`.
            if self.is_eof() {
                return Err(
                    self.error(open_span, ast::ErrorKind::GroupUnclosed)
                );
            }
            let flags = self.parse_flags()?;
            // parse_flags guarantees the next char is either `:` or `)`.
            let char_end = self.char();
            self.bump();
            if char_end == ')' {
                // We don't allow empty flags, e.g., `(?)`. We instead
                // interpret it as a repetition operator missing its argument.
                if flags.items.is_empty() {
                    return Err(self.error(
                        inner_span,
                        ast::ErrorKind::RepetitionMissing,
                    ));
                }
                Ok(Either::Left(ast::SetFlags {
                    span: Span { end: self.pos(), ..open_span },
                    flags,
                }))
            } else {
                assert_eq!(char_end, ':');
                Ok(Either::Right(ast::Group {
                    span: open_span,
                    kind: ast::GroupKind::NonCapturing(flags),
                    ast: Box::new(Ast::empty(self.span())),
                }))
            }
        } else {
            // Plain `(`: an ordinary capturing group by index.
            let capture_index = self.next_capture_index(open_span)?;
            Ok(Either::Right(ast::Group {
                span: open_span,
                kind: ast::GroupKind::CaptureIndex(capture_index),
                ast: Box::new(Ast::empty(self.span())),
            }))
        }
    }

    /// Parses a capture group name. Assumes that the parser is positioned at
    /// the first character in the name following the opening `<` (and may
    /// possibly be EOF). This advances the parser to the first character
    /// following the closing `>`.
    ///
    /// The caller must provide the capture index of the group for this name.
    #[inline(never)]
    fn parse_capture_name(
        &self,
        capture_index: u32,
    ) -> Result<ast::CaptureName> {
        if self.is_eof() {
            return Err(self
                .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
        }
        let start = self.pos();
        loop {
            if self.char() == '>' {
                break;
            }
            // The first character of a name has stricter rules (e.g., it may
            // not be a digit), hence the `self.pos() == start` flag.
            if !is_capture_char(self.char(), self.pos() == start) {
                return Err(self.error(
                    self.span_char(),
                    ast::ErrorKind::GroupNameInvalid,
                ));
            }
            if !self.bump() {
                break;
            }
        }
        let end = self.pos();
        if self.is_eof() {
            return Err(self
                .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
        }
        assert_eq!(self.char(), '>');
        self.bump();
        let name = &self.pattern()[start.offset..end.offset];
        if name.is_empty() {
            return Err(self.error(
                Span::new(start, start),
                ast::ErrorKind::GroupNameEmpty,
            ));
        }
        let capname = ast::CaptureName {
            span: Span::new(start, end),
            name: name.to_string(),
            index: capture_index,
        };
        // Registers the name and rejects duplicates.
        self.add_capture_name(&capname)?;
        Ok(capname)
    }

    /// Parse a sequence of flags starting at the current character.
    ///
    /// This advances the parser to the character immediately following the
    /// flags, which is guaranteed to be either `:` or `)`.
    ///
    /// # Errors
    ///
    /// If any flags are duplicated, then an error is returned.
    ///
    /// If the negation operator is used more than once, then an error is
    /// returned.
    ///
    /// If no flags could be found or if the negation operation is not followed
    /// by any flags, then an error is returned.
    #[inline(never)]
    fn parse_flags(&self) -> Result<ast::Flags> {
        let mut flags = ast::Flags { span: self.span(), items: vec![] };
        // Tracks whether the most recent item was `-`, so a dangling
        // negation (e.g., `(?i-)`) can be reported with its span.
        let mut last_was_negation = None;
        while self.char() != ':' && self.char() != ')' {
            if self.char() == '-' {
                last_was_negation = Some(self.span_char());
                let item = ast::FlagsItem {
                    span: self.span_char(),
                    kind: ast::FlagsItemKind::Negation,
                };
                // `add_item` returns the index of a conflicting earlier item,
                // if any; here that means a second `-`.
                if let Some(i) = flags.add_item(item) {
                    return Err(self.error(
                        self.span_char(),
                        ast::ErrorKind::FlagRepeatedNegation {
                            original: flags.items[i].span,
                        },
                    ));
                }
            } else {
                last_was_negation = None;
                let item = ast::FlagsItem {
                    span: self.span_char(),
                    kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
                };
                if let Some(i) = flags.add_item(item) {
                    return Err(self.error(
                        self.span_char(),
                        ast::ErrorKind::FlagDuplicate {
                            original: flags.items[i].span,
                        },
                    ));
                }
            }
            if !self.bump() {
                return Err(
                    self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
                );
            }
        }
        if let Some(span) = last_was_negation {
            return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
        }
        flags.span.end = self.pos();
        Ok(flags)
    }

    /// Parse the current character as a flag. Do not advance the parser.
    ///
    /// # Errors
    ///
    /// If the flag is not recognized, then an error is returned.
    #[inline(never)]
    fn parse_flag(&self) -> Result<ast::Flag> {
        match self.char() {
            'i' => Ok(ast::Flag::CaseInsensitive),
            'm' => Ok(ast::Flag::MultiLine),
            's' => Ok(ast::Flag::DotMatchesNewLine),
            'U' => Ok(ast::Flag::SwapGreed),
            'u' => Ok(ast::Flag::Unicode),
            'R' => Ok(ast::Flag::CRLF),
            'x' => Ok(ast::Flag::IgnoreWhitespace),
            _ => {
                Err(self
                    .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
            }
        }
    }

    /// Parse a primitive AST. e.g., A literal, non-set character class or
    /// assertion.
    ///
    /// This assumes that the parser expects a primitive at the current
    /// location. i.e., All other non-primitive cases have been handled.
    /// For example, if the parser's position is at `|`, then `|` will be
    /// treated as a literal (e.g., inside a character class).
    ///
    /// This advances the parser to the first character immediately following
    /// the primitive.
    fn parse_primitive(&self) -> Result<Primitive> {
        match self.char() {
            '\\' => self.parse_escape(),
            '.' => {
                let ast = Primitive::Dot(self.span_char());
                self.bump();
                Ok(ast)
            }
            '^' => {
                let ast = Primitive::Assertion(ast::Assertion {
                    span: self.span_char(),
                    kind: ast::AssertionKind::StartLine,
                });
                self.bump();
                Ok(ast)
            }
            '$' => {
                let ast = Primitive::Assertion(ast::Assertion {
                    span: self.span_char(),
                    kind: ast::AssertionKind::EndLine,
                });
                self.bump();
                Ok(ast)
            }
            c => {
                // Anything else is taken verbatim as a literal character.
                let ast = Primitive::Literal(ast::Literal {
                    span: self.span_char(),
                    kind: ast::LiteralKind::Verbatim,
                    c,
                });
                self.bump();
                Ok(ast)
            }
        }
    }

    /// Parse an escape sequence as a primitive AST.
    ///
    /// This assumes the parser is positioned at the start of the escape
    /// sequence, i.e., `\`. It advances the parser to the first position
    /// immediately following the escape sequence.
    #[inline(never)]
    fn parse_escape(&self) -> Result<Primitive> {
        assert_eq!(self.char(), '\\');
        let start = self.pos();
        if !self.bump() {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::EscapeUnexpectedEof,
            ));
        }
        let c = self.char();
        // Put some of the more complicated routines into helpers.
        match c {
            '0'..='7' => {
                // `\1`-style escapes are backreferences (unsupported) unless
                // octal escapes are explicitly enabled on the parser.
                if !self.parser().octal {
                    return Err(self.error(
                        Span::new(start, self.span_char().end),
                        ast::ErrorKind::UnsupportedBackreference,
                    ));
                }
                let mut lit = self.parse_octal();
                lit.span.start = start;
                return Ok(Primitive::Literal(lit));
            }
            '8'..='9' if !self.parser().octal => {
                return Err(self.error(
                    Span::new(start, self.span_char().end),
                    ast::ErrorKind::UnsupportedBackreference,
                ));
            }
            'x' | 'u' | 'U' => {
                let mut lit = self.parse_hex()?;
                lit.span.start = start;
                return Ok(Primitive::Literal(lit));
            }
            'p' | 'P' => {
                let mut cls = self.parse_unicode_class()?;
                cls.span.start = start;
                return Ok(Primitive::Unicode(cls));
            }
            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
                let mut cls = self.parse_perl_class();
                cls.span.start = start;
                return Ok(Primitive::Perl(cls));
            }
            _ => {}
        }

        // Handle all of the one letter sequences inline.
        self.bump();
        let span = Span::new(start, self.pos());
        // An escaped meta character (e.g., `\*`) is itself, literally.
        if is_meta_character(c) {
            return Ok(Primitive::Literal(ast::Literal {
                span,
                kind: ast::LiteralKind::Meta,
                c,
            }));
        }
        // A "superfluous" escape is one that is allowed but unnecessary.
        if is_escapeable_character(c) {
            return Ok(Primitive::Literal(ast::Literal {
                span,
                kind: ast::LiteralKind::Superfluous,
                c,
            }));
        }
        let special = |kind, c| {
            Ok(Primitive::Literal(ast::Literal {
                span,
                kind: ast::LiteralKind::Special(kind),
                c,
            }))
        };
        match c {
            'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
            'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
            't' => special(ast::SpecialLiteralKind::Tab, '\t'),
            'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
            'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
            'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
            'A' => Ok(Primitive::Assertion(ast::Assertion {
                span,
                kind: ast::AssertionKind::StartText,
            })),
            'z' => Ok(Primitive::Assertion(ast::Assertion {
                span,
                kind: ast::AssertionKind::EndText,
            })),
            'b' => {
                let mut wb = ast::Assertion {
                    span,
                    kind: ast::AssertionKind::WordBoundary,
                };
                // After a \b, we "try" to parse things like \b{start} for
                // special word boundary assertions.
                if !self.is_eof() && self.char() == '{' {
                    if let Some(kind) =
                        self.maybe_parse_special_word_boundary(start)?
                    {
                        wb.kind = kind;
                        wb.span.end = self.pos();
                    }
                }
                Ok(Primitive::Assertion(wb))
            }
            'B' => Ok(Primitive::Assertion(ast::Assertion {
                span,
                kind: ast::AssertionKind::NotWordBoundary,
            })),
            '<' => Ok(Primitive::Assertion(ast::Assertion {
                span,
                kind: ast::AssertionKind::WordBoundaryStartAngle,
            })),
            '>' => Ok(Primitive::Assertion(ast::Assertion {
                span,
                kind: ast::AssertionKind::WordBoundaryEndAngle,
            })),
            _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
        }
    }

    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
    ///
    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
    /// if it fails it will just return `None` with no error. This is done
    /// because `\b{5}` is a valid expression and we want to let that be parsed
    /// by the existing counted repetition parsing code. (I thought about just
    /// invoking the counted repetition code from here, but it seemed a little
    /// ham-fisted.)
    ///
    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
    /// Namely, if we definitely know it isn't a counted repetition, then we
    /// return an error specific to the specialty word boundaries.
    ///
    /// This assumes the parser is positioned at a `{` immediately following
    /// a `\b`. When `None` is returned, the parser is returned to the position
    /// at which it started: pointing at a `{`.
    ///
    /// The position given should correspond to the start of the `\b`.
    fn maybe_parse_special_word_boundary(
        &self,
        wb_start: Position,
    ) -> Result<Option<ast::AssertionKind>> {
        assert_eq!(self.char(), '{');

        let is_valid_char = |c| match c {
            'A'..='Z' | 'a'..='z' | '-' => true,
            _ => false,
        };
        let start = self.pos();
        if !self.bump_and_bump_space() {
            return Err(self.error(
                Span::new(wb_start, self.pos()),
                ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof,
            ));
        }
        let start_contents = self.pos();
        // This is one of the critical bits: if the first non-whitespace
        // character isn't in [-A-Za-z] (i.e., this can't be a special word
        // boundary), then we bail and let the counted repetition parser deal
        // with this.
        if !is_valid_char(self.char()) {
            // Rewind so the caller sees the `{` again.
            self.parser().pos.set(start);
            return Ok(None);
        }

        // Now collect up our chars until we see a '}'.
        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();
        while !self.is_eof() && is_valid_char(self.char()) {
            scratch.push(self.char());
            self.bump_and_bump_space();
        }
        if self.is_eof() || self.char() != '}' {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::SpecialWordBoundaryUnclosed,
            ));
        }
        let end = self.pos();
        self.bump();
        let kind = match scratch.as_str() {
            "start" => ast::AssertionKind::WordBoundaryStart,
            "end" => ast::AssertionKind::WordBoundaryEnd,
            "start-half" => ast::AssertionKind::WordBoundaryStartHalf,
            "end-half" => ast::AssertionKind::WordBoundaryEndHalf,
            _ => {
                // Looked like a special word boundary (alphabetic contents)
                // but the name is unknown: a hard error, not a repetition.
                return Err(self.error(
                    Span::new(start_contents, end),
                    ast::ErrorKind::SpecialWordBoundaryUnrecognized,
                ))
            }
        };
        Ok(Some(kind))
    }

    /// Parse an octal representation of a Unicode codepoint up to 3 digits
    /// long. This expects the parser to be positioned at the first octal
    /// digit and advances the parser to the first character immediately
    /// following the octal number. This also assumes that parsing octal
    /// escapes is enabled.
    ///
    /// Assuming the preconditions are met, this routine can never fail.
    #[inline(never)]
    fn parse_octal(&self) -> ast::Literal {
        assert!(self.parser().octal);
        assert!('0' <= self.char() && self.char() <= '7');
        let start = self.pos();
        // Parse up to two more digits.
        while self.bump()
            && '0' <= self.char()
            && self.char() <= '7'
            && self.pos().offset - start.offset <= 2
        {}
        let end = self.pos();
        let octal = &self.pattern()[start.offset..end.offset];
        // Parsing the octal should never fail since the above guarantees a
        // valid number.
        let codepoint =
            u32::from_str_radix(octal, 8).expect("valid octal number");
        // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
        // invalid Unicode scalar values.
        let c = char::from_u32(codepoint).expect("Unicode scalar value");
        ast::Literal {
            span: Span::new(start, end),
            kind: ast::LiteralKind::Octal,
            c,
        }
    }

    /// Parse a hex representation of a Unicode codepoint. This handles both
    /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
    /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
    /// the first character immediately following the hexadecimal literal.
    #[inline(never)]
    fn parse_hex(&self) -> Result<ast::Literal> {
        assert!(
            self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
        );

        // The prefix determines how many digits the fixed-width form takes:
        // \xNN, \uNNNN or \UNNNNNNNN.
        let hex_kind = match self.char() {
            'x' => ast::HexLiteralKind::X,
            'u' => ast::HexLiteralKind::UnicodeShort,
            _ => ast::HexLiteralKind::UnicodeLong,
        };
        if !self.bump_and_bump_space() {
            return Err(
                self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
            );
        }
        if self.char() == '{' {
            self.parse_hex_brace(hex_kind)
        } else {
            self.parse_hex_digits(hex_kind)
        }
    }

    /// Parse an N-digit hex representation of a Unicode codepoint. This
    /// expects the parser to be positioned at the first digit and will advance
    /// the parser to the first character immediately following the escape
    /// sequence.
    ///
    /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
    /// or 8 (for `\UNNNNNNNN`).
    #[inline(never)]
    fn parse_hex_digits(
        &self,
        kind: ast::HexLiteralKind,
    ) -> Result<ast::Literal> {
        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();

        let start = self.pos();
        // Consume exactly `kind.digits()` hex digits; anything short or
        // non-hex is an error.
        for i in 0..kind.digits() {
            if i > 0 && !self.bump_and_bump_space() {
                return Err(self
                    .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
            }
            if !is_hex(self.char()) {
                return Err(self.error(
                    self.span_char(),
                    ast::ErrorKind::EscapeHexInvalidDigit,
                ));
            }
            scratch.push(self.char());
        }
        // The final bump just moves the parser past the literal, which may
        // be EOF.
        self.bump_and_bump_space();
        let end = self.pos();
        let hex = scratch.as_str();
        // `from_u32` rejects surrogates and out-of-range codepoints.
        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
            None => Err(self.error(
                Span::new(start, end),
                ast::ErrorKind::EscapeHexInvalid,
            )),
            Some(c) => Ok(ast::Literal {
                span: Span::new(start, end),
                kind: ast::LiteralKind::HexFixed(kind),
                c,
            }),
        }
    }

    /// Parse a hex representation of any Unicode scalar value. This expects
    /// the parser to be positioned at the opening brace `{` and will advance
    /// the parser to the first character following the closing brace `}`.
    #[inline(never)]
    fn parse_hex_brace(
        &self,
        kind: ast::HexLiteralKind,
    ) -> Result<ast::Literal> {
        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();

        let brace_pos = self.pos();
        let start = self.span_char().end;
        // Collect hex digits up to the closing `}`.
        while self.bump_and_bump_space() && self.char() != '}' {
            if !is_hex(self.char()) {
                return Err(self.error(
                    self.span_char(),
                    ast::ErrorKind::EscapeHexInvalidDigit,
                ));
            }
            scratch.push(self.char());
        }
        if self.is_eof() {
            return Err(self.error(
                Span::new(brace_pos, self.pos()),
                ast::ErrorKind::EscapeUnexpectedEof,
            ));
        }
        let end = self.pos();
        let hex = scratch.as_str();
        assert_eq!(self.char(), '}');
        self.bump_and_bump_space();

        // `\x{}` with nothing between the braces is an error.
        if hex.is_empty() {
            return Err(self.error(
                Span::new(brace_pos, self.pos()),
                ast::ErrorKind::EscapeHexEmpty,
            ));
        }
        // `from_u32` rejects surrogates and out-of-range codepoints.
        match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
            None => Err(self.error(
                Span::new(start, end),
                ast::ErrorKind::EscapeHexInvalid,
            )),
            Some(c) => Ok(ast::Literal {
                span: Span::new(start, self.pos()),
                kind: ast::LiteralKind::HexBrace(kind),
                c,
            }),
        }
    }

    /// Parse a decimal number into a u32 while trimming leading and trailing
    /// whitespace.
    ///
    /// This expects the parser to be positioned at the first position where
    /// a decimal digit could occur. This will advance the parser to the byte
    /// immediately following the last contiguous decimal digit.
    ///
    /// If no decimal digit could be found or if there was a problem parsing
    /// the complete set of digits into a u32, then an error is returned.
    fn parse_decimal(&self) -> Result<u32> {
        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();

        // Skip leading whitespace before the digits.
        while !self.is_eof() && self.char().is_whitespace() {
            self.bump();
        }
        let start = self.pos();
        while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
            scratch.push(self.char());
            self.bump_and_bump_space();
        }
        let span = Span::new(start, self.pos());
        // Skip trailing whitespace after the digits.
        while !self.is_eof() && self.char().is_whitespace() {
            self.bump_and_bump_space();
        }
        let digits = scratch.as_str();
        if digits.is_empty() {
            return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
        }
        // Overflowing u32 yields DecimalInvalid rather than a panic.
        match u32::from_str_radix(digits, 10).ok() {
            Some(n) => Ok(n),
            None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
        }
    }

    /// Parse a standard character class consisting primarily of characters or
    /// character ranges, but can also contain nested character classes of
    /// any type (sans `.`).
    ///
    /// This assumes the parser is positioned at the opening `[`. If parsing
    /// is successful, then the parser is advanced to the position immediately
    /// following the closing `]`.
    #[inline(never)]
    fn parse_set_class(&self) -> Result<ast::ClassBracketed> {
        assert_eq!(self.char(), '[');

        let mut union =
            ast::ClassSetUnion { span: self.span(), items: vec![] };
        loop {
            self.bump_space();
            if self.is_eof() {
                return Err(self.unclosed_class_error());
            }
            match self.char() {
                '[' => {
                    // If we've already parsed the opening bracket, then
                    // attempt to treat this as the beginning of an ASCII
                    // class. If ASCII class parsing fails, then the parser
                    // backs up to `[`.
                    if !self.parser().stack_class.borrow().is_empty() {
                        if let Some(cls) = self.maybe_parse_ascii_class() {
                            union.push(ast::ClassSetItem::Ascii(cls));
                            continue;
                        }
                    }
                    union = self.push_class_open(union)?;
                }
                ']' => match self.pop_class(union)? {
                    Either::Left(nested_union) => {
                        // A nested class closed; continue in the enclosing
                        // class's union.
                        union = nested_union;
                    }
                    Either::Right(class) => return Ok(class),
                },
                '&' if self.peek() == Some('&') => {
                    assert!(self.bump_if("&&"));
                    union = self.push_class_op(
                        ast::ClassSetBinaryOpKind::Intersection,
                        union,
                    );
                }
                '-' if self.peek() == Some('-') => {
                    assert!(self.bump_if("--"));
                    union = self.push_class_op(
                        ast::ClassSetBinaryOpKind::Difference,
                        union,
                    );
                }
                '~' if self.peek() == Some('~') => {
                    assert!(self.bump_if("~~"));
                    union = self.push_class_op(
                        ast::ClassSetBinaryOpKind::SymmetricDifference,
                        union,
                    );
                }
                _ => {
                    union.push(self.parse_set_class_range()?);
                }
            }
        }
    }

    /// Parse a single primitive item in a character class set. The item to
    /// be parsed can either be one of a simple literal character, a range
    /// between two simple literal characters or a "primitive" character
    /// class like \w or \p{Greek}.
    ///
    /// If an invalid escape is found, or if a character class is found where
    /// a simple literal is expected (e.g., in a range), then an error is
    /// returned.
    #[inline(never)]
    fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
        let prim1 = self.parse_set_class_item()?;
        self.bump_space();
        if self.is_eof() {
            return Err(self.unclosed_class_error());
        }
        // If the next char isn't a `-`, then we don't have a range.
        // There are two exceptions. If the char after a `-` is a `]`, then
        // `-` is interpreted as a literal `-`. Alternatively, if the char
        // after a `-` is a `-`, then `--` corresponds to a "difference"
        // operation.
        if self.char() != '-'
            || self.peek_space() == Some(']')
            || self.peek_space() == Some('-')
        {
            return prim1.into_class_set_item(self);
        }

        // OK, now we're parsing a range, so bump past the `-` and parse the
        // second half of the range.
        if !self.bump_and_bump_space() {
            return Err(self.unclosed_class_error());
        }
        let prim2 = self.parse_set_class_item()?;
        let range = ast::ClassSetRange {
            span: Span::new(prim1.span().start, prim2.span().end),
            // Both endpoints must be simple literals; `into_class_literal`
            // errors otherwise (e.g., `[\w-a]`).
            start: prim1.into_class_literal(self)?,
            end: prim2.into_class_literal(self)?,
        };
        if !range.is_valid() {
            return Err(
                self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
            );
        }
        Ok(ast::ClassSetItem::Range(range))
    }

    /// Parse a single item in a character class as a primitive, where the
    /// primitive either consists of a verbatim literal or a single escape
    /// sequence.
    ///
    /// This assumes the parser is positioned at the beginning of a primitive,
    /// and advances the parser to the first position after the primitive if
    /// successful.
    ///
    /// Note that it is the caller's responsibility to report an error if an
    /// illegal primitive was parsed.
    #[inline(never)]
    fn parse_set_class_item(&self) -> Result<Primitive> {
        if self.char() == '\\' {
            self.parse_escape()
        } else {
            let x = Primitive::Literal(ast::Literal {
                span: self.span_char(),
                kind: ast::LiteralKind::Verbatim,
                c: self.char(),
            });
            self.bump();
            Ok(x)
        }
    }

    /// Parses the opening of a character class set. This includes the opening
    /// bracket along with `^` if present to indicate negation. This also
    /// starts parsing the opening set of unioned items if applicable, since
    /// there are special rules applied to certain characters in the opening
    /// of a character class. For example, `[^]]` is the class of all
    /// characters not equal to `]`. (`]` would need to be escaped in any other
    /// position.) Similarly for `-`.
    ///
    /// In all cases, the op inside the returned `ast::ClassBracketed` is an
    /// empty union. This empty union should be replaced with the actual item
    /// when it is popped from the parser's stack.
    ///
    /// This assumes the parser is positioned at the opening `[` and advances
    /// the parser to the first non-special byte of the character class.
    ///
    /// An error is returned if EOF is found.
    #[inline(never)]
    fn parse_set_class_open(
        &self,
    ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
        assert_eq!(self.char(), '[');
        let start = self.pos();
        if !self.bump_and_bump_space() {
            return Err(self.error(
                Span::new(start, self.pos()),
                ast::ErrorKind::ClassUnclosed,
            ));
        }

        // A leading `^` negates the class.
        let negated = if self.char() != '^' {
            false
        } else {
            if !self.bump_and_bump_space() {
                return Err(self.error(
                    Span::new(start, self.pos()),
                    ast::ErrorKind::ClassUnclosed,
                ));
            }
            true
        };
        // Accept any number of `-` as literal `-`.
        let mut union =
            ast::ClassSetUnion { span: self.span(), items: vec![] };
        while self.char() == '-' {
            union.push(ast::ClassSetItem::Literal(ast::Literal {
                span: self.span_char(),
                kind: ast::LiteralKind::Verbatim,
                c: '-',
            }));
            if !self.bump_and_bump_space() {
                return Err(self.error(
                    Span::new(start, start),
                    ast::ErrorKind::ClassUnclosed,
                ));
            }
        }
        // If `]` is the *first* char in a set, then interpret it as a literal
        // `]`. That is, an empty class is impossible to write.
        if union.items.is_empty() && self.char() == ']' {
            union.push(ast::ClassSetItem::Literal(ast::Literal {
                span: self.span_char(),
                kind: ast::LiteralKind::Verbatim,
                c: ']',
            }));
            if !self.bump_and_bump_space() {
                return Err(self.error(
                    Span::new(start, self.pos()),
                    ast::ErrorKind::ClassUnclosed,
                ));
            }
        }
        let set = ast::ClassBracketed {
            span: Span::new(start, self.pos()),
            negated,
            // Placeholder empty union; the real contents are filled in when
            // this class is popped off the parser's stack.
            kind: ast::ClassSet::union(ast::ClassSetUnion {
                span: Span::new(union.span.start, union.span.start),
                items: vec![],
            }),
        };
        Ok((set, union))
    }

    /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
    ///
    /// This assumes the parser is positioned at the opening `[`.
    ///
    /// If no valid ASCII character class could be found, then this does not
    /// advance the parser and `None` is returned. Otherwise, the parser is
    /// advanced to the first byte following the closing `]` and the
    /// corresponding ASCII class is returned.
    #[inline(never)]
    fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
        // ASCII character classes are interesting from a parsing perspective
        // because parsing cannot fail with any interesting error. For example,
        // in order to use an ASCII character class, it must be enclosed in
        // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
        // of it as "ASCII character classes have the syntax `[:NAME:]` which
        // can only appear within character brackets." This means that things
        // like `[[:lower:]A]` are legal constructs.
        //
        // However, if one types an incorrect ASCII character class, e.g.,
        // `[[:loower:]]`, then we treat that as a normal nested character
        // class containing the characters `:elorw`. One might argue that we
        // should return an error instead since the repeated colons give away
        // the intent to write an ASCII class. But what if the user typed
        // `[[:lower]]` instead? How can we tell that was intended to be an
        // ASCII class and not just a normal nested class?
        //
        // Reasonable people can probably disagree over this, but for better
        // or worse, we implement semantics that never fails at the expense
        // of better failure modes.
        assert_eq!(self.char(), '[');
        // If parsing fails, then we back up the parser to this starting point.
        let start = self.pos();
        let mut negated = false;
        if !self.bump() || self.char() != ':' {
            self.parser().pos.set(start);
            return None;
        }
        if !self.bump() {
            self.parser().pos.set(start);
            return None;
        }
        // `[:^name:]` negates the class.
        if self.char() == '^' {
            negated = true;
            if !self.bump() {
                self.parser().pos.set(start);
                return None;
            }
        }
        let name_start = self.offset();
        while self.char() != ':' && self.bump() {}
        if self.is_eof() {
            self.parser().pos.set(start);
            return None;
        }
        let name = &self.pattern()[name_start..self.offset()];
        if !self.bump_if(":]") {
            self.parser().pos.set(start);
            return None;
        }
        let kind = match ast::ClassAsciiKind::from_name(name) {
            Some(kind) => kind,
            None => {
                // Unknown name: rewind so this parses as a nested class.
                self.parser().pos.set(start);
                return None;
            }
        };
        Some(ast::ClassAscii {
            span: Span::new(start, self.pos()),
            kind,
            negated,
        })
    }

    /// Parse a Unicode class in either the single character notation, `\pN`
    /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
    /// the parser is positioned at the `p` (or `P` for negation) and will
    /// advance the parser to the character immediately following the class.
    ///
    /// Note that this does not check whether the class name is valid or not.
    #[inline(never)]
    fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
        assert!(self.char() == 'p' || self.char() == 'P');

        let mut scratch = self.parser().scratch.borrow_mut();
        scratch.clear();

        let negated = self.char() == 'P';
        if !self.bump_and_bump_space() {
            return Err(
                self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
            );
        }
        let (start, kind) = if self.char() == '{' {
            // Bracketed form: collect everything up to the closing `}`.
            let start = self.span_char().end;
            while self.bump_and_bump_space() && self.char() != '}' {
                scratch.push(self.char());
            }
            if self.is_eof() {
                return Err(self
                    .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
            }
            assert_eq!(self.char(), '}');
            self.bump();

            // The contents may be a bare name, or a `name OP value` pair
            // using one of the operators `!=`, `:` or `=`. `!=` must be
            // checked first since `=` is a suffix of it.
            let name = scratch.as_str();
            if let Some(i) = name.find("!=") {
                (
                    start,
                    ast::ClassUnicodeKind::NamedValue {
                        op: ast::ClassUnicodeOpKind::NotEqual,
                        name: name[..i].to_string(),
                        value: name[i + 2..].to_string(),
                    },
                )
            } else if let Some(i) = name.find(':') {
                (
                    start,
                    ast::ClassUnicodeKind::NamedValue {
                        op: ast::ClassUnicodeOpKind::Colon,
                        name: name[..i].to_string(),
                        value: name[i + 1..].to_string(),
                    },
                )
            } else if let Some(i) = name.find('=') {
                (
                    start,
                    ast::ClassUnicodeKind::NamedValue {
                        op: ast::ClassUnicodeOpKind::Equal,
                        name: name[..i].to_string(),
                        value: name[i + 1..].to_string(),
                    },
                )
            } else {
                (start, ast::ClassUnicodeKind::Named(name.to_string()))
            }
        } else {
            // One-letter form, e.g., `\pN`. A `\` here is rejected.
            let start = self.pos();
            let c = self.char();
            if c == '\\' {
                return Err(self.error(
                    self.span_char(),
                    ast::ErrorKind::UnicodeClassInvalid,
                ));
            }
            self.bump_and_bump_space();
            let kind = ast::ClassUnicodeKind::OneLetter(c);
            (start, kind)
        };
        Ok(ast::ClassUnicode {
            span: Span::new(start, self.pos()),
            negated,
            kind,
        })
    }

    /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
    /// parser is currently at a valid character class name and will be
    /// advanced to the character immediately following the class.
#[inline(never)] fn parse_perl_class(&self) -> ast::ClassPerl { let c = self.char(); let span = self.span_char(); self.bump(); let (negated, kind) = match c { 'd' => (false, ast::ClassPerlKind::Digit), 'D' => (true, ast::ClassPerlKind::Digit), 's' => (false, ast::ClassPerlKind::Space), 'S' => (true, ast::ClassPerlKind::Space), 'w' => (false, ast::ClassPerlKind::Word), 'W' => (true, ast::ClassPerlKind::Word), c => panic!("expected valid Perl class but got '{}'", c), }; ast::ClassPerl { span, kind, negated } } } /// A type that traverses a fully parsed Ast and checks whether its depth /// exceeds the specified nesting limit. If it does, then an error is returned. #[derive(Debug)] struct NestLimiter<'p, 's, P> { /// The parser that is checking the nest limit. p: &'p ParserI<'s, P>, /// The current depth while walking an Ast. depth: u32, } impl<'p, 's, P: Borrow> NestLimiter<'p, 's, P> { fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> { NestLimiter { p, depth: 0 } } #[inline(never)] fn check(self, ast: &Ast) -> Result<()> { ast::visit(ast, self) } fn increment_depth(&mut self, span: &Span) -> Result<()> { let new = self.depth.checked_add(1).ok_or_else(|| { self.p.error( span.clone(), ast::ErrorKind::NestLimitExceeded(u32::MAX), ) })?; let limit = self.p.parser().nest_limit; if new > limit { return Err(self.p.error( span.clone(), ast::ErrorKind::NestLimitExceeded(limit), )); } self.depth = new; Ok(()) } fn decrement_depth(&mut self) { // Assuming the correctness of the visitor, this should never drop // below 0. 
self.depth = self.depth.checked_sub(1).unwrap(); } } impl<'p, 's, P: Borrow> ast::Visitor for NestLimiter<'p, 's, P> { type Output = (); type Err = ast::Error; fn finish(self) -> Result<()> { Ok(()) } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { let span = match *ast { Ast::Empty(_) | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) | Ast::ClassUnicode(_) | Ast::ClassPerl(_) => { // These are all base cases, so we don't increment depth. return Ok(()); } Ast::ClassBracketed(ref x) => &x.span, Ast::Repetition(ref x) => &x.span, Ast::Group(ref x) => &x.span, Ast::Alternation(ref x) => &x.span, Ast::Concat(ref x) => &x.span, }; self.increment_depth(span) } fn visit_post(&mut self, ast: &Ast) -> Result<()> { match *ast { Ast::Empty(_) | Ast::Flags(_) | Ast::Literal(_) | Ast::Dot(_) | Ast::Assertion(_) | Ast::ClassUnicode(_) | Ast::ClassPerl(_) => { // These are all base cases, so we don't decrement depth. Ok(()) } Ast::ClassBracketed(_) | Ast::Repetition(_) | Ast::Group(_) | Ast::Alternation(_) | Ast::Concat(_) => { self.decrement_depth(); Ok(()) } } } fn visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()> { let span = match *ast { ast::ClassSetItem::Empty(_) | ast::ClassSetItem::Literal(_) | ast::ClassSetItem::Range(_) | ast::ClassSetItem::Ascii(_) | ast::ClassSetItem::Unicode(_) | ast::ClassSetItem::Perl(_) => { // These are all base cases, so we don't increment depth. return Ok(()); } ast::ClassSetItem::Bracketed(ref x) => &x.span, ast::ClassSetItem::Union(ref x) => &x.span, }; self.increment_depth(span) } fn visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()> { match *ast { ast::ClassSetItem::Empty(_) | ast::ClassSetItem::Literal(_) | ast::ClassSetItem::Range(_) | ast::ClassSetItem::Ascii(_) | ast::ClassSetItem::Unicode(_) | ast::ClassSetItem::Perl(_) => { // These are all base cases, so we don't decrement depth. 
Ok(()) } ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => { self.decrement_depth(); Ok(()) } } } fn visit_class_set_binary_op_pre( &mut self, ast: &ast::ClassSetBinaryOp, ) -> Result<()> { self.increment_depth(&ast.span) } fn visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<()> { self.decrement_depth(); Ok(()) } } /// When the result is an error, transforms the ast::ErrorKind from the source /// Result into another one. This function is used to return clearer error /// messages when possible. fn specialize_err( result: Result, from: ast::ErrorKind, to: ast::ErrorKind, ) -> Result { if let Err(e) = result { if e.kind == from { Err(ast::Error { kind: to, pattern: e.pattern, span: e.span }) } else { Err(e) } } else { result } } #[cfg(test)] mod tests { use core::ops::Range; use alloc::format; use crate::ast::{self, Ast, Position, Span}; use super::*; // Our own assert_eq, which has slightly better formatting (but honestly // still kind of crappy). macro_rules! assert_eq { ($left:expr, $right:expr) => {{ match (&$left, &$right) { (left_val, right_val) => { if !(*left_val == *right_val) { panic!( "assertion failed: `(left == right)`\n\n\ left: `{:?}`\nright: `{:?}`\n\n", left_val, right_val ) } } } }}; } // We create these errors to compare with real ast::Errors in the tests. // We define equality between TestError and ast::Error to disregard the // pattern string in ast::Error, which is annoying to provide in tests. 
#[derive(Clone, Debug)] struct TestError { span: Span, kind: ast::ErrorKind, } impl PartialEq for TestError { fn eq(&self, other: &ast::Error) -> bool { self.span == other.span && self.kind == other.kind } } impl PartialEq for ast::Error { fn eq(&self, other: &TestError) -> bool { self.span == other.span && self.kind == other.kind } } fn s(str: &str) -> String { str.to_string() } fn parser(pattern: &str) -> ParserI<'_, Parser> { ParserI::new(Parser::new(), pattern) } fn parser_octal(pattern: &str) -> ParserI<'_, Parser> { let parser = ParserBuilder::new().octal(true).build(); ParserI::new(parser, pattern) } fn parser_nest_limit( pattern: &str, nest_limit: u32, ) -> ParserI<'_, Parser> { let p = ParserBuilder::new().nest_limit(nest_limit).build(); ParserI::new(p, pattern) } fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> { let p = ParserBuilder::new().ignore_whitespace(true).build(); ParserI::new(p, pattern) } /// Short alias for creating a new span. fn nspan(start: Position, end: Position) -> Span { Span::new(start, end) } /// Short alias for creating a new position. fn npos(offset: usize, line: usize, column: usize) -> Position { Position::new(offset, line, column) } /// Create a new span from the given offset range. This assumes a single /// line and sets the columns based on the offsets. i.e., This only works /// out of the box for ASCII, which is fine for most tests. fn span(range: Range) -> Span { let start = Position::new(range.start, 1, range.start + 1); let end = Position::new(range.end, 1, range.end + 1); Span::new(start, end) } /// Create a new span for the corresponding byte range in the given string. 
fn span_range(subject: &str, range: Range) -> Span { let start = Position { offset: range.start, line: 1 + subject[..range.start].matches('\n').count(), column: 1 + subject[..range.start] .chars() .rev() .position(|c| c == '\n') .unwrap_or(subject[..range.start].chars().count()), }; let end = Position { offset: range.end, line: 1 + subject[..range.end].matches('\n').count(), column: 1 + subject[..range.end] .chars() .rev() .position(|c| c == '\n') .unwrap_or(subject[..range.end].chars().count()), }; Span::new(start, end) } /// Create a verbatim literal starting at the given position. fn lit(c: char, start: usize) -> Ast { lit_with(c, span(start..start + c.len_utf8())) } /// Create a meta literal starting at the given position. fn meta_lit(c: char, span: Span) -> Ast { Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Meta, c }) } /// Create a verbatim literal with the given span. fn lit_with(c: char, span: Span) -> Ast { Ast::literal(ast::Literal { span, kind: ast::LiteralKind::Verbatim, c, }) } /// Create a concatenation with the given range. fn concat(range: Range, asts: Vec) -> Ast { concat_with(span(range), asts) } /// Create a concatenation with the given span. fn concat_with(span: Span, asts: Vec) -> Ast { Ast::concat(ast::Concat { span, asts }) } /// Create an alternation with the given span. fn alt(range: Range, asts: Vec) -> Ast { Ast::alternation(ast::Alternation { span: span(range), asts }) } /// Create a capturing group with the given span. fn group(range: Range, index: u32, ast: Ast) -> Ast { Ast::group(ast::Group { span: span(range), kind: ast::GroupKind::CaptureIndex(index), ast: Box::new(ast), }) } /// Create an ast::SetFlags. /// /// The given pattern should be the full pattern string. The range given /// should correspond to the byte offsets where the flag set occurs. /// /// If negated is true, then the set is interpreted as beginning with a /// negation. 
fn flag_set( pat: &str, range: Range, flag: ast::Flag, negated: bool, ) -> Ast { let mut items = vec![ast::FlagsItem { span: span_range(pat, (range.end - 2)..(range.end - 1)), kind: ast::FlagsItemKind::Flag(flag), }]; if negated { items.insert( 0, ast::FlagsItem { span: span_range(pat, (range.start + 2)..(range.end - 2)), kind: ast::FlagsItemKind::Negation, }, ); } Ast::flags(ast::SetFlags { span: span_range(pat, range.clone()), flags: ast::Flags { span: span_range(pat, (range.start + 2)..(range.end - 1)), items, }, }) } #[test] fn parse_nest_limit() { // A nest limit of 0 still allows some types of regexes. assert_eq!( parser_nest_limit("", 0).parse(), Ok(Ast::empty(span(0..0))) ); assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0))); // Test repetition operations, which require one level of nesting. assert_eq!( parser_nest_limit("a+", 0).parse().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::NestLimitExceeded(0), } ); assert_eq!( parser_nest_limit("a+", 1).parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), kind: ast::RepetitionKind::OneOrMore, }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser_nest_limit("(a)+", 1).parse().unwrap_err(), TestError { span: span(0..3), kind: ast::ErrorKind::NestLimitExceeded(1), } ); assert_eq!( parser_nest_limit("a+*", 1).parse().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::NestLimitExceeded(1), } ); assert_eq!( parser_nest_limit("a+*", 2).parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrMore, }, greedy: true, ast: Box::new(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), kind: ast::RepetitionKind::OneOrMore, }, greedy: true, ast: Box::new(lit('a', 0)), })), })) ); // Test concatenations. A concatenation requires one level of nesting. 
assert_eq!( parser_nest_limit("ab", 0).parse().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::NestLimitExceeded(0), } ); assert_eq!( parser_nest_limit("ab", 1).parse(), Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])) ); assert_eq!( parser_nest_limit("abc", 1).parse(), Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])) ); // Test alternations. An alternation requires one level of nesting. assert_eq!( parser_nest_limit("a|b", 0).parse().unwrap_err(), TestError { span: span(0..3), kind: ast::ErrorKind::NestLimitExceeded(0), } ); assert_eq!( parser_nest_limit("a|b", 1).parse(), Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])) ); assert_eq!( parser_nest_limit("a|b|c", 1).parse(), Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])) ); // Test character classes. Classes form their own mini-recursive // syntax! assert_eq!( parser_nest_limit("[a]", 0).parse().unwrap_err(), TestError { span: span(0..3), kind: ast::ErrorKind::NestLimitExceeded(0), } ); assert_eq!( parser_nest_limit("[a]", 1).parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::Item(ast::ClassSetItem::Literal( ast::Literal { span: span(1..2), kind: ast::LiteralKind::Verbatim, c: 'a', } )), })) ); assert_eq!( parser_nest_limit("[ab]", 1).parse().unwrap_err(), TestError { span: span(1..3), kind: ast::ErrorKind::NestLimitExceeded(1), } ); assert_eq!( parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(), TestError { span: span(3..7), kind: ast::ErrorKind::NestLimitExceeded(2), } ); assert_eq!( parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(), TestError { span: span(4..6), kind: ast::ErrorKind::NestLimitExceeded(3), } ); assert_eq!( parser_nest_limit("[a--b]", 1).parse().unwrap_err(), TestError { span: span(1..5), kind: ast::ErrorKind::NestLimitExceeded(1), } ); assert_eq!( parser_nest_limit("[a--bc]", 2).parse().unwrap_err(), TestError { span: span(4..6), kind: ast::ErrorKind::NestLimitExceeded(2), } ); } #[test] fn 
parse_comments() { let pat = "(?x) # This is comment 1. foo # This is comment 2. # This is comment 3. bar # This is comment 4."; let astc = parser(pat).parse_with_comments().unwrap(); assert_eq!( astc.ast, concat_with( span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), lit_with('f', span_range(pat, 26..27)), lit_with('o', span_range(pat, 27..28)), lit_with('o', span_range(pat, 28..29)), lit_with('b', span_range(pat, 74..75)), lit_with('a', span_range(pat, 75..76)), lit_with('r', span_range(pat, 76..77)), ] ) ); assert_eq!( astc.comments, vec![ ast::Comment { span: span_range(pat, 5..26), comment: s(" This is comment 1."), }, ast::Comment { span: span_range(pat, 30..51), comment: s(" This is comment 2."), }, ast::Comment { span: span_range(pat, 53..74), comment: s(" This is comment 3."), }, ast::Comment { span: span_range(pat, 78..98), comment: s(" This is comment 4."), }, ] ); } #[test] fn parse_holistic() { assert_eq!(parser("]").parse(), Ok(lit(']', 0))); assert_eq!( parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(), Ok(concat( 0..36, vec![ meta_lit('\\', span(0..2)), meta_lit('.', span(2..4)), meta_lit('+', span(4..6)), meta_lit('*', span(6..8)), meta_lit('?', span(8..10)), meta_lit('(', span(10..12)), meta_lit(')', span(12..14)), meta_lit('|', span(14..16)), meta_lit('[', span(16..18)), meta_lit(']', span(18..20)), meta_lit('{', span(20..22)), meta_lit('}', span(22..24)), meta_lit('^', span(24..26)), meta_lit('$', span(26..28)), meta_lit('#', span(28..30)), meta_lit('&', span(30..32)), meta_lit('-', span(32..34)), meta_lit('~', span(34..36)), ] )) ); } #[test] fn parse_ignore_whitespace() { // Test that basic whitespace insensitivity works. 
let pat = "(?x)a b"; assert_eq!( parser(pat).parse(), Ok(concat_with( nspan(npos(0, 1, 1), npos(7, 1, 8)), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), ] )) ); // Test that we can toggle whitespace insensitivity. let pat = "(?x)a b(?-x)a b"; assert_eq!( parser(pat).parse(), Ok(concat_with( nspan(npos(0, 1, 1), npos(15, 1, 16)), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))), flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true), lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))), lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))), lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))), ] )) ); // Test that nesting whitespace insensitive flags works. let pat = "a (?x:a )a "; assert_eq!( parser(pat).parse(), Ok(concat_with( span_range(pat, 0..11), vec![ lit_with('a', span_range(pat, 0..1)), lit_with(' ', span_range(pat, 1..2)), Ast::group(ast::Group { span: span_range(pat, 2..9), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 4..5), items: vec![ast::FlagsItem { span: span_range(pat, 4..5), kind: ast::FlagsItemKind::Flag( ast::Flag::IgnoreWhitespace ), },], }), ast: Box::new(lit_with('a', span_range(pat, 6..7))), }), lit_with('a', span_range(pat, 9..10)), lit_with(' ', span_range(pat, 10..11)), ] )) ); // Test that whitespace after an opening paren is insignificant. 
let pat = "(?x)( ?P a )"; assert_eq!( parser(pat).parse(), Ok(concat_with( span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureName { starts_with_p: true, name: ast::CaptureName { span: span_range(pat, 9..12), name: s("foo"), index: 1, } }, ast: Box::new(lit_with('a', span_range(pat, 14..15))), }), ] )) ); let pat = "(?x)( a )"; assert_eq!( parser(pat).parse(), Ok(concat_with( span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::CaptureIndex(1), ast: Box::new(lit_with('a', span_range(pat, 7..8))), }), ] )) ); let pat = "(?x)( ?: a )"; assert_eq!( parser(pat).parse(), Ok(concat_with( span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::group(ast::Group { span: span_range(pat, 4..pat.len()), kind: ast::GroupKind::NonCapturing(ast::Flags { span: span_range(pat, 8..8), items: vec![], }), ast: Box::new(lit_with('a', span_range(pat, 11..12))), }), ] )) ); let pat = r"(?x)\x { 53 }"; assert_eq!( parser(pat).parse(), Ok(concat_with( span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::literal(ast::Literal { span: span(4..13), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::X ), c: 'S', }), ] )) ); // Test that whitespace after an escape is OK. 
let pat = r"(?x)\ "; assert_eq!( parser(pat).parse(), Ok(concat_with( span_range(pat, 0..pat.len()), vec![ flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false), Ast::literal(ast::Literal { span: span_range(pat, 4..6), kind: ast::LiteralKind::Superfluous, c: ' ', }), ] )) ); } #[test] fn parse_newlines() { let pat = ".\n."; assert_eq!( parser(pat).parse(), Ok(concat_with( span_range(pat, 0..3), vec![ Ast::dot(span_range(pat, 0..1)), lit_with('\n', span_range(pat, 1..2)), Ast::dot(span_range(pat, 2..3)), ] )) ); let pat = "foobar\nbaz\nquux\n"; assert_eq!( parser(pat).parse(), Ok(concat_with( span_range(pat, 0..pat.len()), vec![ lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))), lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))), lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))), lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))), lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))), lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))), lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))), lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))), lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))), lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))), lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))), lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))), lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))), lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))), lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))), lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))), ] )) ); } #[test] fn parse_uncounted_repetition() { assert_eq!( parser(r"a*").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), kind: ast::RepetitionKind::ZeroOrMore, }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"a+").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), kind: ast::RepetitionKind::OneOrMore, }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( 
parser(r"a?").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), kind: ast::RepetitionKind::ZeroOrOne, }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"a??").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), kind: ast::RepetitionKind::ZeroOrOne, }, greedy: false, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"a?").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), kind: ast::RepetitionKind::ZeroOrOne, }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"a?b").parse(), Ok(concat( 0..3, vec![ Ast::repetition(ast::Repetition { span: span(0..2), op: ast::RepetitionOp { span: span(1..2), kind: ast::RepetitionKind::ZeroOrOne, }, greedy: true, ast: Box::new(lit('a', 0)), }), lit('b', 2), ] )) ); assert_eq!( parser(r"a??b").parse(), Ok(concat( 0..4, vec![ Ast::repetition(ast::Repetition { span: span(0..3), op: ast::RepetitionOp { span: span(1..3), kind: ast::RepetitionKind::ZeroOrOne, }, greedy: false, ast: Box::new(lit('a', 0)), }), lit('b', 3), ] )) ); assert_eq!( parser(r"ab?").parse(), Ok(concat( 0..3, vec![ lit('a', 0), Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrOne, }, greedy: true, ast: Box::new(lit('b', 1)), }), ] )) ); assert_eq!( parser(r"(ab)?").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(4..5), kind: ast::RepetitionKind::ZeroOrOne, }, greedy: true, ast: Box::new(group( 0..4, 1, concat(1..3, vec![lit('a', 1), lit('b', 2),]) )), })) ); assert_eq!( parser(r"|a?").parse(), Ok(alt( 0..3, vec![ Ast::empty(span(0..0)), Ast::repetition(ast::Repetition { span: span(1..3), op: ast::RepetitionOp { span: span(2..3), kind: ast::RepetitionKind::ZeroOrOne, }, greedy: true, ast: Box::new(lit('a', 1)), }), ] )) ); assert_eq!( 
parser(r"*").parse().unwrap_err(), TestError { span: span(0..0), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"(?i)*").parse().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"(*)").parse().unwrap_err(), TestError { span: span(1..1), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"(?:?)").parse().unwrap_err(), TestError { span: span(3..3), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"+").parse().unwrap_err(), TestError { span: span(0..0), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"?").parse().unwrap_err(), TestError { span: span(0..0), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"(?)").parse().unwrap_err(), TestError { span: span(1..1), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"|*").parse().unwrap_err(), TestError { span: span(1..1), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"|+").parse().unwrap_err(), TestError { span: span(1..1), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"|?").parse().unwrap_err(), TestError { span: span(1..1), kind: ast::ErrorKind::RepetitionMissing, } ); } #[test] fn parse_counted_repetition() { assert_eq!( parser(r"a{5}").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..4), op: ast::RepetitionOp { span: span(1..4), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Exactly(5) ), }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"a{5,}").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), kind: ast::RepetitionKind::Range( ast::RepetitionRange::AtLeast(5) ), }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"a{5,9}").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Bounded(5, 9) ), }, 
greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"a{5}?").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..5), op: ast::RepetitionOp { span: span(1..5), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Exactly(5) ), }, greedy: false, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"ab{5}").parse(), Ok(concat( 0..5, vec![ lit('a', 0), Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Exactly(5) ), }, greedy: true, ast: Box::new(lit('b', 1)), }), ] )) ); assert_eq!( parser(r"ab{5}c").parse(), Ok(concat( 0..6, vec![ lit('a', 0), Ast::repetition(ast::Repetition { span: span(1..5), op: ast::RepetitionOp { span: span(2..5), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Exactly(5) ), }, greedy: true, ast: Box::new(lit('b', 1)), }), lit('c', 5), ] )) ); assert_eq!( parser(r"a{ 5 }").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..6), op: ast::RepetitionOp { span: span(1..6), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Exactly(5) ), }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"a{ 5 , 9 }").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..10), op: ast::RepetitionOp { span: span(1..10), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Bounded(5, 9) ), }, greedy: true, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser_ignore_whitespace(r"a{5,9} ?").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..8), op: ast::RepetitionOp { span: span(1..8), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Bounded(5, 9) ), }, greedy: false, ast: Box::new(lit('a', 0)), })) ); assert_eq!( parser(r"\b{5,9}").parse(), Ok(Ast::repetition(ast::Repetition { span: span(0..7), op: ast::RepetitionOp { span: span(2..7), kind: ast::RepetitionKind::Range( ast::RepetitionRange::Bounded(5, 9) ), }, greedy: true, ast: Box::new(Ast::assertion(ast::Assertion { span: 
span(0..2), kind: ast::AssertionKind::WordBoundary, })), })) ); assert_eq!( parser(r"(?i){0}").parse().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"(?m){1,1}").parse().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"a{]}").parse().unwrap_err(), TestError { span: span(2..2), kind: ast::ErrorKind::RepetitionCountDecimalEmpty, } ); assert_eq!( parser(r"a{1,]}").parse().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::RepetitionCountDecimalEmpty, } ); assert_eq!( parser(r"a{").parse().unwrap_err(), TestError { span: span(1..2), kind: ast::ErrorKind::RepetitionCountUnclosed, } ); assert_eq!( parser(r"a{}").parse().unwrap_err(), TestError { span: span(2..2), kind: ast::ErrorKind::RepetitionCountDecimalEmpty, } ); assert_eq!( parser(r"a{a").parse().unwrap_err(), TestError { span: span(2..2), kind: ast::ErrorKind::RepetitionCountDecimalEmpty, } ); assert_eq!( parser(r"a{9999999999}").parse().unwrap_err(), TestError { span: span(2..12), kind: ast::ErrorKind::DecimalInvalid, } ); assert_eq!( parser(r"a{9").parse().unwrap_err(), TestError { span: span(1..3), kind: ast::ErrorKind::RepetitionCountUnclosed, } ); assert_eq!( parser(r"a{9,a").parse().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::RepetitionCountDecimalEmpty, } ); assert_eq!( parser(r"a{9,9999999999}").parse().unwrap_err(), TestError { span: span(4..14), kind: ast::ErrorKind::DecimalInvalid, } ); assert_eq!( parser(r"a{9,").parse().unwrap_err(), TestError { span: span(1..4), kind: ast::ErrorKind::RepetitionCountUnclosed, } ); assert_eq!( parser(r"a{9,11").parse().unwrap_err(), TestError { span: span(1..6), kind: ast::ErrorKind::RepetitionCountUnclosed, } ); assert_eq!( parser(r"a{2,1}").parse().unwrap_err(), TestError { span: span(1..6), kind: ast::ErrorKind::RepetitionCountInvalid, } ); assert_eq!( parser(r"{5}").parse().unwrap_err(), TestError { span: 
// NOTE(review): this region is whitespace-collapsed (original newlines lost); code is kept
// byte-identical. It holds the tail of a `RepetitionMissing` error case, then
// `parse_alternate`: alternation ASTs (`a|b`, nesting, empty branches) with exact spans,
// `GroupUnopened`/`GroupUnclosed` errors, `parse_unsupported_lookaround`, and the start of
// the capture-name group tests.
span(0..0), kind: ast::ErrorKind::RepetitionMissing, } ); assert_eq!( parser(r"|{5}").parse().unwrap_err(), TestError { span: span(1..1), kind: ast::ErrorKind::RepetitionMissing, } ); } #[test] fn parse_alternate() { assert_eq!( parser(r"a|b").parse(), Ok(Ast::alternation(ast::Alternation { span: span(0..3), asts: vec![lit('a', 0), lit('b', 2)], })) ); assert_eq!( parser(r"(a|b)").parse(), Ok(group( 0..5, 1, Ast::alternation(ast::Alternation { span: span(1..4), asts: vec![lit('a', 1), lit('b', 3)], }) )) ); assert_eq!( parser(r"a|b|c").parse(), Ok(Ast::alternation(ast::Alternation { span: span(0..5), asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)], })) ); assert_eq!( parser(r"ax|by|cz").parse(), Ok(Ast::alternation(ast::Alternation { span: span(0..8), asts: vec![ concat(0..2, vec![lit('a', 0), lit('x', 1)]), concat(3..5, vec![lit('b', 3), lit('y', 4)]), concat(6..8, vec![lit('c', 6), lit('z', 7)]), ], })) ); assert_eq!( parser(r"(ax|by|cz)").parse(), Ok(group( 0..10, 1, Ast::alternation(ast::Alternation { span: span(1..9), asts: vec![ concat(1..3, vec![lit('a', 1), lit('x', 2)]), concat(4..6, vec![lit('b', 4), lit('y', 5)]), concat(7..9, vec![lit('c', 7), lit('z', 8)]), ], }) )) ); assert_eq!( parser(r"(ax|(by|(cz)))").parse(), Ok(group( 0..14, 1, alt( 1..13, vec![ concat(1..3, vec![lit('a', 1), lit('x', 2)]), group( 4..13, 2, alt( 5..12, vec![ concat( 5..7, vec![lit('b', 5), lit('y', 6)] ), group( 8..12, 3, concat( 9..11, vec![lit('c', 9), lit('z', 10),] ) ), ] ) ), ] ) )) ); assert_eq!( parser(r"|").parse(), Ok(alt( 0..1, vec![Ast::empty(span(0..0)), Ast::empty(span(1..1)),] )) ); assert_eq!( parser(r"||").parse(), Ok(alt( 0..2, vec![ Ast::empty(span(0..0)), Ast::empty(span(1..1)), Ast::empty(span(2..2)), ] )) ); assert_eq!( parser(r"a|").parse(), Ok(alt(0..2, vec![lit('a', 0), Ast::empty(span(2..2)),])) ); assert_eq!( parser(r"|a").parse(), Ok(alt(0..2, vec![Ast::empty(span(0..0)), lit('a', 1),])) ); assert_eq!( parser(r"(|)").parse(), Ok(group( 0..3, 1, alt(
// Grouped empty-branch alternations `(|)`, `(a|)`, `(|a)`, then group-balance errors.
// NOTE(review): pattern strings like `(?z)` below appear to have lost `<...>` segments
// (upstream has `(?<a>z)` etc.) — markup stripping; compare against upstream regex-syntax
// tests before treating the literals as authoritative.
1..2, vec![Ast::empty(span(1..1)), Ast::empty(span(2..2)),] ) )) ); assert_eq!( parser(r"(a|)").parse(), Ok(group( 0..4, 1, alt(1..3, vec![lit('a', 1), Ast::empty(span(3..3)),]) )) ); assert_eq!( parser(r"(|a)").parse(), Ok(group( 0..4, 1, alt(1..3, vec![Ast::empty(span(1..1)), lit('a', 2),]) )) ); assert_eq!( parser(r"a|b)").parse().unwrap_err(), TestError { span: span(3..4), kind: ast::ErrorKind::GroupUnopened, } ); assert_eq!( parser(r"(a|b").parse().unwrap_err(), TestError { span: span(0..1), kind: ast::ErrorKind::GroupUnclosed, } ); } #[test] fn parse_unsupported_lookaround() { assert_eq!( parser(r"(?=a)").parse().unwrap_err(), TestError { span: span(0..3), kind: ast::ErrorKind::UnsupportedLookAround, } ); assert_eq!( parser(r"(?!a)").parse().unwrap_err(), TestError { span: span(0..3), kind: ast::ErrorKind::UnsupportedLookAround, } ); assert_eq!( parser(r"(?<=a)").parse().unwrap_err(), TestError { span: span(0..4), kind: ast::ErrorKind::UnsupportedLookAround, } ); assert_eq!( parser(r"(?z)").parse(), Ok(Ast::group(ast::Group { span: span(0..7), kind: ast::GroupKind::CaptureName { starts_with_p: false, name: ast::CaptureName { span: span(3..4), name: s("a"), index: 1, } }, ast: Box::new(lit('z', 5)), })) ); assert_eq!( parser("(?Pz)").parse(), Ok(Ast::group(ast::Group { span: span(0..8), kind: ast::GroupKind::CaptureName { starts_with_p: true, name: ast::CaptureName { span: span(4..5), name: s("a"), index: 1, } }, ast: Box::new(lit('z', 6)), })) ); assert_eq!( parser("(?Pz)").parse(), Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, name: ast::CaptureName { span: span(4..7), name: s("abc"), index: 1, } }, ast: Box::new(lit('z', 8)), })) ); assert_eq!( parser("(?Pz)").parse(), Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, name: ast::CaptureName { span: span(4..7), name: s("a_1"), index: 1, } }, ast: Box::new(lit('z', 8)), })) ); assert_eq!(
// Capture-name acceptance tests: dots, brackets, and non-ASCII (`a¾`, `名字`) in names,
// with multi-byte offsets checked via explicit `Position`s; then the error cases:
// unexpected EOF, empty name, duplicate name, and names starting with a digit.
// NOTE(review): several `parser("(?P…)")` literals here have lost their `<name>` segment
// to markup stripping (the expected `name: s("a.1")` / spans reveal the intent) — verify
// against upstream regex-syntax before relying on the raw pattern strings.
parser("(?Pz)").parse(), Ok(Ast::group(ast::Group { span: span(0..10), kind: ast::GroupKind::CaptureName { starts_with_p: true, name: ast::CaptureName { span: span(4..7), name: s("a.1"), index: 1, } }, ast: Box::new(lit('z', 8)), })) ); assert_eq!( parser("(?Pz)").parse(), Ok(Ast::group(ast::Group { span: span(0..11), kind: ast::GroupKind::CaptureName { starts_with_p: true, name: ast::CaptureName { span: span(4..8), name: s("a[1]"), index: 1, } }, ast: Box::new(lit('z', 9)), })) ); assert_eq!( parser("(?P)").parse(), Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(9, 1, 9), ), kind: ast::GroupKind::CaptureName { starts_with_p: true, name: ast::CaptureName { span: Span::new( Position::new(4, 1, 5), Position::new(7, 1, 7), ), name: s("a¾"), index: 1, } }, ast: Box::new(Ast::empty(Span::new( Position::new(8, 1, 8), Position::new(8, 1, 8), ))), })) ); assert_eq!( parser("(?P<名字>)").parse(), Ok(Ast::group(ast::Group { span: Span::new( Position::new(0, 1, 1), Position::new(12, 1, 9), ), kind: ast::GroupKind::CaptureName { starts_with_p: true, name: ast::CaptureName { span: Span::new( Position::new(4, 1, 5), Position::new(10, 1, 7), ), name: s("名字"), index: 1, } }, ast: Box::new(Ast::empty(Span::new( Position::new(11, 1, 8), Position::new(11, 1, 8), ))), })) ); assert_eq!( parser("(?P<").parse().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::GroupNameUnexpectedEof, } ); assert_eq!( parser("(?P<>z)").parse().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::GroupNameEmpty, } ); assert_eq!( parser("(?Py)(?Pz)").parse().unwrap_err(), TestError { span: span(12..13), kind: ast::ErrorKind::GroupNameDuplicate { original: span(4..5), }, } ); assert_eq!( parser("(?P<5>)").parse().unwrap_err(), TestError { span: span(4..5), kind: ast::ErrorKind::GroupNameInvalid, } ); assert_eq!( parser("(?P<5a>)").parse().unwrap_err(), TestError { span: span(4..5), kind: ast::ErrorKind::GroupNameInvalid, } ); assert_eq!(
// `GroupNameInvalid` for names that don't start with a word character (`¾`, `☃`),
// with byte-accurate multi-byte spans; then `parse_flags` begins: single flag before
// `:` or `)` terminators, and multi-flag runs like `isU`.
parser("(?P<¾>)").parse().unwrap_err(), TestError { span: Span::new( Position::new(4, 1, 5), Position::new(6, 1, 6), ), kind: ast::ErrorKind::GroupNameInvalid, } ); assert_eq!( parser("(?P<¾a>)").parse().unwrap_err(), TestError { span: Span::new( Position::new(4, 1, 5), Position::new(6, 1, 6), ), kind: ast::ErrorKind::GroupNameInvalid, } ); assert_eq!( parser("(?P<☃>)").parse().unwrap_err(), TestError { span: Span::new( Position::new(4, 1, 5), Position::new(7, 1, 6), ), kind: ast::ErrorKind::GroupNameInvalid, } ); assert_eq!( parser("(?P)").parse().unwrap_err(), TestError { span: Span::new( Position::new(5, 1, 6), Position::new(8, 1, 7), ), kind: ast::ErrorKind::GroupNameInvalid, } ); } #[test] fn parse_flags() { assert_eq!( parser("i:").parse_flags(), Ok(ast::Flags { span: span(0..1), items: vec![ast::FlagsItem { span: span(0..1), kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), }], }) ); assert_eq!( parser("i)").parse_flags(), Ok(ast::Flags { span: span(0..1), items: vec![ast::FlagsItem { span: span(0..1), kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive), }], }) ); assert_eq!( parser("isU:").parse_flags(), Ok(ast::Flags { span: span(0..3), items: vec![ ast::FlagsItem { span: span(0..1), kind: ast::FlagsItemKind::Flag( ast::Flag::CaseInsensitive ), }, ast::FlagsItem { span: span(1..2), kind: ast::FlagsItemKind::Flag( ast::Flag::DotMatchesNewLine ), }, ast::FlagsItem { span: span(2..3), kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), }, ], }) ); assert_eq!( parser("-isU:").parse_flags(), Ok(ast::Flags { span: span(0..4), items: vec![ ast::FlagsItem { span: span(0..1), kind: ast::FlagsItemKind::Negation, }, ast::FlagsItem { span: span(1..2), kind: ast::FlagsItemKind::Flag( ast::Flag::CaseInsensitive ), }, ast::FlagsItem { span: span(2..3), kind: ast::FlagsItemKind::Flag( ast::Flag::DotMatchesNewLine ), }, ast::FlagsItem { span: span(3..4), kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), }, ], }) ); assert_eq!(
// `parse_flags` continued: negation mid-run (`i-sU`, `i-sR` with CRLF flag), then the
// flag error cases — EOF before terminator, unrecognized flag, duplicate flag, repeated
// negation, and dangling `-` before `)`.
parser("i-sU:").parse_flags(), Ok(ast::Flags { span: span(0..4), items: vec![ ast::FlagsItem { span: span(0..1), kind: ast::FlagsItemKind::Flag( ast::Flag::CaseInsensitive ), }, ast::FlagsItem { span: span(1..2), kind: ast::FlagsItemKind::Negation, }, ast::FlagsItem { span: span(2..3), kind: ast::FlagsItemKind::Flag( ast::Flag::DotMatchesNewLine ), }, ast::FlagsItem { span: span(3..4), kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed), }, ], }) ); assert_eq!( parser("i-sR:").parse_flags(), Ok(ast::Flags { span: span(0..4), items: vec![ ast::FlagsItem { span: span(0..1), kind: ast::FlagsItemKind::Flag( ast::Flag::CaseInsensitive ), }, ast::FlagsItem { span: span(1..2), kind: ast::FlagsItemKind::Negation, }, ast::FlagsItem { span: span(2..3), kind: ast::FlagsItemKind::Flag( ast::Flag::DotMatchesNewLine ), }, ast::FlagsItem { span: span(3..4), kind: ast::FlagsItemKind::Flag(ast::Flag::CRLF), }, ], }) ); assert_eq!( parser("isU").parse_flags().unwrap_err(), TestError { span: span(3..3), kind: ast::ErrorKind::FlagUnexpectedEof, } ); assert_eq!( parser("isUa:").parse_flags().unwrap_err(), TestError { span: span(3..4), kind: ast::ErrorKind::FlagUnrecognized, } ); assert_eq!( parser("isUi:").parse_flags().unwrap_err(), TestError { span: span(3..4), kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) }, } ); assert_eq!( parser("i-sU-i:").parse_flags().unwrap_err(), TestError { span: span(4..5), kind: ast::ErrorKind::FlagRepeatedNegation { original: span(1..2), }, } ); assert_eq!( parser("-)").parse_flags().unwrap_err(), TestError { span: span(0..1), kind: ast::ErrorKind::FlagDanglingNegation, } ); assert_eq!( parser("i-)").parse_flags().unwrap_err(), TestError { span: span(1..2), kind: ast::ErrorKind::FlagDanglingNegation, } ); assert_eq!( parser("iU-)").parse_flags().unwrap_err(), TestError { span: span(2..3), kind: ast::ErrorKind::FlagDanglingNegation, } ); } #[test] fn parse_flag() { assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive));
// `parse_flag`: one assertion per supported flag char (m/s/U/u/R/x), plus
// `FlagUnrecognized` for `a` and a multi-byte `☃`; then `parse_primitive_non_escape`
// (dot, anchors, verbatim literals — note bare `|` is a literal via parse_primitive)
// and the start of `parse_escape` with the `\a \f \t \n \r \v` special-literal table.
assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine)); assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine)); assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed)); assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode)); assert_eq!(parser("R").parse_flag(), Ok(ast::Flag::CRLF)); assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace)); assert_eq!( parser("a").parse_flag().unwrap_err(), TestError { span: span(0..1), kind: ast::ErrorKind::FlagUnrecognized, } ); assert_eq!( parser("☃").parse_flag().unwrap_err(), TestError { span: span_range("☃", 0..3), kind: ast::ErrorKind::FlagUnrecognized, } ); } #[test] fn parse_primitive_non_escape() { assert_eq!( parser(r".").parse_primitive(), Ok(Primitive::Dot(span(0..1))) ); assert_eq!( parser(r"^").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..1), kind: ast::AssertionKind::StartLine, })) ); assert_eq!( parser(r"$").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..1), kind: ast::AssertionKind::EndLine, })) ); assert_eq!( parser(r"a").parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: span(0..1), kind: ast::LiteralKind::Verbatim, c: 'a', })) ); assert_eq!( parser(r"|").parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: span(0..1), kind: ast::LiteralKind::Verbatim, c: '|', })) ); assert_eq!( parser(r"☃").parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: span_range("☃", 0..3), kind: ast::LiteralKind::Verbatim, c: '☃', })) ); } #[test] fn parse_escape() { assert_eq!( parser(r"\|").parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: span(0..2), kind: ast::LiteralKind::Meta, c: '|', })) ); let specials = &[ (r"\a", '\x07', ast::SpecialLiteralKind::Bell), (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed), (r"\t", '\t', ast::SpecialLiteralKind::Tab), (r"\n", '\n', ast::SpecialLiteralKind::LineFeed), (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn), (r"\v", '\x0B',
// `parse_escape` continued: loop over the special-literal table, then every assertion
// escape — \A \z \b \b{start} \b{end} \b{start-half} \b{end-half} \< \> \B — each with
// its exact span length.
ast::SpecialLiteralKind::VerticalTab), ]; for &(pat, c, ref kind) in specials { assert_eq!( parser(pat).parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: span(0..2), kind: ast::LiteralKind::Special(kind.clone()), c, })) ); } assert_eq!( parser(r"\A").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..2), kind: ast::AssertionKind::StartText, })) ); assert_eq!( parser(r"\z").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..2), kind: ast::AssertionKind::EndText, })) ); assert_eq!( parser(r"\b").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..2), kind: ast::AssertionKind::WordBoundary, })) ); assert_eq!( parser(r"\b{start}").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..9), kind: ast::AssertionKind::WordBoundaryStart, })) ); assert_eq!( parser(r"\b{end}").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..7), kind: ast::AssertionKind::WordBoundaryEnd, })) ); assert_eq!( parser(r"\b{start-half}").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..14), kind: ast::AssertionKind::WordBoundaryStartHalf, })) ); assert_eq!( parser(r"\b{end-half}").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..12), kind: ast::AssertionKind::WordBoundaryEndHalf, })) ); assert_eq!( parser(r"\<").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..2), kind: ast::AssertionKind::WordBoundaryStartAngle, })) ); assert_eq!( parser(r"\>").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..2), kind: ast::AssertionKind::WordBoundaryEndAngle, })) ); assert_eq!( parser(r"\B").parse_primitive(), Ok(Primitive::Assertion(ast::Assertion { span: span(0..2), kind: ast::AssertionKind::NotWordBoundary, })) ); // We also support superfluous escapes in most cases now too.
// NOTE(review): the next line collapses several original source lines; its embedded
// `//` comments would swallow the trailing code if compiled in this single-line form.
// Content kept byte-identical — restore the original line breaks when un-mangling.
for c in ['!', '@', '%', '"', '\'', '/', ' '] { let pat = format!(r"\{}", c); assert_eq!( parser(&pat).parse_primitive(), Ok(Primitive::Literal(ast::Literal { span: span(0..2), kind: ast::LiteralKind::Superfluous, c, })) ); } // Some superfluous escapes, namely [0-9A-Za-z], are still banned. This // gives flexibility for future evolution. assert_eq!( parser(r"\e").parse_escape().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::EscapeUnrecognized, } ); assert_eq!( parser(r"\y").parse_escape().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::EscapeUnrecognized, } ); // Starting a special word boundary without any non-whitespace chars // after the brace makes it ambiguous whether the user meant to write // a counted repetition (probably not?) or an actual special word // boundary assertion. assert_eq!( parser(r"\b{").parse_escape().unwrap_err(), TestError { span: span(0..3), kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, } ); assert_eq!( parser_ignore_whitespace(r"\b{ ").parse_escape().unwrap_err(), TestError { span: span(0..4), kind: ast::ErrorKind::SpecialWordOrRepetitionUnexpectedEof, } ); // When 'x' is not enabled, the space is seen as a non-[-A-Za-z] char, // and thus causes the parser to treat it as a counted repetition. assert_eq!( parser(r"\b{ ").parse().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::RepetitionCountDecimalEmpty, } ); // In this case, we got some valid chars that makes it look like the // user is writing one of the special word boundary assertions, but // we forget to close the brace. assert_eq!( parser(r"\b{foo").parse_escape().unwrap_err(), TestError { span: span(2..6), kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, } ); // We get the same error as above, except it is provoked by seeing a // char that we know is invalid before seeing a closing brace.
// `parse_escape` tail (unclosed/unrecognized `\b{...}`, bare `\` EOF), then
// `parse_unsupported_backreference` (`\0`, `\9` rejected), then `parse_octal`: all
// values 0..511 via `parser_octal`, greedy 3-digit consumption (`\778` → '?' + '8',
// `\7777` → U+01FF + '7'), and `\8` rejected. `parse_hex_two` begins: every `\xNN`.
assert_eq!( parser(r"\b{foo!}").parse_escape().unwrap_err(), TestError { span: span(2..6), kind: ast::ErrorKind::SpecialWordBoundaryUnclosed, } ); // And this one occurs when, syntactically, everything looks okay, but // we don't use a valid spelling of a word boundary assertion. assert_eq!( parser(r"\b{foo}").parse_escape().unwrap_err(), TestError { span: span(3..6), kind: ast::ErrorKind::SpecialWordBoundaryUnrecognized, } ); // An unfinished escape is illegal. assert_eq!( parser(r"\").parse_escape().unwrap_err(), TestError { span: span(0..1), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); } #[test] fn parse_unsupported_backreference() { assert_eq!( parser(r"\0").parse_escape().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::UnsupportedBackreference, } ); assert_eq!( parser(r"\9").parse_escape().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::UnsupportedBackreference, } ); } #[test] fn parse_octal() { for i in 0..511 { let pat = format!(r"\{:o}", i); assert_eq!( parser_octal(&pat).parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::Octal, c: char::from_u32(i).unwrap(), })) ); } assert_eq!( parser_octal(r"\778").parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..3), kind: ast::LiteralKind::Octal, c: '?', })) ); assert_eq!( parser_octal(r"\7777").parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..4), kind: ast::LiteralKind::Octal, c: '\u{01FF}', })) ); assert_eq!( parser_octal(r"\778").parse(), Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ Ast::literal(ast::Literal { span: span(0..3), kind: ast::LiteralKind::Octal, c: '?', }), Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: '8', }), ], })) ); assert_eq!( parser_octal(r"\7777").parse(), Ok(Ast::concat(ast::Concat { span: span(0..5), asts: vec![ Ast::literal(ast::Literal { span: span(0..4), kind: ast::LiteralKind::Octal, c: '\u{01FF}', }),
// `\7777` concat tail and `\8` rejection; `parse_hex_two` exhaustively checks `\xNN`
// for 0..256 plus EOF/invalid-digit errors; `parse_hex_four` checks `\uNNNN` for every
// valid scalar in 0..65536 (surrogates skipped via `char::from_u32`) plus the
// invalid-digit error ladder — each error span pinpoints the offending digit.
Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: '7', }), ], })) ); assert_eq!( parser_octal(r"\8").parse_escape().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::EscapeUnrecognized, } ); } #[test] fn parse_hex_two() { for i in 0..256 { let pat = format!(r"\x{:02x}", i); assert_eq!( parser(&pat).parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X), c: char::from_u32(i).unwrap(), })) ); } assert_eq!( parser(r"\xF").parse_escape().unwrap_err(), TestError { span: span(3..3), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\xG").parse_escape().unwrap_err(), TestError { span: span(2..3), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\xFG").parse_escape().unwrap_err(), TestError { span: span(3..4), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); } #[test] fn parse_hex_four() { for i in 0..65536 { let c = match char::from_u32(i) { None => continue, Some(c) => c, }; let pat = format!(r"\u{:04x}", i); assert_eq!( parser(&pat).parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::HexFixed( ast::HexLiteralKind::UnicodeShort ), c, })) ); } assert_eq!( parser(r"\uF").parse_escape().unwrap_err(), TestError { span: span(3..3), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\uG").parse_escape().unwrap_err(), TestError { span: span(2..3), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\uFG").parse_escape().unwrap_err(), TestError { span: span(3..4), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\uFFG").parse_escape().unwrap_err(), TestError { span: span(4..5), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\uFFFG").parse_escape().unwrap_err(), TestError { span: span(5..6), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!(
// `parse_hex_four` tail (`\uD800` surrogate → EscapeHexInvalid), `parse_hex_eight`:
// `\UNNNNNNNN` for every valid scalar in 0..65536 and an invalid-digit ladder covering
// each of the eight digit positions; then `parse_hex_brace` begins (`\u{..}`, `\U{..}`,
// `\x{..}` brace forms, mixed case, up to U+10FFFF).
parser(r"\uD800").parse_escape().unwrap_err(), TestError { span: span(2..6), kind: ast::ErrorKind::EscapeHexInvalid, } ); } #[test] fn parse_hex_eight() { for i in 0..65536 { let c = match char::from_u32(i) { None => continue, Some(c) => c, }; let pat = format!(r"\U{:08x}", i); assert_eq!( parser(&pat).parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..pat.len()), kind: ast::LiteralKind::HexFixed( ast::HexLiteralKind::UnicodeLong ), c, })) ); } assert_eq!( parser(r"\UF").parse_escape().unwrap_err(), TestError { span: span(3..3), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\UG").parse_escape().unwrap_err(), TestError { span: span(2..3), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\UFG").parse_escape().unwrap_err(), TestError { span: span(3..4), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\UFFG").parse_escape().unwrap_err(), TestError { span: span(4..5), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\UFFFG").parse_escape().unwrap_err(), TestError { span: span(5..6), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\UFFFFG").parse_escape().unwrap_err(), TestError { span: span(6..7), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\UFFFFFG").parse_escape().unwrap_err(), TestError { span: span(7..8), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\UFFFFFFG").parse_escape().unwrap_err(), TestError { span: span(8..9), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\UFFFFFFFG").parse_escape().unwrap_err(), TestError { span: span(9..10), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); } #[test] fn parse_hex_brace() { assert_eq!( parser(r"\u{26c4}").parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..8), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::UnicodeShort ), c: '⛄', })) ); assert_eq!( parser(r"\U{26c4}").parse_escape(),
// Brace-form hex continued, then its error cases: bare `\x`, unclosed braces, empty
// `\x{}`, invalid digit, out-of-range and surrogate values; finally `parse_decimal`
// begins (plain numbers, leading zeros).
Ok(Primitive::Literal(ast::Literal { span: span(0..8), kind: ast::LiteralKind::HexBrace( ast::HexLiteralKind::UnicodeLong ), c: '⛄', })) ); assert_eq!( parser(r"\x{26c4}").parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..8), kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), c: '⛄', })) ); assert_eq!( parser(r"\x{26C4}").parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..8), kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), c: '⛄', })) ); assert_eq!( parser(r"\x{10fFfF}").parse_escape(), Ok(Primitive::Literal(ast::Literal { span: span(0..10), kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X), c: '\u{10FFFF}', })) ); assert_eq!( parser(r"\x").parse_escape().unwrap_err(), TestError { span: span(2..2), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\x{").parse_escape().unwrap_err(), TestError { span: span(2..3), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\x{FF").parse_escape().unwrap_err(), TestError { span: span(2..5), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\x{}").parse_escape().unwrap_err(), TestError { span: span(2..4), kind: ast::ErrorKind::EscapeHexEmpty, } ); assert_eq!( parser(r"\x{FGF}").parse_escape().unwrap_err(), TestError { span: span(4..5), kind: ast::ErrorKind::EscapeHexInvalidDigit, } ); assert_eq!( parser(r"\x{FFFFFF}").parse_escape().unwrap_err(), TestError { span: span(3..9), kind: ast::ErrorKind::EscapeHexInvalid, } ); assert_eq!( parser(r"\x{D800}").parse_escape().unwrap_err(), TestError { span: span(3..7), kind: ast::ErrorKind::EscapeHexInvalid, } ); assert_eq!( parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(), TestError { span: span(3..12), kind: ast::ErrorKind::EscapeHexInvalid, } ); } #[test] fn parse_decimal() { assert_eq!(parser("123").parse_decimal(), Ok(123)); assert_eq!(parser("0").parse_decimal(), Ok(0)); assert_eq!(parser("01").parse_decimal(), Ok(1)); assert_eq!(
// `parse_decimal` error cases (non-digit, empty, overflow), then `parse_set_class`
// begins: local constructor helpers (union/intersection/difference/symdifference over
// `ast::ClassSet`, item wrappers, `lit`, `empty`) used to spell out expected class ASTs.
parser("-1").parse_decimal().unwrap_err(), TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } ); assert_eq!( parser("").parse_decimal().unwrap_err(), TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty } ); assert_eq!( parser("9999999999").parse_decimal().unwrap_err(), TestError { span: span(0..10), kind: ast::ErrorKind::DecimalInvalid, } ); } #[test] fn parse_set_class() { fn union(span: Span, items: Vec) -> ast::ClassSet { ast::ClassSet::union(ast::ClassSetUnion { span, items }) } fn intersection( span: Span, lhs: ast::ClassSet, rhs: ast::ClassSet, ) -> ast::ClassSet { ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { span, kind: ast::ClassSetBinaryOpKind::Intersection, lhs: Box::new(lhs), rhs: Box::new(rhs), }) } fn difference( span: Span, lhs: ast::ClassSet, rhs: ast::ClassSet, ) -> ast::ClassSet { ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { span, kind: ast::ClassSetBinaryOpKind::Difference, lhs: Box::new(lhs), rhs: Box::new(rhs), }) } fn symdifference( span: Span, lhs: ast::ClassSet, rhs: ast::ClassSet, ) -> ast::ClassSet { ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp { span, kind: ast::ClassSetBinaryOpKind::SymmetricDifference, lhs: Box::new(lhs), rhs: Box::new(rhs), }) } fn itemset(item: ast::ClassSetItem) -> ast::ClassSet { ast::ClassSet::Item(item) } fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem { ast::ClassSetItem::Ascii(cls) } fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem { ast::ClassSetItem::Unicode(cls) } fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem { ast::ClassSetItem::Perl(cls) } fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem { ast::ClassSetItem::Bracketed(Box::new(cls)) } fn lit(span: Span, c: char) -> ast::ClassSetItem { ast::ClassSetItem::Literal(ast::Literal { span, kind: ast::LiteralKind::Verbatim, c, }) } fn empty(span: Span) -> ast::ClassSetItem { ast::ClassSetItem::Empty(span) } fn range(span: Span, start: char, end: char) -> ast::ClassSetItem { let pos1 =
// NOTE(review): `union(... items: Vec)` above has lost its generic/element type to the
// same markup stripping seen elsewhere (upstream takes `Vec<ast::ClassSetItem>`) —
// confirm against upstream before compiling.
// `range` helper derives endpoint literal spans from char UTF-8 widths; then ASCII-class
// cases: `[[:alnum:]]`, nesting, and `&&`/`--`/`~~` set operators between named classes.
Position { offset: span.start.offset + start.len_utf8(), column: span.start.column + 1, ..span.start }; let pos2 = Position { offset: span.end.offset - end.len_utf8(), column: span.end.column - 1, ..span.end }; ast::ClassSetItem::Range(ast::ClassSetRange { span, start: ast::Literal { span: Span { end: pos1, ..span }, kind: ast::LiteralKind::Verbatim, c: start, }, end: ast::Literal { span: Span { start: pos2, ..span }, kind: ast::LiteralKind::Verbatim, c: end, }, }) } fn alnum(span: Span, negated: bool) -> ast::ClassAscii { ast::ClassAscii { span, kind: ast::ClassAsciiKind::Alnum, negated } } fn lower(span: Span, negated: bool) -> ast::ClassAscii { ast::ClassAscii { span, kind: ast::ClassAsciiKind::Lower, negated } } assert_eq!( parser("[[:alnum:]]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..11), negated: false, kind: itemset(item_ascii(alnum(span(1..10), false))), })) ); assert_eq!( parser("[[[:alnum:]]]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..13), negated: false, kind: itemset(item_bracket(ast::ClassBracketed { span: span(1..12), negated: false, kind: itemset(item_ascii(alnum(span(2..11), false))), })), })) ); assert_eq!( parser("[[:alnum:]&&[:lower:]]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: intersection( span(1..21), itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), })) ); assert_eq!( parser("[[:alnum:]--[:lower:]]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: difference( span(1..21), itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21), false))), ), })) ); assert_eq!( parser("[[:alnum:]~~[:lower:]]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..22), negated: false, kind: symdifference( span(1..21), itemset(item_ascii(alnum(span(1..10), false))), itemset(item_ascii(lower(span(12..21),
// `parse_set_class` positive cases: escaped `]` and `-` inside classes, literal `-` at
// either edge, `\pL` and `\w` items, ranges (`a-z`, multiple ranges), set operators
// mixed with ranges and perl classes, and left-associative chained `--` / `~~`.
false))), ), })) ); assert_eq!( parser("[a]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), 'a')), })) ); assert_eq!( parser(r"[a\]]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( span(1..4), vec![ lit(span(1..2), 'a'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), kind: ast::LiteralKind::Meta, c: ']', }), ] ), })) ); assert_eq!( parser(r"[a\-z]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union( span(1..5), vec![ lit(span(1..2), 'a'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), kind: ast::LiteralKind::Meta, c: '-', }), lit(span(4..5), 'z'), ] ), })) ); assert_eq!( parser("[ab]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),] ), })) ); assert_eq!( parser("[a-]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),] ), })) ); assert_eq!( parser("[-a]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: union( span(1..3), vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),] ), })) ); assert_eq!( parser(r"[\pL]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(item_unicode(ast::ClassUnicode { span: span(1..4), negated: false, kind: ast::ClassUnicodeKind::OneLetter('L'), })), })) ); assert_eq!( parser(r"[\w]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(item_perl(ast::ClassPerl { span: span(1..3), kind: ast::ClassPerlKind::Word, negated: false, })), })) ); assert_eq!( parser(r"[a\wz]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: union(
span(1..5), vec![ lit(span(1..2), 'a'), item_perl(ast::ClassPerl { span: span(2..4), kind: ast::ClassPerlKind::Word, negated: false, }), lit(span(4..5), 'z'), ] ), })) ); assert_eq!( parser("[a-z]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: itemset(range(span(1..4), 'a', 'z')), })) ); assert_eq!( parser("[a-cx-z]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..8), negated: false, kind: union( span(1..7), vec![ range(span(1..4), 'a', 'c'), range(span(4..7), 'x', 'z'), ] ), })) ); assert_eq!( parser(r"[\w&&a-cx-z]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( span(1..11), itemset(item_perl(ast::ClassPerl { span: span(1..3), kind: ast::ClassPerlKind::Word, negated: false, })), union( span(5..11), vec![ range(span(5..8), 'a', 'c'), range(span(8..11), 'x', 'z'), ] ), ), })) ); assert_eq!( parser(r"[a-cx-z&&\w]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..12), negated: false, kind: intersection( span(1..11), union( span(1..7), vec![ range(span(1..4), 'a', 'c'), range(span(4..7), 'x', 'z'), ] ), itemset(item_perl(ast::ClassPerl { span: span(9..11), kind: ast::ClassPerlKind::Word, negated: false, })), ), })) ); assert_eq!( parser(r"[a--b--c]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: difference( span(1..8), difference( span(1..5), itemset(lit(span(1..2), 'a')), itemset(lit(span(4..5), 'b')), ), itemset(lit(span(7..8), 'c')), ), })) ); assert_eq!( parser(r"[a~~b~~c]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..9), negated: false, kind: symdifference( span(1..8), symdifference( span(1..5), itemset(lit(span(1..2), 'a')), itemset(lit(span(4..5), 'b')), ), itemset(lit(span(7..8), 'c')), ), })) ); assert_eq!( parser(r"[\^&&^]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection(
// Edge cases: escaped `^`/`&` operands around `&&`, empty operands (`[&&&&]`), a
// multi-byte range `[☃-⛄]` with byte-accurate spans, `]` first in a class, and `[`
// escaped inside a class.
span(1..6), itemset(ast::ClassSetItem::Literal(ast::Literal { span: span(1..3), kind: ast::LiteralKind::Meta, c: '^', })), itemset(lit(span(5..6), '^')), ), })) ); assert_eq!( parser(r"[\&&&&]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..7), negated: false, kind: intersection( span(1..6), itemset(ast::ClassSetItem::Literal(ast::Literal { span: span(1..3), kind: ast::LiteralKind::Meta, c: '&', })), itemset(lit(span(5..6), '&')), ), })) ); assert_eq!( parser(r"[&&&&]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..6), negated: false, kind: intersection( span(1..5), intersection( span(1..3), itemset(empty(span(1..1))), itemset(empty(span(3..3))), ), itemset(empty(span(5..5))), ), })) ); let pat = "[☃-⛄]"; assert_eq!( parser(pat).parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span_range(pat, 0..9), negated: false, kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange { span: span_range(pat, 1..8), start: ast::Literal { span: span_range(pat, 1..4), kind: ast::LiteralKind::Verbatim, c: '☃', }, end: ast::Literal { span: span_range(pat, 5..8), kind: ast::LiteralKind::Verbatim, c: '⛄', }, })), })) ); assert_eq!( parser(r"[]]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..3), negated: false, kind: itemset(lit(span(1..2), ']')), })) ); assert_eq!( parser(r"[]\[]").parse(), Ok(Ast::class_bracketed(ast::ClassBracketed { span: span(0..5), negated: false, kind: union( span(1..4), vec![ lit(span(1..2), ']'), ast::ClassSetItem::Literal(ast::Literal { span: span(2..4), kind: ast::LiteralKind::Meta, c: '[', }), ] ), })) ); assert_eq!( parser(r"[\[]]").parse(), Ok(concat( 0..5, vec![ Ast::class_bracketed(ast::ClassBracketed { span: span(0..4), negated: false, kind: itemset(ast::ClassSetItem::Literal( ast::Literal { span: span(1..3), kind: ast::LiteralKind::Meta, c: '[', } )), }), Ast::literal(ast::Literal { span: span(4..5), kind: ast::LiteralKind::Verbatim, c: ']', }), ] )) ); assert_eq!(
// `parse_set_class` error cases — unclosed classes (including with ignore-whitespace
// mode), `\b` invalid in a class, non-literal range endpoints (`\w-a`, `a-\w`),
// reversed range `z-a` — then `parse_set_class_open` begins, asserting the partial
// `(ClassBracketed, ClassSetUnion)` state right after `[`/`[^`/`[-` prefixes.
// (The function continues past this excerpt.)
parser("[").parse().unwrap_err(), TestError { span: span(0..1), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser("[[").parse().unwrap_err(), TestError { span: span(1..2), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser("[[-]").parse().unwrap_err(), TestError { span: span(0..1), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser("[[[:alnum:]").parse().unwrap_err(), TestError { span: span(1..2), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser(r"[\b]").parse().unwrap_err(), TestError { span: span(1..3), kind: ast::ErrorKind::ClassEscapeInvalid, } ); assert_eq!( parser(r"[\w-a]").parse().unwrap_err(), TestError { span: span(1..3), kind: ast::ErrorKind::ClassRangeLiteral, } ); assert_eq!( parser(r"[a-\w]").parse().unwrap_err(), TestError { span: span(3..5), kind: ast::ErrorKind::ClassRangeLiteral, } ); assert_eq!( parser(r"[z-a]").parse().unwrap_err(), TestError { span: span(1..4), kind: ast::ErrorKind::ClassRangeInvalid, } ); assert_eq!( parser_ignore_whitespace("[a ").parse().unwrap_err(), TestError { span: span(0..1), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser_ignore_whitespace("[a- ").parse().unwrap_err(), TestError { span: span(0..1), kind: ast::ErrorKind::ClassUnclosed, } ); } #[test] fn parse_set_class_open() { assert_eq!(parser("[a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..1), negated: false, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(1..1), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(1..1), items: vec![] }; Ok((set, union)) }); assert_eq!( parser_ignore_whitespace("[ a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..4), negated: false, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(4..4), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(4..4), items: vec![] }; Ok((set, union)) } ); assert_eq!(parser("[^a]").parse_set_class_open(), { let set = ast::ClassBracketed { span:
span(0..2), negated: true, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(2..2), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(2..2), items: vec![] }; Ok((set, union)) }); assert_eq!( parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..4), negated: true, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(4..4), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(4..4), items: vec![] }; Ok((set, union)) } ); assert_eq!(parser("[-a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..2), negated: false, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(1..1), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(1..2), items: vec![ast::ClassSetItem::Literal(ast::Literal { span: span(1..2), kind: ast::LiteralKind::Verbatim, c: '-', })], }; Ok((set, union)) }); assert_eq!( parser_ignore_whitespace("[ - a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..4), negated: false, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(2..2), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(2..3), items: vec![ast::ClassSetItem::Literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: '-', })], }; Ok((set, union)) } ); assert_eq!(parser("[^-a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..3), negated: true, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(2..2), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(2..3), items: vec![ast::ClassSetItem::Literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: '-', })], }; Ok((set, union)) }); assert_eq!(parser("[--a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..3), negated: false, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(1..1), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(1..3), 
items: vec![ ast::ClassSetItem::Literal(ast::Literal { span: span(1..2), kind: ast::LiteralKind::Verbatim, c: '-', }), ast::ClassSetItem::Literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: '-', }), ], }; Ok((set, union)) }); assert_eq!(parser("[]a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..2), negated: false, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(1..1), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(1..2), items: vec![ast::ClassSetItem::Literal(ast::Literal { span: span(1..2), kind: ast::LiteralKind::Verbatim, c: ']', })], }; Ok((set, union)) }); assert_eq!( parser_ignore_whitespace("[ ] a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..4), negated: false, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(2..2), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(2..3), items: vec![ast::ClassSetItem::Literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: ']', })], }; Ok((set, union)) } ); assert_eq!(parser("[^]a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..3), negated: true, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(2..2), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(2..3), items: vec![ast::ClassSetItem::Literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: ']', })], }; Ok((set, union)) }); assert_eq!(parser("[-]a]").parse_set_class_open(), { let set = ast::ClassBracketed { span: span(0..2), negated: false, kind: ast::ClassSet::union(ast::ClassSetUnion { span: span(1..1), items: vec![], }), }; let union = ast::ClassSetUnion { span: span(1..2), items: vec![ast::ClassSetItem::Literal(ast::Literal { span: span(1..2), kind: ast::LiteralKind::Verbatim, c: '-', })], }; Ok((set, union)) }); assert_eq!( parser("[").parse_set_class_open().unwrap_err(), TestError { span: span(0..1), kind: 
ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser_ignore_whitespace("[ ") .parse_set_class_open() .unwrap_err(), TestError { span: span(0..5), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser("[^").parse_set_class_open().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser("[]").parse_set_class_open().unwrap_err(), TestError { span: span(0..2), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser("[-").parse_set_class_open().unwrap_err(), TestError { span: span(0..0), kind: ast::ErrorKind::ClassUnclosed, } ); assert_eq!( parser("[--").parse_set_class_open().unwrap_err(), TestError { span: span(0..0), kind: ast::ErrorKind::ClassUnclosed, } ); // See: https://github.com/rust-lang/regex/issues/792 assert_eq!( parser("(?x)[-#]").parse_with_comments().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::ClassUnclosed, } ); } #[test] fn maybe_parse_ascii_class() { assert_eq!( parser(r"[:alnum:]").maybe_parse_ascii_class(), Some(ast::ClassAscii { span: span(0..9), kind: ast::ClassAsciiKind::Alnum, negated: false, }) ); assert_eq!( parser(r"[:alnum:]A").maybe_parse_ascii_class(), Some(ast::ClassAscii { span: span(0..9), kind: ast::ClassAsciiKind::Alnum, negated: false, }) ); assert_eq!( parser(r"[:^alnum:]").maybe_parse_ascii_class(), Some(ast::ClassAscii { span: span(0..10), kind: ast::ClassAsciiKind::Alnum, negated: true, }) ); let p = parser(r"[:"); assert_eq!(p.maybe_parse_ascii_class(), None); assert_eq!(p.offset(), 0); let p = parser(r"[:^"); assert_eq!(p.maybe_parse_ascii_class(), None); assert_eq!(p.offset(), 0); let p = parser(r"[^:alnum:]"); assert_eq!(p.maybe_parse_ascii_class(), None); assert_eq!(p.offset(), 0); let p = parser(r"[:alnnum:]"); assert_eq!(p.maybe_parse_ascii_class(), None); assert_eq!(p.offset(), 0); let p = parser(r"[:alnum]"); assert_eq!(p.maybe_parse_ascii_class(), None); assert_eq!(p.offset(), 0); let p = parser(r"[:alnum:"); 
assert_eq!(p.maybe_parse_ascii_class(), None); assert_eq!(p.offset(), 0); } #[test] fn parse_unicode_class() { assert_eq!( parser(r"\pN").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), })) ); assert_eq!( parser(r"\PN").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..3), negated: true, kind: ast::ClassUnicodeKind::OneLetter('N'), })) ); assert_eq!( parser(r"\p{N}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..5), negated: false, kind: ast::ClassUnicodeKind::Named(s("N")), })) ); assert_eq!( parser(r"\P{N}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..5), negated: true, kind: ast::ClassUnicodeKind::Named(s("N")), })) ); assert_eq!( parser(r"\p{Greek}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), })) ); assert_eq!( parser(r"\p{scx:Katakana}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..16), negated: false, kind: ast::ClassUnicodeKind::NamedValue { op: ast::ClassUnicodeOpKind::Colon, name: s("scx"), value: s("Katakana"), }, })) ); assert_eq!( parser(r"\p{scx=Katakana}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..16), negated: false, kind: ast::ClassUnicodeKind::NamedValue { op: ast::ClassUnicodeOpKind::Equal, name: s("scx"), value: s("Katakana"), }, })) ); assert_eq!( parser(r"\p{scx!=Katakana}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..17), negated: false, kind: ast::ClassUnicodeKind::NamedValue { op: ast::ClassUnicodeOpKind::NotEqual, name: s("scx"), value: s("Katakana"), }, })) ); assert_eq!( parser(r"\p{:}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..5), negated: false, kind: ast::ClassUnicodeKind::NamedValue { op: ast::ClassUnicodeOpKind::Colon, name: s(""), value: s(""), }, })) ); 
assert_eq!( parser(r"\p{=}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..5), negated: false, kind: ast::ClassUnicodeKind::NamedValue { op: ast::ClassUnicodeOpKind::Equal, name: s(""), value: s(""), }, })) ); assert_eq!( parser(r"\p{!=}").parse_escape(), Ok(Primitive::Unicode(ast::ClassUnicode { span: span(0..6), negated: false, kind: ast::ClassUnicodeKind::NamedValue { op: ast::ClassUnicodeOpKind::NotEqual, name: s(""), value: s(""), }, })) ); assert_eq!( parser(r"\p").parse_escape().unwrap_err(), TestError { span: span(2..2), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\p{").parse_escape().unwrap_err(), TestError { span: span(3..3), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\p{N").parse_escape().unwrap_err(), TestError { span: span(4..4), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\p{Greek").parse_escape().unwrap_err(), TestError { span: span(8..8), kind: ast::ErrorKind::EscapeUnexpectedEof, } ); assert_eq!( parser(r"\pNz").parse(), Ok(Ast::concat(ast::Concat { span: span(0..4), asts: vec![ Ast::class_unicode(ast::ClassUnicode { span: span(0..3), negated: false, kind: ast::ClassUnicodeKind::OneLetter('N'), }), Ast::literal(ast::Literal { span: span(3..4), kind: ast::LiteralKind::Verbatim, c: 'z', }), ], })) ); assert_eq!( parser(r"\p{Greek}z").parse(), Ok(Ast::concat(ast::Concat { span: span(0..10), asts: vec![ Ast::class_unicode(ast::ClassUnicode { span: span(0..9), negated: false, kind: ast::ClassUnicodeKind::Named(s("Greek")), }), Ast::literal(ast::Literal { span: span(9..10), kind: ast::LiteralKind::Verbatim, c: 'z', }), ], })) ); assert_eq!( parser(r"\p\{").parse().unwrap_err(), TestError { span: span(2..3), kind: ast::ErrorKind::UnicodeClassInvalid, } ); assert_eq!( parser(r"\P\{").parse().unwrap_err(), TestError { span: span(2..3), kind: ast::ErrorKind::UnicodeClassInvalid, } ); } #[test] fn parse_perl_class() { assert_eq!( parser(r"\d").parse_escape(), 
Ok(Primitive::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, })) ); assert_eq!( parser(r"\D").parse_escape(), Ok(Primitive::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: true, })) ); assert_eq!( parser(r"\s").parse_escape(), Ok(Primitive::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Space, negated: false, })) ); assert_eq!( parser(r"\S").parse_escape(), Ok(Primitive::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Space, negated: true, })) ); assert_eq!( parser(r"\w").parse_escape(), Ok(Primitive::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Word, negated: false, })) ); assert_eq!( parser(r"\W").parse_escape(), Ok(Primitive::Perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Word, negated: true, })) ); assert_eq!( parser(r"\d").parse(), Ok(Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, })) ); assert_eq!( parser(r"\dz").parse(), Ok(Ast::concat(ast::Concat { span: span(0..3), asts: vec![ Ast::class_perl(ast::ClassPerl { span: span(0..2), kind: ast::ClassPerlKind::Digit, negated: false, }), Ast::literal(ast::Literal { span: span(2..3), kind: ast::LiteralKind::Verbatim, c: 'z', }), ], })) ); } // This tests a bug fix where the nest limit checker wasn't decrementing // its depth during post-traversal, which causes long regexes to trip // the default limit too aggressively. 
#[test] fn regression_454_nest_too_big() { let pattern = r#" 2(?: [45]\d{3}| 7(?: 1[0-267]| 2[0-289]| 3[0-29]| 4[01]| 5[1-3]| 6[013]| 7[0178]| 91 )| 8(?: 0[125]| [139][1-6]| 2[0157-9]| 41| 6[1-35]| 7[1-5]| 8[1-8]| 90 )| 9(?: 0[0-2]| 1[0-4]| 2[568]| 3[3-6]| 5[5-7]| 6[0167]| 7[15]| 8[0146-9] ) )\d{4} "#; assert!(parser_nest_limit(pattern, 50).parse().is_ok()); } // This tests that we treat a trailing `-` in a character class as a // literal `-` even when whitespace mode is enabled and there is whitespace // after the trailing `-`. #[test] fn regression_455_trailing_dash_ignore_whitespace() { assert!(parser("(?x)[ / - ]").parse().is_ok()); assert!(parser("(?x)[ a - ]").parse().is_ok()); assert!(parser( "(?x)[ a - ] " ) .parse() .is_ok()); assert!(parser( "(?x)[ a # wat - ] " ) .parse() .is_ok()); assert!(parser("(?x)[ / -").parse().is_err()); assert!(parser("(?x)[ / - ").parse().is_err()); assert!(parser( "(?x)[ / - " ) .parse() .is_err()); assert!(parser( "(?x)[ / - # wat " ) .parse() .is_err()); } } regex-syntax-0.8.2/src/ast/print.rs000064400000000000000000000440041046102023000153530ustar 00000000000000/*! This module provides a regular expression printer for `Ast`. */ use core::fmt; use crate::ast::{ self, visitor::{self, Visitor}, Ast, }; /// A builder for constructing a printer. /// /// Note that since a printer doesn't have any configuration knobs, this type /// remains unexported. #[derive(Clone, Debug)] struct PrinterBuilder { _priv: (), } impl Default for PrinterBuilder { fn default() -> PrinterBuilder { PrinterBuilder::new() } } impl PrinterBuilder { fn new() -> PrinterBuilder { PrinterBuilder { _priv: () } } fn build(&self) -> Printer { Printer { _priv: () } } } /// A printer for a regular expression abstract syntax tree. /// /// A printer converts an abstract syntax tree (AST) to a regular expression /// pattern string. This particular printer uses constant stack space and heap /// space proportional to the size of the AST. 
/// /// This printer will not necessarily preserve the original formatting of the /// regular expression pattern string. For example, all whitespace and comments /// are ignored. #[derive(Debug)] pub struct Printer { _priv: (), } impl Printer { /// Create a new printer. pub fn new() -> Printer { PrinterBuilder::new().build() } /// Print the given `Ast` to the given writer. The writer must implement /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used /// here are a `fmt::Formatter` (which is available in `fmt::Display` /// implementations) or a `&mut String`. pub fn print(&mut self, ast: &Ast, wtr: W) -> fmt::Result { visitor::visit(ast, Writer { wtr }) } } #[derive(Debug)] struct Writer { wtr: W, } impl Visitor for Writer { type Output = (); type Err = fmt::Error; fn finish(self) -> fmt::Result { Ok(()) } fn visit_pre(&mut self, ast: &Ast) -> fmt::Result { match *ast { Ast::Group(ref x) => self.fmt_group_pre(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_pre(x), _ => Ok(()), } } fn visit_post(&mut self, ast: &Ast) -> fmt::Result { match *ast { Ast::Empty(_) => Ok(()), Ast::Flags(ref x) => self.fmt_set_flags(x), Ast::Literal(ref x) => self.fmt_literal(x), Ast::Dot(_) => self.wtr.write_str("."), Ast::Assertion(ref x) => self.fmt_assertion(x), Ast::ClassPerl(ref x) => self.fmt_class_perl(x), Ast::ClassUnicode(ref x) => self.fmt_class_unicode(x), Ast::ClassBracketed(ref x) => self.fmt_class_bracketed_post(x), Ast::Repetition(ref x) => self.fmt_repetition(x), Ast::Group(ref x) => self.fmt_group_post(x), Ast::Alternation(_) => Ok(()), Ast::Concat(_) => Ok(()), } } fn visit_alternation_in(&mut self) -> fmt::Result { self.wtr.write_str("|") } fn visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<(), Self::Err> { match *ast { ast::ClassSetItem::Bracketed(ref x) => { self.fmt_class_bracketed_pre(x) } _ => Ok(()), } } fn visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<(), Self::Err> { use 
crate::ast::ClassSetItem::*; match *ast { Empty(_) => Ok(()), Literal(ref x) => self.fmt_literal(x), Range(ref x) => { self.fmt_literal(&x.start)?; self.wtr.write_str("-")?; self.fmt_literal(&x.end)?; Ok(()) } Ascii(ref x) => self.fmt_class_ascii(x), Unicode(ref x) => self.fmt_class_unicode(x), Perl(ref x) => self.fmt_class_perl(x), Bracketed(ref x) => self.fmt_class_bracketed_post(x), Union(_) => Ok(()), } } fn visit_class_set_binary_op_in( &mut self, ast: &ast::ClassSetBinaryOp, ) -> Result<(), Self::Err> { self.fmt_class_set_binary_op_kind(&ast.kind) } } impl Writer { fn fmt_group_pre(&mut self, ast: &ast::Group) -> fmt::Result { use crate::ast::GroupKind::*; match ast.kind { CaptureIndex(_) => self.wtr.write_str("("), CaptureName { ref name, starts_with_p } => { let start = if starts_with_p { "(?P<" } else { "(?<" }; self.wtr.write_str(start)?; self.wtr.write_str(&name.name)?; self.wtr.write_str(">")?; Ok(()) } NonCapturing(ref flags) => { self.wtr.write_str("(?")?; self.fmt_flags(flags)?; self.wtr.write_str(":")?; Ok(()) } } } fn fmt_group_post(&mut self, _ast: &ast::Group) -> fmt::Result { self.wtr.write_str(")") } fn fmt_repetition(&mut self, ast: &ast::Repetition) -> fmt::Result { use crate::ast::RepetitionKind::*; match ast.op.kind { ZeroOrOne if ast.greedy => self.wtr.write_str("?"), ZeroOrOne => self.wtr.write_str("??"), ZeroOrMore if ast.greedy => self.wtr.write_str("*"), ZeroOrMore => self.wtr.write_str("*?"), OneOrMore if ast.greedy => self.wtr.write_str("+"), OneOrMore => self.wtr.write_str("+?"), Range(ref x) => { self.fmt_repetition_range(x)?; if !ast.greedy { self.wtr.write_str("?")?; } Ok(()) } } } fn fmt_repetition_range( &mut self, ast: &ast::RepetitionRange, ) -> fmt::Result { use crate::ast::RepetitionRange::*; match *ast { Exactly(x) => write!(self.wtr, "{{{}}}", x), AtLeast(x) => write!(self.wtr, "{{{},}}", x), Bounded(x, y) => write!(self.wtr, "{{{},{}}}", x, y), } } fn fmt_literal(&mut self, ast: &ast::Literal) -> fmt::Result { use 
crate::ast::LiteralKind::*; match ast.kind { Verbatim => self.wtr.write_char(ast.c), Meta | Superfluous => write!(self.wtr, r"\{}", ast.c), Octal => write!(self.wtr, r"\{:o}", u32::from(ast.c)), HexFixed(ast::HexLiteralKind::X) => { write!(self.wtr, r"\x{:02X}", u32::from(ast.c)) } HexFixed(ast::HexLiteralKind::UnicodeShort) => { write!(self.wtr, r"\u{:04X}", u32::from(ast.c)) } HexFixed(ast::HexLiteralKind::UnicodeLong) => { write!(self.wtr, r"\U{:08X}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::X) => { write!(self.wtr, r"\x{{{:X}}}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::UnicodeShort) => { write!(self.wtr, r"\u{{{:X}}}", u32::from(ast.c)) } HexBrace(ast::HexLiteralKind::UnicodeLong) => { write!(self.wtr, r"\U{{{:X}}}", u32::from(ast.c)) } Special(ast::SpecialLiteralKind::Bell) => { self.wtr.write_str(r"\a") } Special(ast::SpecialLiteralKind::FormFeed) => { self.wtr.write_str(r"\f") } Special(ast::SpecialLiteralKind::Tab) => self.wtr.write_str(r"\t"), Special(ast::SpecialLiteralKind::LineFeed) => { self.wtr.write_str(r"\n") } Special(ast::SpecialLiteralKind::CarriageReturn) => { self.wtr.write_str(r"\r") } Special(ast::SpecialLiteralKind::VerticalTab) => { self.wtr.write_str(r"\v") } Special(ast::SpecialLiteralKind::Space) => { self.wtr.write_str(r"\ ") } } } fn fmt_assertion(&mut self, ast: &ast::Assertion) -> fmt::Result { use crate::ast::AssertionKind::*; match ast.kind { StartLine => self.wtr.write_str("^"), EndLine => self.wtr.write_str("$"), StartText => self.wtr.write_str(r"\A"), EndText => self.wtr.write_str(r"\z"), WordBoundary => self.wtr.write_str(r"\b"), NotWordBoundary => self.wtr.write_str(r"\B"), WordBoundaryStart => self.wtr.write_str(r"\b{start}"), WordBoundaryEnd => self.wtr.write_str(r"\b{end}"), WordBoundaryStartAngle => self.wtr.write_str(r"\<"), WordBoundaryEndAngle => self.wtr.write_str(r"\>"), WordBoundaryStartHalf => self.wtr.write_str(r"\b{start-half}"), WordBoundaryEndHalf => self.wtr.write_str(r"\b{end-half}"), } } 
fn fmt_set_flags(&mut self, ast: &ast::SetFlags) -> fmt::Result { self.wtr.write_str("(?")?; self.fmt_flags(&ast.flags)?; self.wtr.write_str(")")?; Ok(()) } fn fmt_flags(&mut self, ast: &ast::Flags) -> fmt::Result { use crate::ast::{Flag, FlagsItemKind}; for item in &ast.items { match item.kind { FlagsItemKind::Negation => self.wtr.write_str("-"), FlagsItemKind::Flag(ref flag) => match *flag { Flag::CaseInsensitive => self.wtr.write_str("i"), Flag::MultiLine => self.wtr.write_str("m"), Flag::DotMatchesNewLine => self.wtr.write_str("s"), Flag::SwapGreed => self.wtr.write_str("U"), Flag::Unicode => self.wtr.write_str("u"), Flag::CRLF => self.wtr.write_str("R"), Flag::IgnoreWhitespace => self.wtr.write_str("x"), }, }?; } Ok(()) } fn fmt_class_bracketed_pre( &mut self, ast: &ast::ClassBracketed, ) -> fmt::Result { if ast.negated { self.wtr.write_str("[^") } else { self.wtr.write_str("[") } } fn fmt_class_bracketed_post( &mut self, _ast: &ast::ClassBracketed, ) -> fmt::Result { self.wtr.write_str("]") } fn fmt_class_set_binary_op_kind( &mut self, ast: &ast::ClassSetBinaryOpKind, ) -> fmt::Result { use crate::ast::ClassSetBinaryOpKind::*; match *ast { Intersection => self.wtr.write_str("&&"), Difference => self.wtr.write_str("--"), SymmetricDifference => self.wtr.write_str("~~"), } } fn fmt_class_perl(&mut self, ast: &ast::ClassPerl) -> fmt::Result { use crate::ast::ClassPerlKind::*; match ast.kind { Digit if ast.negated => self.wtr.write_str(r"\D"), Digit => self.wtr.write_str(r"\d"), Space if ast.negated => self.wtr.write_str(r"\S"), Space => self.wtr.write_str(r"\s"), Word if ast.negated => self.wtr.write_str(r"\W"), Word => self.wtr.write_str(r"\w"), } } fn fmt_class_ascii(&mut self, ast: &ast::ClassAscii) -> fmt::Result { use crate::ast::ClassAsciiKind::*; match ast.kind { Alnum if ast.negated => self.wtr.write_str("[:^alnum:]"), Alnum => self.wtr.write_str("[:alnum:]"), Alpha if ast.negated => self.wtr.write_str("[:^alpha:]"), Alpha => 
self.wtr.write_str("[:alpha:]"), Ascii if ast.negated => self.wtr.write_str("[:^ascii:]"), Ascii => self.wtr.write_str("[:ascii:]"), Blank if ast.negated => self.wtr.write_str("[:^blank:]"), Blank => self.wtr.write_str("[:blank:]"), Cntrl if ast.negated => self.wtr.write_str("[:^cntrl:]"), Cntrl => self.wtr.write_str("[:cntrl:]"), Digit if ast.negated => self.wtr.write_str("[:^digit:]"), Digit => self.wtr.write_str("[:digit:]"), Graph if ast.negated => self.wtr.write_str("[:^graph:]"), Graph => self.wtr.write_str("[:graph:]"), Lower if ast.negated => self.wtr.write_str("[:^lower:]"), Lower => self.wtr.write_str("[:lower:]"), Print if ast.negated => self.wtr.write_str("[:^print:]"), Print => self.wtr.write_str("[:print:]"), Punct if ast.negated => self.wtr.write_str("[:^punct:]"), Punct => self.wtr.write_str("[:punct:]"), Space if ast.negated => self.wtr.write_str("[:^space:]"), Space => self.wtr.write_str("[:space:]"), Upper if ast.negated => self.wtr.write_str("[:^upper:]"), Upper => self.wtr.write_str("[:upper:]"), Word if ast.negated => self.wtr.write_str("[:^word:]"), Word => self.wtr.write_str("[:word:]"), Xdigit if ast.negated => self.wtr.write_str("[:^xdigit:]"), Xdigit => self.wtr.write_str("[:xdigit:]"), } } fn fmt_class_unicode(&mut self, ast: &ast::ClassUnicode) -> fmt::Result { use crate::ast::ClassUnicodeKind::*; use crate::ast::ClassUnicodeOpKind::*; if ast.negated { self.wtr.write_str(r"\P")?; } else { self.wtr.write_str(r"\p")?; } match ast.kind { OneLetter(c) => self.wtr.write_char(c), Named(ref x) => write!(self.wtr, "{{{}}}", x), NamedValue { op: Equal, ref name, ref value } => { write!(self.wtr, "{{{}={}}}", name, value) } NamedValue { op: Colon, ref name, ref value } => { write!(self.wtr, "{{{}:{}}}", name, value) } NamedValue { op: NotEqual, ref name, ref value } => { write!(self.wtr, "{{{}!={}}}", name, value) } } } } #[cfg(test)] mod tests { use alloc::string::String; use crate::ast::parse::ParserBuilder; use super::*; fn roundtrip(given: 
&str) { roundtrip_with(|b| b, given); } fn roundtrip_with(mut f: F, given: &str) where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, { let mut builder = ParserBuilder::new(); f(&mut builder); let ast = builder.build().parse(given).unwrap(); let mut printer = Printer::new(); let mut dst = String::new(); printer.print(&ast, &mut dst).unwrap(); assert_eq!(given, dst); } #[test] fn print_literal() { roundtrip("a"); roundtrip(r"\["); roundtrip_with(|b| b.octal(true), r"\141"); roundtrip(r"\x61"); roundtrip(r"\x7F"); roundtrip(r"\u0061"); roundtrip(r"\U00000061"); roundtrip(r"\x{61}"); roundtrip(r"\x{7F}"); roundtrip(r"\u{61}"); roundtrip(r"\U{61}"); roundtrip(r"\a"); roundtrip(r"\f"); roundtrip(r"\t"); roundtrip(r"\n"); roundtrip(r"\r"); roundtrip(r"\v"); roundtrip(r"(?x)\ "); } #[test] fn print_dot() { roundtrip("."); } #[test] fn print_concat() { roundtrip("ab"); roundtrip("abcde"); roundtrip("a(bcd)ef"); } #[test] fn print_alternation() { roundtrip("a|b"); roundtrip("a|b|c|d|e"); roundtrip("|a|b|c|d|e"); roundtrip("|a|b|c|d|e|"); roundtrip("a(b|c|d)|e|f"); } #[test] fn print_assertion() { roundtrip(r"^"); roundtrip(r"$"); roundtrip(r"\A"); roundtrip(r"\z"); roundtrip(r"\b"); roundtrip(r"\B"); } #[test] fn print_repetition() { roundtrip("a?"); roundtrip("a??"); roundtrip("a*"); roundtrip("a*?"); roundtrip("a+"); roundtrip("a+?"); roundtrip("a{5}"); roundtrip("a{5}?"); roundtrip("a{5,}"); roundtrip("a{5,}?"); roundtrip("a{5,10}"); roundtrip("a{5,10}?"); } #[test] fn print_flags() { roundtrip("(?i)"); roundtrip("(?-i)"); roundtrip("(?s-i)"); roundtrip("(?-si)"); roundtrip("(?siUmux)"); } #[test] fn print_group() { roundtrip("(?i:a)"); roundtrip("(?Pa)"); roundtrip("(?a)"); roundtrip("(a)"); } #[test] fn print_class() { roundtrip(r"[abc]"); roundtrip(r"[a-z]"); roundtrip(r"[^a-z]"); roundtrip(r"[a-z0-9]"); roundtrip(r"[-a-z0-9]"); roundtrip(r"[-a-z0-9]"); roundtrip(r"[a-z0-9---]"); roundtrip(r"[a-z&&m-n]"); roundtrip(r"[[a-z&&m-n]]"); roundtrip(r"[a-z--m-n]"); 
roundtrip(r"[a-z~~m-n]"); roundtrip(r"[a-z[0-9]]"); roundtrip(r"[a-z[^0-9]]"); roundtrip(r"\d"); roundtrip(r"\D"); roundtrip(r"\s"); roundtrip(r"\S"); roundtrip(r"\w"); roundtrip(r"\W"); roundtrip(r"[[:alnum:]]"); roundtrip(r"[[:^alnum:]]"); roundtrip(r"[[:alpha:]]"); roundtrip(r"[[:^alpha:]]"); roundtrip(r"[[:ascii:]]"); roundtrip(r"[[:^ascii:]]"); roundtrip(r"[[:blank:]]"); roundtrip(r"[[:^blank:]]"); roundtrip(r"[[:cntrl:]]"); roundtrip(r"[[:^cntrl:]]"); roundtrip(r"[[:digit:]]"); roundtrip(r"[[:^digit:]]"); roundtrip(r"[[:graph:]]"); roundtrip(r"[[:^graph:]]"); roundtrip(r"[[:lower:]]"); roundtrip(r"[[:^lower:]]"); roundtrip(r"[[:print:]]"); roundtrip(r"[[:^print:]]"); roundtrip(r"[[:punct:]]"); roundtrip(r"[[:^punct:]]"); roundtrip(r"[[:space:]]"); roundtrip(r"[[:^space:]]"); roundtrip(r"[[:upper:]]"); roundtrip(r"[[:^upper:]]"); roundtrip(r"[[:word:]]"); roundtrip(r"[[:^word:]]"); roundtrip(r"[[:xdigit:]]"); roundtrip(r"[[:^xdigit:]]"); roundtrip(r"\pL"); roundtrip(r"\PL"); roundtrip(r"\p{L}"); roundtrip(r"\P{L}"); roundtrip(r"\p{X=Y}"); roundtrip(r"\P{X=Y}"); roundtrip(r"\p{X:Y}"); roundtrip(r"\P{X:Y}"); roundtrip(r"\p{X!=Y}"); roundtrip(r"\P{X!=Y}"); } } regex-syntax-0.8.2/src/ast/visitor.rs000064400000000000000000000451001046102023000157140ustar 00000000000000use alloc::{vec, vec::Vec}; use crate::ast::{self, Ast}; /// A trait for visiting an abstract syntax tree (AST) in depth first order. /// /// The principle aim of this trait is to enable callers to perform case /// analysis on an abstract syntax tree without necessarily using recursion. /// In particular, this permits callers to do case analysis with constant stack /// usage, which can be important since the size of an abstract syntax tree /// may be proportional to end user input. /// /// Typical usage of this trait involves providing an implementation and then /// running it using the [`visit`] function. /// /// Note that the abstract syntax tree for a regular expression is quite /// complex. 
Unless you specifically need it, you might be able to use the much /// simpler [high-level intermediate representation](crate::hir::Hir) and its /// [corresponding `Visitor` trait](crate::hir::Visitor) instead. pub trait Visitor { /// The result of visiting an AST. type Output; /// An error that visiting an AST might return. type Err; /// All implementors of `Visitor` must provide a `finish` method, which /// yields the result of visiting the AST or an error. fn finish(self) -> Result; /// This method is called before beginning traversal of the AST. fn start(&mut self) {} /// This method is called on an `Ast` before descending into child `Ast` /// nodes. fn visit_pre(&mut self, _ast: &Ast) -> Result<(), Self::Err> { Ok(()) } /// This method is called on an `Ast` after descending all of its child /// `Ast` nodes. fn visit_post(&mut self, _ast: &Ast) -> Result<(), Self::Err> { Ok(()) } /// This method is called between child nodes of an /// [`Alternation`](ast::Alternation). fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { Ok(()) } /// This method is called between child nodes of a concatenation. fn visit_concat_in(&mut self) -> Result<(), Self::Err> { Ok(()) } /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) /// before descending into child nodes. fn visit_class_set_item_pre( &mut self, _ast: &ast::ClassSetItem, ) -> Result<(), Self::Err> { Ok(()) } /// This method is called on every [`ClassSetItem`](ast::ClassSetItem) /// after descending into child nodes. fn visit_class_set_item_post( &mut self, _ast: &ast::ClassSetItem, ) -> Result<(), Self::Err> { Ok(()) } /// This method is called on every /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) before descending into /// child nodes. fn visit_class_set_binary_op_pre( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<(), Self::Err> { Ok(()) } /// This method is called on every /// [`ClassSetBinaryOp`](ast::ClassSetBinaryOp) after descending into child /// nodes. 
fn visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<(), Self::Err> { Ok(()) } /// This method is called between the left hand and right hand child nodes /// of a [`ClassSetBinaryOp`](ast::ClassSetBinaryOp). fn visit_class_set_binary_op_in( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<(), Self::Err> { Ok(()) } } /// Executes an implementation of `Visitor` in constant stack space. /// /// This function will visit every node in the given `Ast` while calling the /// appropriate methods provided by the [`Visitor`] trait. /// /// The primary use case for this method is when one wants to perform case /// analysis over an `Ast` without using a stack size proportional to the depth /// of the `Ast`. Namely, this method will instead use constant stack size, but /// will use heap space proportional to the size of the `Ast`. This may be /// desirable in cases where the size of `Ast` is proportional to end user /// input. /// /// If the visitor returns an error at any point, then visiting is stopped and /// the error is returned. pub fn visit(ast: &Ast, visitor: V) -> Result { HeapVisitor::new().visit(ast, visitor) } /// HeapVisitor visits every item in an `Ast` recursively using constant stack /// size and a heap size proportional to the size of the `Ast`. struct HeapVisitor<'a> { /// A stack of `Ast` nodes. This is roughly analogous to the call stack /// used in a typical recursive visitor. stack: Vec<(&'a Ast, Frame<'a>)>, /// Similar to the `Ast` stack above, but is used only for character /// classes. In particular, character classes embed their own mini /// recursive syntax. stack_class: Vec<(ClassInduct<'a>, ClassFrame<'a>)>, } /// Represents a single stack frame while performing structural induction over /// an `Ast`. enum Frame<'a> { /// A stack frame allocated just before descending into a repetition /// operator's child node. 
Repetition(&'a ast::Repetition), /// A stack frame allocated just before descending into a group's child /// node. Group(&'a ast::Group), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { /// The child node we are currently visiting. head: &'a Ast, /// The remaining child nodes to visit (which may be empty). tail: &'a [Ast], }, /// The stack frame used while visiting every child node of an alternation /// of expressions. Alternation { /// The child node we are currently visiting. head: &'a Ast, /// The remaining child nodes to visit (which may be empty). tail: &'a [Ast], }, } /// Represents a single stack frame while performing structural induction over /// a character class. enum ClassFrame<'a> { /// The stack frame used while visiting every child node of a union of /// character class items. Union { /// The child node we are currently visiting. head: &'a ast::ClassSetItem, /// The remaining child nodes to visit (which may be empty). tail: &'a [ast::ClassSetItem], }, /// The stack frame used while a binary class operation. Binary { op: &'a ast::ClassSetBinaryOp }, /// A stack frame allocated just before descending into a binary operator's /// left hand child node. BinaryLHS { op: &'a ast::ClassSetBinaryOp, lhs: &'a ast::ClassSet, rhs: &'a ast::ClassSet, }, /// A stack frame allocated just before descending into a binary operator's /// right hand child node. BinaryRHS { op: &'a ast::ClassSetBinaryOp, rhs: &'a ast::ClassSet }, } /// A representation of the inductive step when performing structural induction /// over a character class. /// /// Note that there is no analogous explicit type for the inductive step for /// `Ast` nodes because the inductive step is just an `Ast`. For character /// classes, the inductive step can produce one of two possible child nodes: /// an item or a binary operation. 
(An item cannot be a binary operation /// because that would imply binary operations can be unioned in the concrete /// syntax, which is not possible.) enum ClassInduct<'a> { Item(&'a ast::ClassSetItem), BinaryOp(&'a ast::ClassSetBinaryOp), } impl<'a> HeapVisitor<'a> { fn new() -> HeapVisitor<'a> { HeapVisitor { stack: vec![], stack_class: vec![] } } fn visit( &mut self, mut ast: &'a Ast, mut visitor: V, ) -> Result { self.stack.clear(); self.stack_class.clear(); visitor.start(); loop { visitor.visit_pre(ast)?; if let Some(x) = self.induct(ast, &mut visitor)? { let child = x.child(); self.stack.push((ast, x)); ast = child; continue; } // No induction means we have a base case, so we can post visit // it now. visitor.visit_post(ast)?; // At this point, we now try to pop our call stack until it is // either empty or we hit another inductive case. loop { let (post_ast, frame) = match self.stack.pop() { None => return visitor.finish(), Some((post_ast, frame)) => (post_ast, frame), }; // If this is a concat/alternate, then we might have additional // inductive steps to process. if let Some(x) = self.pop(frame) { match x { Frame::Alternation { .. } => { visitor.visit_alternation_in()?; } Frame::Concat { .. } => { visitor.visit_concat_in()?; } _ => {} } ast = x.child(); self.stack.push((post_ast, x)); break; } // Otherwise, we've finished visiting all the child nodes for // this AST, so we can post visit it now. visitor.visit_post(post_ast)?; } } } /// Build a stack frame for the given AST if one is needed (which occurs if /// and only if there are child nodes in the AST). Otherwise, return None. /// /// If this visits a class, then the underlying visitor implementation may /// return an error which will be passed on here. 
fn induct( &mut self, ast: &'a Ast, visitor: &mut V, ) -> Result>, V::Err> { Ok(match *ast { Ast::ClassBracketed(ref x) => { self.visit_class(x, visitor)?; None } Ast::Repetition(ref x) => Some(Frame::Repetition(x)), Ast::Group(ref x) => Some(Frame::Group(x)), Ast::Concat(ref x) if x.asts.is_empty() => None, Ast::Concat(ref x) => { Some(Frame::Concat { head: &x.asts[0], tail: &x.asts[1..] }) } Ast::Alternation(ref x) if x.asts.is_empty() => None, Ast::Alternation(ref x) => Some(Frame::Alternation { head: &x.asts[0], tail: &x.asts[1..], }), _ => None, }) } /// Pops the given frame. If the frame has an additional inductive step, /// then return it, otherwise return `None`. fn pop(&self, induct: Frame<'a>) -> Option> { match induct { Frame::Repetition(_) => None, Frame::Group(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None } else { Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) } } Frame::Alternation { tail, .. } => { if tail.is_empty() { None } else { Some(Frame::Alternation { head: &tail[0], tail: &tail[1..], }) } } } } fn visit_class( &mut self, ast: &'a ast::ClassBracketed, visitor: &mut V, ) -> Result<(), V::Err> { let mut ast = ClassInduct::from_bracketed(ast); loop { self.visit_class_pre(&ast, visitor)?; if let Some(x) = self.induct_class(&ast) { let child = x.child(); self.stack_class.push((ast, x)); ast = child; continue; } self.visit_class_post(&ast, visitor)?; // At this point, we now try to pop our call stack until it is // either empty or we hit another inductive case. loop { let (post_ast, frame) = match self.stack_class.pop() { None => return Ok(()), Some((post_ast, frame)) => (post_ast, frame), }; // If this is a union or a binary op, then we might have // additional inductive steps to process. if let Some(x) = self.pop_class(frame) { if let ClassFrame::BinaryRHS { ref op, .. 
} = x { visitor.visit_class_set_binary_op_in(op)?; } ast = x.child(); self.stack_class.push((post_ast, x)); break; } // Otherwise, we've finished visiting all the child nodes for // this class node, so we can post visit it now. self.visit_class_post(&post_ast, visitor)?; } } } /// Call the appropriate `Visitor` methods given an inductive step. fn visit_class_pre( &self, ast: &ClassInduct<'a>, visitor: &mut V, ) -> Result<(), V::Err> { match *ast { ClassInduct::Item(item) => { visitor.visit_class_set_item_pre(item)?; } ClassInduct::BinaryOp(op) => { visitor.visit_class_set_binary_op_pre(op)?; } } Ok(()) } /// Call the appropriate `Visitor` methods given an inductive step. fn visit_class_post( &self, ast: &ClassInduct<'a>, visitor: &mut V, ) -> Result<(), V::Err> { match *ast { ClassInduct::Item(item) => { visitor.visit_class_set_item_post(item)?; } ClassInduct::BinaryOp(op) => { visitor.visit_class_set_binary_op_post(op)?; } } Ok(()) } /// Build a stack frame for the given class node if one is needed (which /// occurs if and only if there are child nodes). Otherwise, return None. fn induct_class(&self, ast: &ClassInduct<'a>) -> Option> { match *ast { ClassInduct::Item(&ast::ClassSetItem::Bracketed(ref x)) => { match x.kind { ast::ClassSet::Item(ref item) => { Some(ClassFrame::Union { head: item, tail: &[] }) } ast::ClassSet::BinaryOp(ref op) => { Some(ClassFrame::Binary { op }) } } } ClassInduct::Item(&ast::ClassSetItem::Union(ref x)) => { if x.items.is_empty() { None } else { Some(ClassFrame::Union { head: &x.items[0], tail: &x.items[1..], }) } } ClassInduct::BinaryOp(op) => { Some(ClassFrame::BinaryLHS { op, lhs: &op.lhs, rhs: &op.rhs }) } _ => None, } } /// Pops the given frame. If the frame has an additional inductive step, /// then return it, otherwise return `None`. fn pop_class(&self, induct: ClassFrame<'a>) -> Option> { match induct { ClassFrame::Union { tail, .. 
} => { if tail.is_empty() { None } else { Some(ClassFrame::Union { head: &tail[0], tail: &tail[1..], }) } } ClassFrame::Binary { .. } => None, ClassFrame::BinaryLHS { op, rhs, .. } => { Some(ClassFrame::BinaryRHS { op, rhs }) } ClassFrame::BinaryRHS { .. } => None, } } } impl<'a> Frame<'a> { /// Perform the next inductive step on this frame and return the next /// child AST node to visit. fn child(&self) -> &'a Ast { match *self { Frame::Repetition(rep) => &rep.ast, Frame::Group(group) => &group.ast, Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } } } impl<'a> ClassFrame<'a> { /// Perform the next inductive step on this frame and return the next /// child class node to visit. fn child(&self) -> ClassInduct<'a> { match *self { ClassFrame::Union { head, .. } => ClassInduct::Item(head), ClassFrame::Binary { op, .. } => ClassInduct::BinaryOp(op), ClassFrame::BinaryLHS { ref lhs, .. } => { ClassInduct::from_set(lhs) } ClassFrame::BinaryRHS { ref rhs, .. } => { ClassInduct::from_set(rhs) } } } } impl<'a> ClassInduct<'a> { fn from_bracketed(ast: &'a ast::ClassBracketed) -> ClassInduct<'a> { ClassInduct::from_set(&ast.kind) } fn from_set(ast: &'a ast::ClassSet) -> ClassInduct<'a> { match *ast { ast::ClassSet::Item(ref item) => ClassInduct::Item(item), ast::ClassSet::BinaryOp(ref op) => ClassInduct::BinaryOp(op), } } } impl<'a> core::fmt::Debug for ClassFrame<'a> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let x = match *self { ClassFrame::Union { .. } => "Union", ClassFrame::Binary { .. } => "Binary", ClassFrame::BinaryLHS { .. } => "BinaryLHS", ClassFrame::BinaryRHS { .. 
} => "BinaryRHS", }; write!(f, "{}", x) } } impl<'a> core::fmt::Debug for ClassInduct<'a> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let x = match *self { ClassInduct::Item(it) => match *it { ast::ClassSetItem::Empty(_) => "Item(Empty)", ast::ClassSetItem::Literal(_) => "Item(Literal)", ast::ClassSetItem::Range(_) => "Item(Range)", ast::ClassSetItem::Ascii(_) => "Item(Ascii)", ast::ClassSetItem::Perl(_) => "Item(Perl)", ast::ClassSetItem::Unicode(_) => "Item(Unicode)", ast::ClassSetItem::Bracketed(_) => "Item(Bracketed)", ast::ClassSetItem::Union(_) => "Item(Union)", }, ClassInduct::BinaryOp(it) => match it.kind { ast::ClassSetBinaryOpKind::Intersection => { "BinaryOp(Intersection)" } ast::ClassSetBinaryOpKind::Difference => { "BinaryOp(Difference)" } ast::ClassSetBinaryOpKind::SymmetricDifference => { "BinaryOp(SymmetricDifference)" } }, }; write!(f, "{}", x) } } regex-syntax-0.8.2/src/debug.rs000064400000000000000000000070411046102023000145160ustar 00000000000000/// A type that wraps a single byte with a convenient fmt::Debug impl that /// escapes the byte. pub(crate) struct Byte(pub(crate) u8); impl core::fmt::Debug for Byte { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { // Special case ASCII space. It's too hard to read otherwise, so // put quotes around it. I sometimes wonder whether just '\x20' would // be better... if self.0 == b' ' { return write!(f, "' '"); } // 10 bytes is enough to cover any output from ascii::escape_default. let mut bytes = [0u8; 10]; let mut len = 0; for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { // capitalize \xab to \xAB if i >= 2 && b'a' <= b && b <= b'f' { b -= 32; } bytes[len] = b; len += 1; } write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) } } /// A type that provides a human readable debug impl for arbitrary bytes. /// /// This generally works best when the bytes are presumed to be mostly UTF-8, /// but will work for anything. /// /// N.B. 
This is copied nearly verbatim from regex-automata. Sigh. pub(crate) struct Bytes<'a>(pub(crate) &'a [u8]); impl<'a> core::fmt::Debug for Bytes<'a> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "\"")?; // This is a sad re-implementation of a similar impl found in bstr. let mut bytes = self.0; while let Some(result) = utf8_decode(bytes) { let ch = match result { Ok(ch) => ch, Err(byte) => { write!(f, r"\x{:02x}", byte)?; bytes = &bytes[1..]; continue; } }; bytes = &bytes[ch.len_utf8()..]; match ch { '\0' => write!(f, "\\0")?, // ASCII control characters except \0, \n, \r, \t '\x01'..='\x08' | '\x0b' | '\x0c' | '\x0e'..='\x19' | '\x7f' => { write!(f, "\\x{:02x}", u32::from(ch))?; } '\n' | '\r' | '\t' | _ => { write!(f, "{}", ch.escape_debug())?; } } } write!(f, "\"")?; Ok(()) } } /// Decodes the next UTF-8 encoded codepoint from the given byte slice. /// /// If no valid encoding of a codepoint exists at the beginning of the given /// byte slice, then the first byte is returned instead. /// /// This returns `None` if and only if `bytes` is empty. pub(crate) fn utf8_decode(bytes: &[u8]) -> Option> { fn len(byte: u8) -> Option { if byte <= 0x7F { return Some(1); } else if byte & 0b1100_0000 == 0b1000_0000 { return None; } else if byte <= 0b1101_1111 { Some(2) } else if byte <= 0b1110_1111 { Some(3) } else if byte <= 0b1111_0111 { Some(4) } else { None } } if bytes.is_empty() { return None; } let len = match len(bytes[0]) { None => return Some(Err(bytes[0])), Some(len) if len > bytes.len() => return Some(Err(bytes[0])), Some(1) => return Some(Ok(char::from(bytes[0]))), Some(len) => len, }; match core::str::from_utf8(&bytes[..len]) { Ok(s) => Some(Ok(s.chars().next().unwrap())), Err(_) => Some(Err(bytes[0])), } } regex-syntax-0.8.2/src/either.rs000064400000000000000000000003021046102023000147010ustar 00000000000000/// A simple binary sum type. /// /// This is occasionally useful in an ad hoc fashion. 
#[derive(Clone, Debug, Eq, PartialEq)] pub enum Either { Left(Left), Right(Right), } regex-syntax-0.8.2/src/error.rs000064400000000000000000000244301046102023000145620ustar 00000000000000use alloc::{ format, string::{String, ToString}, vec, vec::Vec, }; use crate::{ast, hir}; /// This error type encompasses any error that can be returned by this crate. /// /// This error type is marked as `non_exhaustive`. This means that adding a /// new variant is not considered a breaking change. #[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum Error { /// An error that occurred while translating concrete syntax into abstract /// syntax (AST). Parse(ast::Error), /// An error that occurred while translating abstract syntax into a high /// level intermediate representation (HIR). Translate(hir::Error), } impl From for Error { fn from(err: ast::Error) -> Error { Error::Parse(err) } } impl From for Error { fn from(err: hir::Error) -> Error { Error::Translate(err) } } #[cfg(feature = "std")] impl std::error::Error for Error {} impl core::fmt::Display for Error { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match *self { Error::Parse(ref x) => x.fmt(f), Error::Translate(ref x) => x.fmt(f), } } } /// A helper type for formatting nice error messages. /// /// This type is responsible for reporting regex parse errors in a nice human /// readable format. Most of its complexity is from interspersing notational /// markers pointing out the position where an error occurred. #[derive(Debug)] pub struct Formatter<'e, E> { /// The original regex pattern in which the error occurred. pattern: &'e str, /// The error kind. It must impl fmt::Display. err: &'e E, /// The primary span of the error. span: &'e ast::Span, /// An auxiliary and optional span, in case the error needs to point to /// two locations (e.g., when reporting a duplicate capture group name). 
aux_span: Option<&'e ast::Span>, } impl<'e> From<&'e ast::Error> for Formatter<'e, ast::ErrorKind> { fn from(err: &'e ast::Error) -> Self { Formatter { pattern: err.pattern(), err: err.kind(), span: err.span(), aux_span: err.auxiliary_span(), } } } impl<'e> From<&'e hir::Error> for Formatter<'e, hir::ErrorKind> { fn from(err: &'e hir::Error) -> Self { Formatter { pattern: err.pattern(), err: err.kind(), span: err.span(), aux_span: None, } } } impl<'e, E: core::fmt::Display> core::fmt::Display for Formatter<'e, E> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { let spans = Spans::from_formatter(self); if self.pattern.contains('\n') { let divider = repeat_char('~', 79); writeln!(f, "regex parse error:")?; writeln!(f, "{}", divider)?; let notated = spans.notate(); write!(f, "{}", notated)?; writeln!(f, "{}", divider)?; // If we have error spans that cover multiple lines, then we just // note the line numbers. if !spans.multi_line.is_empty() { let mut notes = vec![]; for span in &spans.multi_line { notes.push(format!( "on line {} (column {}) through line {} (column {})", span.start.line, span.start.column, span.end.line, span.end.column - 1 )); } writeln!(f, "{}", notes.join("\n"))?; } write!(f, "error: {}", self.err)?; } else { writeln!(f, "regex parse error:")?; let notated = Spans::from_formatter(self).notate(); write!(f, "{}", notated)?; write!(f, "error: {}", self.err)?; } Ok(()) } } /// This type represents an arbitrary number of error spans in a way that makes /// it convenient to notate the regex pattern. ("Notate" means "point out /// exactly where the error occurred in the regex pattern.") /// /// Technically, we can only ever have two spans given our current error /// structure. However, after toiling with a specific algorithm for handling /// two spans, it became obvious that an algorithm to handle an arbitrary /// number of spans was actually much simpler. struct Spans<'p> { /// The original regex pattern string. 
pattern: &'p str, /// The total width that should be used for line numbers. The width is /// used for left padding the line numbers for alignment. /// /// A value of `0` means line numbers should not be displayed. That is, /// the pattern is itself only one line. line_number_width: usize, /// All error spans that occur on a single line. This sequence always has /// length equivalent to the number of lines in `pattern`, where the index /// of the sequence represents a line number, starting at `0`. The spans /// in each line are sorted in ascending order. by_line: Vec>, /// All error spans that occur over one or more lines. That is, the start /// and end position of the span have different line numbers. The spans are /// sorted in ascending order. multi_line: Vec, } impl<'p> Spans<'p> { /// Build a sequence of spans from a formatter. fn from_formatter<'e, E: core::fmt::Display>( fmter: &'p Formatter<'e, E>, ) -> Spans<'p> { let mut line_count = fmter.pattern.lines().count(); // If the pattern ends with a `\n` literal, then our line count is // off by one, since a span can occur immediately after the last `\n`, // which is consider to be an additional line. if fmter.pattern.ends_with('\n') { line_count += 1; } let line_number_width = if line_count <= 1 { 0 } else { line_count.to_string().len() }; let mut spans = Spans { pattern: &fmter.pattern, line_number_width, by_line: vec![vec![]; line_count], multi_line: vec![], }; spans.add(fmter.span.clone()); if let Some(span) = fmter.aux_span { spans.add(span.clone()); } spans } /// Add the given span to this sequence, putting it in the right place. fn add(&mut self, span: ast::Span) { // This is grossly inefficient since we sort after each add, but right // now, we only ever add two spans at most. 
if span.is_one_line() { let i = span.start.line - 1; // because lines are 1-indexed self.by_line[i].push(span); self.by_line[i].sort(); } else { self.multi_line.push(span); self.multi_line.sort(); } } /// Notate the pattern string with carents (`^`) pointing at each span /// location. This only applies to spans that occur within a single line. fn notate(&self) -> String { let mut notated = String::new(); for (i, line) in self.pattern.lines().enumerate() { if self.line_number_width > 0 { notated.push_str(&self.left_pad_line_number(i + 1)); notated.push_str(": "); } else { notated.push_str(" "); } notated.push_str(line); notated.push('\n'); if let Some(notes) = self.notate_line(i) { notated.push_str(¬es); notated.push('\n'); } } notated } /// Return notes for the line indexed at `i` (zero-based). If there are no /// spans for the given line, then `None` is returned. Otherwise, an /// appropriately space padded string with correctly positioned `^` is /// returned, accounting for line numbers. fn notate_line(&self, i: usize) -> Option { let spans = &self.by_line[i]; if spans.is_empty() { return None; } let mut notes = String::new(); for _ in 0..self.line_number_padding() { notes.push(' '); } let mut pos = 0; for span in spans { for _ in pos..(span.start.column - 1) { notes.push(' '); pos += 1; } let note_len = span.end.column.saturating_sub(span.start.column); for _ in 0..core::cmp::max(1, note_len) { notes.push('^'); pos += 1; } } Some(notes) } /// Left pad the given line number with spaces such that it is aligned with /// other line numbers. fn left_pad_line_number(&self, n: usize) -> String { let n = n.to_string(); let pad = self.line_number_width.checked_sub(n.len()).unwrap(); let mut result = repeat_char(' ', pad); result.push_str(&n); result } /// Return the line number padding beginning at the start of each line of /// the pattern. /// /// If the pattern is only one line, then this returns a fixed padding /// for visual indentation. 
fn line_number_padding(&self) -> usize { if self.line_number_width == 0 { 4 } else { 2 + self.line_number_width } } } fn repeat_char(c: char, count: usize) -> String { core::iter::repeat(c).take(count).collect() } #[cfg(test)] mod tests { use alloc::string::ToString; use crate::ast::parse::Parser; fn assert_panic_message(pattern: &str, expected_msg: &str) { let result = Parser::new().parse(pattern); match result { Ok(_) => { panic!("regex should not have parsed"); } Err(err) => { assert_eq!(err.to_string(), expected_msg.trim()); } } } // See: https://github.com/rust-lang/regex/issues/464 #[test] fn regression_464() { let err = Parser::new().parse("a{\n").unwrap_err(); // This test checks that the error formatter doesn't panic. assert!(!err.to_string().is_empty()); } // See: https://github.com/rust-lang/regex/issues/545 #[test] fn repetition_quantifier_expects_a_valid_decimal() { assert_panic_message( r"\\u{[^}]*}", r#" regex parse error: \\u{[^}]*} ^ error: repetition quantifier expects a valid decimal "#, ); } } regex-syntax-0.8.2/src/hir/interval.rs000064400000000000000000000535341046102023000160460ustar 00000000000000use core::{char, cmp, fmt::Debug, slice}; use alloc::vec::Vec; use crate::unicode; // This module contains an *internal* implementation of interval sets. // // The primary invariant that interval sets guards is canonical ordering. That // is, every interval set contains an ordered sequence of intervals where // no two intervals are overlapping or adjacent. While this invariant is // occasionally broken within the implementation, it should be impossible for // callers to observe it. // // Since case folding (as implemented below) breaks that invariant, we roll // that into this API even though it is a little out of place in an otherwise // generic interval set. (Hence the reason why the `unicode` module is imported // here.) 
// // Some of the implementation complexity here is a result of me wanting to // preserve the sequential representation without using additional memory. // In many cases, we do use linear extra memory, but it is at most 2x and it // is amortized. If we relaxed the memory requirements, this implementation // could become much simpler. The extra memory is honestly probably OK, but // character classes (especially of the Unicode variety) can become quite // large, and it would be nice to keep regex compilation snappy even in debug // builds. (In the past, I have been careless with this area of code and it has // caused slow regex compilations in debug mode, so this isn't entirely // unwarranted.) // // Tests on this are relegated to the public API of HIR in src/hir.rs. #[derive(Clone, Debug)] pub struct IntervalSet { /// A sorted set of non-overlapping ranges. ranges: Vec, /// While not required at all for correctness, we keep track of whether an /// interval set has been case folded or not. This helps us avoid doing /// redundant work if, for example, a set has already been cased folded. /// And note that whether a set is folded or not is preserved through /// all of the pairwise set operations. That is, if both interval sets /// have been case folded, then any of difference, union, intersection or /// symmetric difference all produce a case folded set. /// /// Note that when this is true, it *must* be the case that the set is case /// folded. But when it's false, the set *may* be case folded. In other /// words, we only set this to true when we know it to be case, but we're /// okay with it being false if it would otherwise be costly to determine /// whether it should be true. This means code cannot assume that a false /// value necessarily indicates that the set is not case folded. /// /// Bottom line: this is a performance optimization. 
folded: bool, } impl Eq for IntervalSet {} // We implement PartialEq manually so that we don't consider the set's internal // 'folded' property to be part of its identity. The 'folded' property is // strictly an optimization. impl PartialEq for IntervalSet { fn eq(&self, other: &IntervalSet) -> bool { self.ranges.eq(&other.ranges) } } impl IntervalSet { /// Create a new set from a sequence of intervals. Each interval is /// specified as a pair of bounds, where both bounds are inclusive. /// /// The given ranges do not need to be in any specific order, and ranges /// may overlap. pub fn new>(intervals: T) -> IntervalSet { let ranges: Vec = intervals.into_iter().collect(); // An empty set is case folded. let folded = ranges.is_empty(); let mut set = IntervalSet { ranges, folded }; set.canonicalize(); set } /// Add a new interval to this set. pub fn push(&mut self, interval: I) { // TODO: This could be faster. e.g., Push the interval such that // it preserves canonicalization. self.ranges.push(interval); self.canonicalize(); // We don't know whether the new interval added here is considered // case folded, so we conservatively assume that the entire set is // no longer case folded if it was previously. self.folded = false; } /// Return an iterator over all intervals in this set. /// /// The iterator yields intervals in ascending order. pub fn iter(&self) -> IntervalSetIter<'_, I> { IntervalSetIter(self.ranges.iter()) } /// Return an immutable slice of intervals in this set. /// /// The sequence returned is in canonical ordering. pub fn intervals(&self) -> &[I] { &self.ranges } /// Expand this interval set such that it contains all case folded /// characters. For example, if this class consists of the range `a-z`, /// then applying case folding will result in the class containing both the /// ranges `a-z` and `A-Z`. /// /// This returns an error if the necessary case mapping data is not /// available. 
pub fn case_fold_simple(&mut self) -> Result<(), unicode::CaseFoldError> { if self.folded { return Ok(()); } let len = self.ranges.len(); for i in 0..len { let range = self.ranges[i]; if let Err(err) = range.case_fold_simple(&mut self.ranges) { self.canonicalize(); return Err(err); } } self.canonicalize(); self.folded = true; Ok(()) } /// Union this set with the given set, in place. pub fn union(&mut self, other: &IntervalSet) { if other.ranges.is_empty() || self.ranges == other.ranges { return; } // This could almost certainly be done more efficiently. self.ranges.extend(&other.ranges); self.canonicalize(); self.folded = self.folded && other.folded; } /// Intersect this set with the given set, in place. pub fn intersect(&mut self, other: &IntervalSet) { if self.ranges.is_empty() { return; } if other.ranges.is_empty() { self.ranges.clear(); // An empty set is case folded. self.folded = true; return; } // There should be a way to do this in-place with constant memory, // but I couldn't figure out a simple way to do it. So just append // the intersection to the end of this range, and then drain it before // we're done. let drain_end = self.ranges.len(); let mut ita = 0..drain_end; let mut itb = 0..other.ranges.len(); let mut a = ita.next().unwrap(); let mut b = itb.next().unwrap(); loop { if let Some(ab) = self.ranges[a].intersect(&other.ranges[b]) { self.ranges.push(ab); } let (it, aorb) = if self.ranges[a].upper() < other.ranges[b].upper() { (&mut ita, &mut a) } else { (&mut itb, &mut b) }; match it.next() { Some(v) => *aorb = v, None => break, } } self.ranges.drain(..drain_end); self.folded = self.folded && other.folded; } /// Subtract the given set from this set, in place. pub fn difference(&mut self, other: &IntervalSet) { if self.ranges.is_empty() || other.ranges.is_empty() { return; } // This algorithm is (to me) surprisingly complex. A search of the // interwebs indicate that this is a potentially interesting problem. 
// Folks seem to suggest interval or segment trees, but I'd like to // avoid the overhead (both runtime and conceptual) of that. // // The following is basically my Shitty First Draft. Therefore, in // order to grok it, you probably need to read each line carefully. // Simplifications are most welcome! // // Remember, we can assume the canonical format invariant here, which // says that all ranges are sorted, not overlapping and not adjacent in // each class. let drain_end = self.ranges.len(); let (mut a, mut b) = (0, 0); 'LOOP: while a < drain_end && b < other.ranges.len() { // Basically, the easy cases are when neither range overlaps with // each other. If the `b` range is less than our current `a` // range, then we can skip it and move on. if other.ranges[b].upper() < self.ranges[a].lower() { b += 1; continue; } // ... similarly for the `a` range. If it's less than the smallest // `b` range, then we can add it as-is. if self.ranges[a].upper() < other.ranges[b].lower() { let range = self.ranges[a]; self.ranges.push(range); a += 1; continue; } // Otherwise, we have overlapping ranges. assert!(!self.ranges[a].is_intersection_empty(&other.ranges[b])); // This part is tricky and was non-obvious to me without looking // at explicit examples (see the tests). The trickiness stems from // two things: 1) subtracting a range from another range could // yield two ranges and 2) after subtracting a range, it's possible // that future ranges can have an impact. The loop below advances // the `b` ranges until they can't possible impact the current // range. // // For example, if our `a` range is `a-t` and our next three `b` // ranges are `a-c`, `g-i`, `r-t` and `x-z`, then we need to apply // subtraction three times before moving on to the next `a` range. 
let mut range = self.ranges[a]; while b < other.ranges.len() && !range.is_intersection_empty(&other.ranges[b]) { let old_range = range; range = match range.difference(&other.ranges[b]) { (None, None) => { // We lost the entire range, so move on to the next // without adding this one. a += 1; continue 'LOOP; } (Some(range1), None) | (None, Some(range1)) => range1, (Some(range1), Some(range2)) => { self.ranges.push(range1); range2 } }; // It's possible that the `b` range has more to contribute // here. In particular, if it is greater than the original // range, then it might impact the next `a` range *and* it // has impacted the current `a` range as much as possible, // so we can quit. We don't bump `b` so that the next `a` // range can apply it. if other.ranges[b].upper() > old_range.upper() { break; } // Otherwise, the next `b` range might apply to the current // `a` range. b += 1; } self.ranges.push(range); a += 1; } while a < drain_end { let range = self.ranges[a]; self.ranges.push(range); a += 1; } self.ranges.drain(..drain_end); self.folded = self.folded && other.folded; } /// Compute the symmetric difference of the two sets, in place. /// /// This computes the symmetric difference of two interval sets. This /// removes all elements in this set that are also in the given set, /// but also adds all elements from the given set that aren't in this /// set. That is, the set will contain all elements in either set, /// but will not contain any elements that are in both sets. pub fn symmetric_difference(&mut self, other: &IntervalSet) { // TODO(burntsushi): Fix this so that it amortizes allocation. let mut intersection = self.clone(); intersection.intersect(other); self.union(other); self.difference(&intersection); } /// Negate this interval set. /// /// For all `x` where `x` is any element, if `x` was in this set, then it /// will not be in this set after negation. 
pub fn negate(&mut self) { if self.ranges.is_empty() { let (min, max) = (I::Bound::min_value(), I::Bound::max_value()); self.ranges.push(I::create(min, max)); // The set containing everything must case folded. self.folded = true; return; } // There should be a way to do this in-place with constant memory, // but I couldn't figure out a simple way to do it. So just append // the negation to the end of this range, and then drain it before // we're done. let drain_end = self.ranges.len(); // We do checked arithmetic below because of the canonical ordering // invariant. if self.ranges[0].lower() > I::Bound::min_value() { let upper = self.ranges[0].lower().decrement(); self.ranges.push(I::create(I::Bound::min_value(), upper)); } for i in 1..drain_end { let lower = self.ranges[i - 1].upper().increment(); let upper = self.ranges[i].lower().decrement(); self.ranges.push(I::create(lower, upper)); } if self.ranges[drain_end - 1].upper() < I::Bound::max_value() { let lower = self.ranges[drain_end - 1].upper().increment(); self.ranges.push(I::create(lower, I::Bound::max_value())); } self.ranges.drain(..drain_end); // We don't need to update whether this set is folded or not, because // it is conservatively preserved through negation. Namely, if a set // is not folded, then it is possible that its negation is folded, for // example, [^☃]. But we're fine with assuming that the set is not // folded in that case. (`folded` permits false negatives but not false // positives.) // // But what about when a set is folded, is its negation also // necessarily folded? Yes. Because if a set is folded, then for every // character in the set, it necessarily included its equivalence class // of case folded characters. Negating it in turn means that all // equivalence classes in the set are negated, and any equivalence // class that was previously not in the set is now entirely in the set. } /// Converts this set into a canonical ordering. 
fn canonicalize(&mut self) { if self.is_canonical() { return; } self.ranges.sort(); assert!(!self.ranges.is_empty()); // Is there a way to do this in-place with constant memory? I couldn't // figure out a way to do it. So just append the canonicalization to // the end of this range, and then drain it before we're done. let drain_end = self.ranges.len(); for oldi in 0..drain_end { // If we've added at least one new range, then check if we can // merge this range in the previously added range. if self.ranges.len() > drain_end { let (last, rest) = self.ranges.split_last_mut().unwrap(); if let Some(union) = last.union(&rest[oldi]) { *last = union; continue; } } let range = self.ranges[oldi]; self.ranges.push(range); } self.ranges.drain(..drain_end); } /// Returns true if and only if this class is in a canonical ordering. fn is_canonical(&self) -> bool { for pair in self.ranges.windows(2) { if pair[0] >= pair[1] { return false; } if pair[0].is_contiguous(&pair[1]) { return false; } } true } } /// An iterator over intervals. #[derive(Debug)] pub struct IntervalSetIter<'a, I>(slice::Iter<'a, I>); impl<'a, I> Iterator for IntervalSetIter<'a, I> { type Item = &'a I; fn next(&mut self) -> Option<&'a I> { self.0.next() } } pub trait Interval: Clone + Copy + Debug + Default + Eq + PartialEq + PartialOrd + Ord { type Bound: Bound; fn lower(&self) -> Self::Bound; fn upper(&self) -> Self::Bound; fn set_lower(&mut self, bound: Self::Bound); fn set_upper(&mut self, bound: Self::Bound); fn case_fold_simple( &self, intervals: &mut Vec, ) -> Result<(), unicode::CaseFoldError>; /// Create a new interval. fn create(lower: Self::Bound, upper: Self::Bound) -> Self { let mut int = Self::default(); if lower <= upper { int.set_lower(lower); int.set_upper(upper); } else { int.set_lower(upper); int.set_upper(lower); } int } /// Union the given overlapping range into this range. /// /// If the two ranges aren't contiguous, then this returns `None`. 
fn union(&self, other: &Self) -> Option<Self> {
    if !self.is_contiguous(other) {
        return None;
    }
    let lower = cmp::min(self.lower(), other.lower());
    let upper = cmp::max(self.upper(), other.upper());
    Some(Self::create(lower, upper))
}

/// Intersect this range with the given range and return the result.
///
/// If the intersection is empty, then this returns `None`.
fn intersect(&self, other: &Self) -> Option<Self> {
    let lower = cmp::max(self.lower(), other.lower());
    let upper = cmp::min(self.upper(), other.upper());
    if lower <= upper {
        Some(Self::create(lower, upper))
    } else {
        None
    }
}

/// Subtract the given range from this range and return the resulting
/// ranges.
///
/// If subtraction would result in an empty range, then no ranges are
/// returned.
fn difference(&self, other: &Self) -> (Option<Self>, Option<Self>) {
    if self.is_subset(other) {
        return (None, None);
    }
    if self.is_intersection_empty(other) {
        return (Some(self.clone()), None);
    }
    let add_lower = other.lower() > self.lower();
    let add_upper = other.upper() < self.upper();
    // We know this because !self.is_subset(other) and the ranges have
    // a non-empty intersection.
    assert!(add_lower || add_upper);
    let mut ret = (None, None);
    if add_lower {
        // The piece of `self` that lies strictly below `other`.
        let upper = other.lower().decrement();
        ret.0 = Some(Self::create(self.lower(), upper));
    }
    if add_upper {
        // The piece of `self` that lies strictly above `other`.
        let lower = other.upper().increment();
        let range = Self::create(lower, self.upper());
        if ret.0.is_none() {
            ret.0 = Some(range);
        } else {
            ret.1 = Some(range);
        }
    }
    ret
}

/// Compute the symmetric difference of the given range and this range.
/// This returns the union of the two ranges minus its intersection.
fn symmetric_difference(
    &self,
    other: &Self,
) -> (Option<Self>, Option<Self>) {
    let union = match self.union(other) {
        // Not contiguous: the symmetric difference is just both ranges,
        // unchanged.
        None => return (Some(self.clone()), Some(other.clone())),
        Some(union) => union,
    };
    let intersection = match self.intersect(other) {
        // Adjacent but not overlapping: nothing gets removed from the
        // union.
        None => return (Some(self.clone()), Some(other.clone())),
        Some(intersection) => intersection,
    };
    union.difference(&intersection)
}

/// Returns true if and only if the two ranges are contiguous. Two ranges
/// are contiguous if and only if the ranges are either overlapping or
/// adjacent.
fn is_contiguous(&self, other: &Self) -> bool {
    let lower1 = self.lower().as_u32();
    let upper1 = self.upper().as_u32();
    let lower2 = other.lower().as_u32();
    let upper2 = other.upper().as_u32();
    // `saturating_add` lets adjacent (but non-overlapping) ranges count
    // as contiguous without overflowing at the maximum bound.
    cmp::max(lower1, lower2) <= cmp::min(upper1, upper2).saturating_add(1)
}

/// Returns true if and only if the intersection of this range and the
/// other range is empty.
fn is_intersection_empty(&self, other: &Self) -> bool {
    let (lower1, upper1) = (self.lower(), self.upper());
    let (lower2, upper2) = (other.lower(), other.upper());
    cmp::max(lower1, lower2) > cmp::min(upper1, upper2)
}

/// Returns true if and only if this range is a subset of the other range.
fn is_subset(&self, other: &Self) -> bool {
    let (lower1, upper1) = (self.lower(), self.upper());
    let (lower2, upper2) = (other.lower(), other.upper());
    // Both endpoints of `self` must fall inside `other`.
    (lower2 <= lower1 && lower1 <= upper2)
        && (lower2 <= upper1 && upper1 <= upper2)
}
}

/// A type that can be used as the endpoint of an interval.
pub trait Bound:
    Copy + Clone + Debug + Eq + PartialEq + PartialOrd + Ord
{
    fn min_value() -> Self;
    fn max_value() -> Self;
    fn as_u32(self) -> u32;
    fn increment(self) -> Self;
    fn decrement(self) -> Self;
}

impl Bound for u8 {
    fn min_value() -> Self {
        u8::MIN
    }

    fn max_value() -> Self {
        u8::MAX
    }

    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    fn increment(self) -> Self {
        // Overflow is a bug: callers must never increment the max bound.
        self.checked_add(1).unwrap()
    }

    fn decrement(self) -> Self {
        // Underflow is a bug: callers must never decrement the min bound.
        self.checked_sub(1).unwrap()
    }
}

impl Bound for char {
    fn min_value() -> Self {
        '\x00'
    }

    fn max_value() -> Self {
        '\u{10FFFF}'
    }

    fn as_u32(self) -> u32 {
        u32::from(self)
    }

    fn increment(self) -> Self {
        match self {
            // Jump over the surrogate range, which is not valid `char`.
            '\u{D7FF}' => '\u{E000}',
            c => char::from_u32(u32::from(c).checked_add(1).unwrap()).unwrap(),
        }
    }

    fn decrement(self) -> Self {
        match self {
            // Jump over the surrogate range, which is not valid `char`.
            '\u{E000}' => '\u{D7FF}',
            c => char::from_u32(u32::from(c).checked_sub(1).unwrap()).unwrap(),
        }
    }
}

// Tests for interval sets are written in src/hir.rs against the public API.
regex-syntax-0.8.2/src/hir/literal.rs000064400000000000000000003417651046102023000156600ustar 00000000000000/*!
Provides literal extraction from `Hir` expressions.

An [`Extractor`] pulls literals out of [`Hir`] expressions and returns a
[`Seq`] of [`Literal`]s.

The purpose of literal extraction is generally to provide avenues for
optimizing regex searches. The main idea is that substring searches can be an
order of magnitude faster than a regex search. Therefore, if one can execute
a substring search to find candidate match locations and only run the regex
search at those locations, then it is possible for huge improvements in
performance to be realized.
With that said, literal optimizations are generally a black art because even though substring search is generally faster, if the number of candidates produced is high, then it can create a lot of overhead by ping-ponging between the substring search and the regex search. Here are some heuristics that might be used to help increase the chances of effective literal optimizations: * Stick to small [`Seq`]s. If you search for too many literals, it's likely to lead to substring search that is only a little faster than a regex search, and thus the overhead of using literal optimizations in the first place might make things slower overall. * The literals in your [`Seq`] shouldn't be too short. In general, longer is better. A sequence corresponding to single bytes that occur frequently in the haystack, for example, is probably a bad literal optimization because it's likely to produce many false positive candidates. Longer literals are less likely to match, and thus probably produce fewer false positives. * If it's possible to estimate the approximate frequency of each byte according to some pre-computed background distribution, it is possible to compute a score of how "good" a `Seq` is. If a `Seq` isn't good enough, you might consider skipping the literal optimization and just use the regex engine. (It should be noted that there are always pathological cases that can make any kind of literal optimization be a net slower result. This is why it might be a good idea to be conservative, or to even provide a means for literal optimizations to be dynamically disabled if they are determined to be ineffective according to some measure.) You're encouraged to explore the methods on [`Seq`], which permit shrinking the size of sequences in a preference-order preserving fashion. Finally, note that it isn't strictly necessary to use an [`Extractor`]. Namely, an `Extractor` only uses public APIs of the [`Seq`] and [`Literal`] types, so it is possible to implement your own extractor. 
For example, for n-grams or "inner" literals (i.e., not prefix or suffix literals). The `Extractor` is mostly responsible for the case analysis over `Hir` expressions. Much of the "trickier" parts are how to combine literal sequences, and that is all implemented on [`Seq`]. */ use core::{cmp, mem, num::NonZeroUsize}; use alloc::{vec, vec::Vec}; use crate::hir::{self, Hir}; /// Extracts prefix or suffix literal sequences from [`Hir`] expressions. /// /// Literal extraction is based on the following observations: /// /// * Many regexes start with one or a small number of literals. /// * Substring search for literals is often much faster (sometimes by an order /// of magnitude) than a regex search. /// /// Thus, in many cases, one can search for literals to find candidate starting /// locations of a match, and then only run the full regex engine at each such /// location instead of over the full haystack. /// /// The main downside of literal extraction is that it can wind up causing a /// search to be slower overall. For example, if there are many matches or if /// there are many candidates that don't ultimately lead to a match, then a /// lot of overhead will be spent in shuffing back-and-forth between substring /// search and the regex engine. This is the fundamental reason why literal /// optimizations for regex patterns is sometimes considered a "black art." /// /// # Look-around assertions /// /// Literal extraction treats all look-around assertions as-if they match every /// empty string. So for example, the regex `\bquux\b` will yield a sequence /// containing a single exact literal `quux`. However, not all occurrences /// of `quux` correspond to a match a of the regex. For example, `\bquux\b` /// does not match `ZquuxZ` anywhere because `quux` does not fall on a word /// boundary. /// /// In effect, if your regex contains look-around assertions, then a match of /// an exact literal does not necessarily mean the regex overall matches. 
So
/// you may still need to run the regex engine in such cases to confirm the
/// match.
///
/// The precise guarantee you get from a literal sequence is: if every literal
/// in the sequence is exact and the original regex contains zero look-around
/// assertions, then a preference-order multi-substring search of those
/// literals will precisely match a preference-order search of the original
/// regex.
///
/// # Example
///
/// This shows how to extract prefixes:
///
/// ```
/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
///
/// let hir = parse(r"(a|b|c)(x|y|z)[A-Z]+foo")?;
/// let got = Extractor::new().extract(&hir);
/// // All literals returned are "inexact" because none of them reach the
/// // match state.
/// let expected = Seq::from_iter([
///     Literal::inexact("ax"),
///     Literal::inexact("ay"),
///     Literal::inexact("az"),
///     Literal::inexact("bx"),
///     Literal::inexact("by"),
///     Literal::inexact("bz"),
///     Literal::inexact("cx"),
///     Literal::inexact("cy"),
///     Literal::inexact("cz"),
/// ]);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// This shows how to extract suffixes:
///
/// ```
/// use regex_syntax::{
///     hir::literal::{Extractor, ExtractKind, Literal, Seq},
///     parse,
/// };
///
/// let hir = parse(r"foo|[A-Z]+bar")?;
/// let got = Extractor::new().kind(ExtractKind::Suffix).extract(&hir);
/// // Since 'foo' gets to a match state, it is considered exact. But 'bar'
/// // does not because of the '[A-Z]+', and thus is marked inexact.
/// let expected = Seq::from_iter([
///     Literal::exact("foo"),
///     Literal::inexact("bar"),
/// ]);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Extractor {
    /// Whether to extract prefix or suffix literals.
    kind: ExtractKind,
    /// Maximum sequence length permitted for a single character class.
    limit_class: usize,
    /// Maximum number of repetitions unrolled during extraction.
    limit_repeat: usize,
    /// Maximum length of any single literal in a sequence.
    limit_literal_len: usize,
    /// Maximum total number of literals returned.
    limit_total: usize,
}

impl Extractor {
    /// Create a new extractor with a default configuration.
///
/// The extractor can be optionally configured before calling
/// [`Extractor::extract`] to get a literal sequence.
pub fn new() -> Extractor {
    Extractor {
        kind: ExtractKind::Prefix,
        limit_class: 10,
        limit_repeat: 10,
        limit_literal_len: 100,
        limit_total: 250,
    }
}

/// Execute the extractor and return a sequence of literals.
pub fn extract(&self, hir: &Hir) -> Seq {
    use crate::hir::HirKind::*;

    match *hir.kind() {
        // Both the empty regex and look-around assertions are treated as
        // matching the empty string exactly.
        Empty | Look(_) => Seq::singleton(self::Literal::exact(vec![])),
        Literal(hir::Literal(ref bytes)) => {
            let mut seq =
                Seq::singleton(self::Literal::exact(bytes.to_vec()));
            self.enforce_literal_len(&mut seq);
            seq
        }
        Class(hir::Class::Unicode(ref cls)) => {
            self.extract_class_unicode(cls)
        }
        Class(hir::Class::Bytes(ref cls)) => self.extract_class_bytes(cls),
        Repetition(ref rep) => self.extract_repetition(rep),
        // Capture groups are transparent for literal extraction.
        Capture(hir::Capture { ref sub, .. }) => self.extract(sub),
        Concat(ref hirs) => match self.kind {
            ExtractKind::Prefix => self.extract_concat(hirs.iter()),
            ExtractKind::Suffix => self.extract_concat(hirs.iter().rev()),
        },
        Alternation(ref hirs) => {
            // Unlike concat, we always union starting from the beginning,
            // since the beginning corresponds to the highest preference,
            // which doesn't change based on forwards vs reverse.
            self.extract_alternation(hirs.iter())
        }
    }
}

/// Set the kind of literal sequence to extract from an [`Hir`] expression.
///
/// The default is to extract prefixes, but suffixes can be selected
/// instead. The contract for prefixes is that every match of the
/// corresponding `Hir` must start with one of the literals in the sequence
/// returned. Moreover, the _order_ of the sequence returned corresponds to
/// the preference order.
///
/// Suffixes satisfy a similar contract in that every match of the
/// corresponding `Hir` must end with one of the literals in the sequence
/// returned. However, there is no guarantee that the literals are in
/// preference order.
///
/// Remember that a sequence can be infinite.
For example, unless the
/// limits are configured to be impractically large, attempting to extract
/// prefixes (or suffixes) for the pattern `[A-Z]` will return an infinite
/// sequence. Generally speaking, if the sequence returned is infinite,
/// then it is presumed to be unwise to do prefix (or suffix) optimizations
/// for the pattern.
pub fn kind(&mut self, kind: ExtractKind) -> &mut Extractor {
    self.kind = kind;
    self
}

/// Configure a limit on the length of the sequence that is permitted for
/// a character class. If a character class exceeds this limit, then the
/// sequence returned for it is infinite.
///
/// This prevents classes like `[A-Z]` or `\pL` from getting turned into
/// huge and likely unproductive sequences of literals.
///
/// # Example
///
/// This example shows how this limit can be lowered to decrease the tolerance
/// for character classes being turned into literal sequences.
///
/// ```
/// use regex_syntax::{hir::literal::{Extractor, Seq}, parse};
///
/// let hir = parse(r"[0-9]")?;
///
/// let got = Extractor::new().extract(&hir);
/// let expected = Seq::new([
///     "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
/// ]);
/// assert_eq!(expected, got);
///
/// // Now let's shrink the limit and see how that changes things.
/// let got = Extractor::new().limit_class(4).extract(&hir);
/// let expected = Seq::infinite();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn limit_class(&mut self, limit: usize) -> &mut Extractor {
    self.limit_class = limit;
    self
}

/// Configure a limit on the total number of repetitions that is permitted
/// before literal extraction is stopped.
///
/// This is useful for limiting things like `(abcde){50}`, or more
/// insidiously, `(?:){1000000000}`. This limit prevents any one single
/// repetition from adding too much to a literal sequence.
///
/// With this limit set, repetitions that exceed it will be stopped and any
/// literals extracted up to that point will be made inexact.
///
/// # Example
///
/// This shows how to decrease the limit and compares it with the default.
///
/// ```
/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
///
/// let hir = parse(r"(abc){8}")?;
///
/// let got = Extractor::new().extract(&hir);
/// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]);
/// assert_eq!(expected, got);
///
/// // Now let's shrink the limit and see how that changes things.
/// let got = Extractor::new().limit_repeat(4).extract(&hir);
/// let expected = Seq::from_iter([
///     Literal::inexact("abcabcabcabc"),
/// ]);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn limit_repeat(&mut self, limit: usize) -> &mut Extractor {
    self.limit_repeat = limit;
    self
}

/// Configure a limit on the maximum length of any literal in a sequence.
///
/// This is useful for limiting things like `(abcde){5}{5}{5}{5}`. While
/// each repetition or literal in that regex is small, when all the
/// repetitions are applied, one ends up with a literal of length `5^4 =
/// 625`.
///
/// With this limit set, literals that exceed it will be made inexact and
/// thus prevented from growing.
///
/// # Example
///
/// This shows how to decrease the limit and compares it with the default.
///
/// ```
/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
///
/// let hir = parse(r"(abc){2}{2}{2}")?;
///
/// let got = Extractor::new().extract(&hir);
/// let expected = Seq::new(["abcabcabcabcabcabcabcabc"]);
/// assert_eq!(expected, got);
///
/// // Now let's shrink the limit and see how that changes things.
/// let got = Extractor::new().limit_literal_len(14).extract(&hir);
/// let expected = Seq::from_iter([
///     Literal::inexact("abcabcabcabcab"),
/// ]);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn limit_literal_len(&mut self, limit: usize) -> &mut Extractor {
    self.limit_literal_len = limit;
    self
}

/// Configure a limit on the total number of literals that will be
/// returned.
///
/// This is useful as a practical measure for avoiding the creation of
/// large sequences of literals. While the extractor will automatically
/// handle local creations of large sequences (for example, `[A-Z]` yields
/// an infinite sequence by default), large sequences can be created
/// through non-local means as well.
///
/// For example, `[ab]{3}{3}` would yield a sequence of length `512 = 2^9`
/// despite each of the repetitions being small on their own. This limit
/// thus represents a "catch all" for avoiding locally small sequences from
/// combining into large sequences.
///
/// # Example
///
/// This example shows how reducing the limit will change the literal
/// sequence returned.
///
/// ```
/// use regex_syntax::{hir::literal::{Extractor, Literal, Seq}, parse};
///
/// let hir = parse(r"[ab]{2}{2}")?;
///
/// let got = Extractor::new().extract(&hir);
/// let expected = Seq::new([
///     "aaaa", "aaab", "aaba", "aabb",
///     "abaa", "abab", "abba", "abbb",
///     "baaa", "baab", "baba", "babb",
///     "bbaa", "bbab", "bbba", "bbbb",
/// ]);
/// assert_eq!(expected, got);
///
/// // The default limit is not too big, but big enough to extract all
/// // literals from '[ab]{2}{2}'. If we shrink the limit to less than 16,
/// // then we'll get a truncated set. Notice that it returns a sequence of
/// // length 4 even though our limit was 10. This is because the sequence
/// // is difficult to increase without blowing the limit. Notice also
/// // that every literal in the sequence is now inexact because they were
/// // stripped of some suffix.
/// let got = Extractor::new().limit_total(10).extract(&hir);
/// let expected = Seq::from_iter([
///     Literal::inexact("aa"),
///     Literal::inexact("ab"),
///     Literal::inexact("ba"),
///     Literal::inexact("bb"),
/// ]);
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn limit_total(&mut self, limit: usize) -> &mut Extractor {
    self.limit_total = limit;
    self
}

/// Extract a sequence from the given concatenation. Sequences from each of
/// the child HIR expressions are combined via cross product.
///
/// This short circuits once the cross product turns into a sequence
/// containing only inexact literals.
fn extract_concat<'a, I: Iterator<Item = &'a Hir>>(&self, it: I) -> Seq {
    let mut seq = Seq::singleton(self::Literal::exact(vec![]));
    for hir in it {
        // If every element in the sequence is inexact, then a cross
        // product will always be a no-op. Thus, there is nothing else we
        // can add to it and can quit early. Note that this also includes
        // infinite sequences.
        if seq.is_inexact() {
            break;
        }
        // Note that 'cross' also dispatches based on whether we're
        // extracting prefixes or suffixes.
        seq = self.cross(seq, &mut self.extract(hir));
    }
    seq
}

/// Extract a sequence from the given alternation.
///
/// This short circuits once the union turns into an infinite sequence.
fn extract_alternation<'a, I: Iterator<Item = &'a Hir>>(
    &self,
    it: I,
) -> Seq {
    let mut seq = Seq::empty();
    for hir in it {
        // Once our 'seq' is infinite, every subsequent union
        // operation on it will itself always result in an
        // infinite sequence. Thus, it can never change and we can
        // short-circuit.
        if !seq.is_finite() {
            break;
        }
        seq = self.union(seq, &mut self.extract(hir));
    }
    seq
}

/// Extract a sequence of literals from the given repetition. We do our
/// best. Some examples:
///
///   'a*'    => [inexact(a), exact("")]
///   'a*?'
=> [exact(""), inexact(a)]
///   'a+'    => [inexact(a)]
///   'a{3}'  => [exact(aaa)]
///   'a{3,5} => [inexact(aaa)]
///
/// The key here really is making sure we get the 'inexact' vs 'exact'
/// attributes correct on each of the literals we add. For example, the
/// fact that 'a*' gives us an inexact 'a' and an exact empty string means
/// that a regex like 'ab*c' will result in [inexact(ab), exact(ac)]
/// literals being extracted, which might actually be a better prefilter
/// than just 'a'.
fn extract_repetition(&self, rep: &hir::Repetition) -> Seq {
    let mut subseq = self.extract(&rep.sub);
    match *rep {
        hir::Repetition { min: 0, max, greedy, .. } => {
            // When 'max=1', we can retain exactness, since 'a?' is
            // equivalent to 'a|'. Similarly below, 'a??' is equivalent to
            // '|a'.
            if max != Some(1) {
                subseq.make_inexact();
            }
            let mut empty = Seq::singleton(Literal::exact(vec![]));
            // Non-greedy repetitions prefer the empty match first, so the
            // empty literal takes preference position.
            if !greedy {
                mem::swap(&mut subseq, &mut empty);
            }
            self.union(subseq, &mut empty)
        }
        hir::Repetition { min, max: Some(max), .. } if min == max => {
            assert!(min > 0); // handled above
            let limit =
                u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
            let mut seq = Seq::singleton(Literal::exact(vec![]));
            // Unroll the repetition up to the configured limit.
            for _ in 0..cmp::min(min, limit) {
                if seq.is_inexact() {
                    break;
                }
                seq = self.cross(seq, &mut subseq.clone());
            }
            // If we stopped unrolling early, what we have is only a
            // prefix/suffix of the full repetition.
            if usize::try_from(min).is_err() || min > limit {
                seq.make_inexact();
            }
            seq
        }
        hir::Repetition { min, .. } => {
            assert!(min > 0); // handled above
            let limit =
                u32::try_from(self.limit_repeat).unwrap_or(u32::MAX);
            let mut seq = Seq::singleton(Literal::exact(vec![]));
            for _ in 0..cmp::min(min, limit) {
                if seq.is_inexact() {
                    break;
                }
                seq = self.cross(seq, &mut subseq.clone());
            }
            // An unbounded (or min/max mismatched) repetition can always
            // keep matching, so the result can never be exact.
            seq.make_inexact();
            seq
        }
    }
}

/// Convert the given Unicode class into a sequence of literals if the
/// class is small enough. If the class is too big, return an infinite
/// sequence.
fn extract_class_unicode(&self, cls: &hir::ClassUnicode) -> Seq {
    if self.class_over_limit_unicode(cls) {
        return Seq::infinite();
    }
    // Enumerate every codepoint in the class as its own literal.
    let mut seq = Seq::empty();
    for r in cls.iter() {
        for ch in r.start()..=r.end() {
            seq.push(Literal::from(ch));
        }
    }
    self.enforce_literal_len(&mut seq);
    seq
}

/// Convert the given byte class into a sequence of literals if the class
/// is small enough. If the class is too big, return an infinite sequence.
fn extract_class_bytes(&self, cls: &hir::ClassBytes) -> Seq {
    if self.class_over_limit_bytes(cls) {
        return Seq::infinite();
    }
    // Enumerate every byte in the class as its own literal.
    let mut seq = Seq::empty();
    for r in cls.iter() {
        for b in r.start()..=r.end() {
            seq.push(Literal::from(b));
        }
    }
    self.enforce_literal_len(&mut seq);
    seq
}

/// Returns true if the given Unicode class exceeds the configured limits
/// on this extractor.
fn class_over_limit_unicode(&self, cls: &hir::ClassUnicode) -> bool {
    self.class_over_limit(cls.iter().map(|r| r.len()))
}

/// Returns true if the given byte class exceeds the configured limits on
/// this extractor.
fn class_over_limit_bytes(&self, cls: &hir::ClassBytes) -> bool {
    self.class_over_limit(cls.iter().map(|r| r.len()))
}

/// Returns true if a class whose ranges have the given lengths exceeds
/// the configured class limit on this extractor.
///
/// This is the common implementation behind the Unicode and byte class
/// limit checks, which previously duplicated this loop.
fn class_over_limit(&self, lens: impl Iterator<Item = usize>) -> bool {
    let mut count = 0;
    for len in lens {
        // Check before adding so that a total that has already blown the
        // limit short-circuits without consuming the rest.
        if count > self.limit_class {
            return true;
        }
        count += len;
    }
    count > self.limit_class
}

/// Compute the cross product of the two sequences if the result would be
/// within configured limits. Otherwise, make `seq2` infinite and cross the
/// infinite sequence with `seq1`.
fn cross(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
    if seq1.max_cross_len(seq2).map_or(false, |len| len > self.limit_total)
    {
        seq2.make_infinite();
    }
    if let ExtractKind::Suffix = self.kind {
        seq1.cross_reverse(seq2);
    } else {
        seq1.cross_forward(seq2);
    }
    assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
    self.enforce_literal_len(&mut seq1);
    seq1
}

/// Union the two sequences if the result would be within configured
/// limits.
Otherwise, make `seq2` infinite and union the infinite sequence
/// with `seq1`.
fn union(&self, mut seq1: Seq, seq2: &mut Seq) -> Seq {
    if seq1.max_union_len(seq2).map_or(false, |len| len > self.limit_total)
    {
        // We try to trim our literal sequences to see if we can make
        // room for more literals. The idea is that we'd rather trim down
        // literals already in our sequence if it means we can add a few
        // more and retain a finite sequence. Otherwise, we'll union with
        // an infinite sequence and that infects everything and effectively
        // stops literal extraction in its tracks.
        //
        // Why do we keep 4 bytes here? Well, it's a bit of an abstraction
        // leakage. Downstream, the literals may wind up getting fed to
        // the Teddy algorithm, which supports searching literals up to
        // length 4. So that's why we pick that number here. Arguably this
        // should be a tuneable parameter, but it seems a little tricky to
        // describe. And I'm still unsure if this is the right way to go
        // about culling literal sequences.
        match self.kind {
            ExtractKind::Prefix => {
                seq1.keep_first_bytes(4);
                seq2.keep_first_bytes(4);
            }
            ExtractKind::Suffix => {
                seq1.keep_last_bytes(4);
                seq2.keep_last_bytes(4);
            }
        }
        seq1.dedup();
        seq2.dedup();
        // Re-check after trimming; if we're still over budget, give up
        // and poison the union with an infinite sequence.
        if seq1
            .max_union_len(seq2)
            .map_or(false, |len| len > self.limit_total)
        {
            seq2.make_infinite();
        }
    }
    seq1.union(seq2);
    assert!(seq1.len().map_or(true, |x| x <= self.limit_total));
    seq1
}

/// Applies the literal length limit to the given sequence. If none of the
/// literals in the sequence exceed the limit, then this is a no-op.
fn enforce_literal_len(&self, seq: &mut Seq) {
    let len = self.limit_literal_len;
    match self.kind {
        ExtractKind::Prefix => seq.keep_first_bytes(len),
        ExtractKind::Suffix => seq.keep_last_bytes(len),
    }
}
}

impl Default for Extractor {
    fn default() -> Extractor {
        Extractor::new()
    }
}

/// The kind of literals to extract from an [`Hir`] expression.
///
/// The default extraction kind is `Prefix`.
#[non_exhaustive]
#[derive(Clone, Debug)]
pub enum ExtractKind {
    /// Extracts only prefix literals from a regex.
    Prefix,
    /// Extracts only suffix literals from a regex.
    ///
    /// Note that the sequence returned by suffix literals currently may
    /// not correctly represent leftmost-first or "preference" order match
    /// semantics.
    Suffix,
}

impl ExtractKind {
    /// Returns true if this kind is the `Prefix` variant.
    pub fn is_prefix(&self) -> bool {
        matches!(*self, ExtractKind::Prefix)
    }

    /// Returns true if this kind is the `Suffix` variant.
    pub fn is_suffix(&self) -> bool {
        matches!(*self, ExtractKind::Suffix)
    }
}

impl Default for ExtractKind {
    fn default() -> ExtractKind {
        ExtractKind::Prefix
    }
}

/// A sequence of literals.
///
/// A `Seq` is very much like a set in that it represents a union of its
/// members. That is, it corresponds to a set of literals where at least one
/// must match in order for a particular [`Hir`] expression to match. (Whether
/// this corresponds to the entire `Hir` expression, a prefix of it or a suffix
/// of it depends on how the `Seq` was extracted from the `Hir`.)
///
/// It is also unlike a set in that multiple identical literals may appear,
/// and that the order of the literals in the `Seq` matters. For example, if
/// the sequence is `[sam, samwise]` and leftmost-first matching is used, then
/// `samwise` can never match and the sequence is equivalent to `[sam]`.
///
/// # States of a sequence
///
/// A `Seq` has a few different logical states to consider:
///
/// * The sequence can represent "any" literal. When this happens, the set does
/// not have a finite size. The purpose of this state is to inhibit callers
/// from making assumptions about what literals are required in order to match
/// a particular [`Hir`] expression. Generally speaking, when a set is in this
/// state, literal optimizations are inhibited. A good example of a regex that
/// will cause this sort of set to appear is `[A-Za-z]`. The character class
/// is just too big (and also too narrow) to be usefully expanded into 52
/// different literals. (Note that the decision for when a seq should become
/// infinite is determined by the caller. A seq itself has no hard-coded
/// limits.)
/// * The sequence can be empty, in which case, it is an affirmative statement
/// that there are no literals that can match the corresponding `Hir`.
/// Consequently, the `Hir` never matches any input. For example, `[a&&b]`.
/// * The sequence can be non-empty, in which case, at least one of the
/// literals must match in order for the corresponding `Hir` to match.
///
/// # Example
///
/// This example shows how literal sequences can be simplified by stripping
/// suffixes and minimizing while maintaining preference order.
///
/// ```
/// use regex_syntax::hir::literal::{Literal, Seq};
///
/// let mut seq = Seq::new(&[
///     "farm",
///     "appliance",
///     "faraway",
///     "apple",
///     "fare",
///     "gap",
///     "applicant",
///     "applaud",
/// ]);
/// seq.keep_first_bytes(3);
/// seq.minimize_by_preference();
/// // Notice that 'far' comes before 'app', which matches the order in the
/// // original sequence. This guarantees that leftmost-first semantics are
/// // not altered by simplifying the set.
/// let expected = Seq::from_iter([
///     Literal::inexact("far"),
///     Literal::inexact("app"),
///     Literal::exact("gap"),
/// ]);
/// assert_eq!(expected, seq);
/// ```
#[derive(Clone, Eq, PartialEq)]
pub struct Seq {
    /// The members of this seq.
    ///
    /// When `None`, the seq represents all possible literals. That is, it
    /// prevents one from making assumptions about specific literals in the
    /// seq, and forces one to treat it as if any literal might be in the seq.
    ///
    /// Note that `Some(vec![])` is valid and corresponds to the empty seq of
    /// literals, i.e., a regex that can never match. For example, `[a&&b]`.
    /// It is distinct from `Some(vec![""])`, which corresponds to the seq
    /// containing an empty string, which matches at every position.
    literals: Option<Vec<Literal>>,
}

impl Seq {
    /// Returns an empty sequence.
    ///
    /// An empty sequence matches zero literals, and thus corresponds to a
    /// regex that itself can never match.
    #[inline]
    pub fn empty() -> Seq {
        Seq { literals: Some(vec![]) }
    }

    /// Returns a sequence of literals without a finite size and may contain
    /// any literal.
    ///
    /// A sequence without finite size does not reveal anything about the
    /// characteristics of the literals in its set. There are no fixed
    /// prefixes or suffixes, nor are lower or upper bounds on the length of
    /// the literals in the set known.
    ///
    /// This is useful to represent constructs in a regex that are "too big"
    /// to usefully represent as a sequence of literals. For example,
    /// `[A-Za-z]`. When sequences get too big, they lose their discriminating
    /// nature and are more likely to produce false positives, which in turn
    /// makes them less likely to speed up searches.
    ///
    /// More pragmatically, for many regexes, enumerating all possible
    /// literals is itself not possible or might otherwise use too many
    /// resources. So constraining the size of sets during extraction is a
    /// practical trade off to make.
    #[inline]
    pub fn infinite() -> Seq {
        Seq { literals: None }
    }

    /// Returns a sequence containing a single literal.
    #[inline]
    pub fn singleton(lit: Literal) -> Seq {
        Seq { literals: Some(vec![lit]) }
    }

    /// Returns a sequence of exact literals from the given byte strings.
    #[inline]
    pub fn new<I, B>(it: I) -> Seq
    where
        I: IntoIterator<Item = B>,
        B: AsRef<[u8]>,
    {
        it.into_iter().map(|b| Literal::exact(b.as_ref())).collect()
    }

    /// If this is a finite sequence, return its members as a slice of
    /// literals.
    ///
    /// The slice returned may be empty, in which case, there are no literals
    /// that can match this sequence.
    #[inline]
    pub fn literals(&self) -> Option<&[Literal]> {
        self.literals.as_deref()
    }

    /// Push a literal to the end of this sequence.
    ///
    /// If this sequence is not finite, then this is a no-op.
    ///
    /// Similarly, if the most recently added item of this sequence is
    /// equivalent to the literal given, then it is not added. This reflects
    /// a `Seq`'s "set like" behavior, and represents a practical trade off.
    /// Namely, there is never any need to have two adjacent and equivalent
    /// literals in the same sequence, _and_ it is easy to detect in some
    /// cases.
    #[inline]
    pub fn push(&mut self, lit: Literal) {
        let lits = match self.literals {
            None => return,
            Some(ref mut lits) => lits,
        };
        // Skip the push only when the new literal is identical to the most
        // recently added one; non-adjacent duplicates are permitted.
        if lits.last().map_or(false, |m| m == &lit) {
            return;
        }
        lits.push(lit);
    }

    /// Make all of the literals in this sequence inexact.
    ///
    /// This is a no-op if this sequence is not finite.
    #[inline]
    pub fn make_inexact(&mut self) {
        let lits = match self.literals {
            None => return,
            Some(ref mut lits) => lits,
        };
        for lit in lits.iter_mut() {
            lit.make_inexact();
        }
    }

    /// Converts this sequence to an infinite sequence.
    ///
    /// This is a no-op if the sequence is already infinite.
    #[inline]
    pub fn make_infinite(&mut self) {
        self.literals = None;
    }

    /// Modify this sequence to contain the cross product between it and the
    /// sequence given.
    ///
    /// The cross product only considers literals in this sequence that are
    /// exact. That is, inexact literals are not extended.
    ///
    /// The literals are always drained from `other`, even if none are used.
    /// This permits callers to reuse the sequence allocation elsewhere.
    ///
    /// If this sequence is infinite, then this is a no-op, regardless of what
    /// `other` contains (and in this case, the literals are still drained from
    /// `other`). If `other` is infinite and this sequence is finite, then this
    /// is a no-op, unless this sequence contains a zero-length literal. In
    /// which case, the infiniteness of `other` infects this sequence, and this
    /// sequence is itself made infinite.
    ///
    /// Like [`Seq::union`], this may attempt to deduplicate literals. See
    /// [`Seq::dedup`] for how deduplication deals with exact and inexact
    /// literals.
    ///
    /// # Example
    ///
    /// This example shows basic usage and how exact and inexact literals
    /// interact.
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// let mut seq1 = Seq::from_iter([
    ///     Literal::exact("foo"),
    ///     Literal::inexact("bar"),
    /// ]);
    /// let mut seq2 = Seq::from_iter([
    ///     Literal::inexact("quux"),
    ///     Literal::exact("baz"),
    /// ]);
    /// seq1.cross_forward(&mut seq2);
    ///
    /// // The literals are pulled out of seq2.
    /// assert_eq!(Some(0), seq2.len());
    ///
    /// let expected = Seq::from_iter([
    ///     Literal::inexact("fooquux"),
    ///     Literal::exact("foobaz"),
    ///     Literal::inexact("bar"),
    /// ]);
    /// assert_eq!(expected, seq1);
    /// ```
    ///
    /// This example shows the behavior of when `other` is an infinite
    /// sequence.
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// let mut seq1 = Seq::from_iter([
    ///     Literal::exact("foo"),
    ///     Literal::inexact("bar"),
    /// ]);
    /// let mut seq2 = Seq::infinite();
    /// seq1.cross_forward(&mut seq2);
    ///
    /// // When seq2 is infinite, cross product doesn't add anything, but
    /// // ensures all members of seq1 are inexact.
    /// let expected = Seq::from_iter([
    ///     Literal::inexact("foo"),
    ///     Literal::inexact("bar"),
    /// ]);
    /// assert_eq!(expected, seq1);
    /// ```
    ///
    /// This example is like the one above, but shows what happens when this
    /// sequence contains an empty string. In this case, an infinite `other`
    /// sequence infects this sequence (because the empty string means that
    /// there are no finite prefixes):
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// let mut seq1 = Seq::from_iter([
    ///     Literal::exact("foo"),
    ///     Literal::exact(""), // inexact provokes same behavior
    ///     Literal::inexact("bar"),
    /// ]);
    /// let mut seq2 = Seq::infinite();
    /// seq1.cross_forward(&mut seq2);
    ///
    /// // seq1 is now infinite!
    /// assert!(!seq1.is_finite());
    /// ```
    ///
    /// This example shows the behavior when this sequence is infinite.
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// let mut seq1 = Seq::infinite();
    /// let mut seq2 = Seq::from_iter([
    ///     Literal::exact("foo"),
    ///     Literal::inexact("bar"),
    /// ]);
    /// seq1.cross_forward(&mut seq2);
    ///
    /// // seq1 remains unchanged.
    /// assert!(!seq1.is_finite());
    /// // Even though the literals in seq2 weren't used, it was still drained.
    /// assert_eq!(Some(0), seq2.len());
    /// ```
    #[inline]
    pub fn cross_forward(&mut self, other: &mut Seq) {
        // cross_preamble handles the infinite cases; when it returns None
        // there is nothing further to do.
        let (lits1, lits2) = match self.cross_preamble(other) {
            None => return,
            Some((lits1, lits2)) => (lits1, lits2),
        };
        let newcap = lits1.len().saturating_mul(lits2.len());
        for selflit in mem::replace(lits1, Vec::with_capacity(newcap)) {
            // Inexact literals cannot be extended; keep them as-is.
            if !selflit.is_exact() {
                lits1.push(selflit);
                continue;
            }
            for otherlit in lits2.iter() {
                let mut newlit = Literal::exact(Vec::with_capacity(
                    selflit.len() + otherlit.len(),
                ));
                newlit.extend(&selflit);
                newlit.extend(&otherlit);
                if !otherlit.is_exact() {
                    newlit.make_inexact();
                }
                lits1.push(newlit);
            }
        }
        lits2.drain(..);
        self.dedup();
    }

    /// Modify this sequence to contain the cross product between it and
    /// the sequence given, where the sequences are treated as suffixes
    /// instead of prefixes. Namely, the sequence `other` is *prepended*
    /// to `self` (as opposed to `other` being *appended* to `self` in
    /// [`Seq::cross_forward`]).
/// /// The cross product only considers literals in this sequence that are /// exact. That is, inexact literals are not extended. /// /// The literals are always drained from `other`, even if none are used. /// This permits callers to reuse the sequence allocation elsewhere. /// /// If this sequence is infinite, then this is a no-op, regardless of what /// `other` contains (and in this case, the literals are still drained from /// `other`). If `other` is infinite and this sequence is finite, then this /// is a no-op, unless this sequence contains a zero-length literal. In /// which case, the infiniteness of `other` infects this sequence, and this /// sequence is itself made infinite. /// /// Like [`Seq::union`], this may attempt to deduplicate literals. See /// [`Seq::dedup`] for how deduplication deals with exact and inexact /// literals. /// /// # Example /// /// This example shows basic usage and how exact and inexact literals /// interact. /// /// ``` /// use regex_syntax::hir::literal::{Literal, Seq}; /// /// let mut seq1 = Seq::from_iter([ /// Literal::exact("foo"), /// Literal::inexact("bar"), /// ]); /// let mut seq2 = Seq::from_iter([ /// Literal::inexact("quux"), /// Literal::exact("baz"), /// ]); /// seq1.cross_reverse(&mut seq2); /// /// // The literals are pulled out of seq2. /// assert_eq!(Some(0), seq2.len()); /// /// let expected = Seq::from_iter([ /// Literal::inexact("quuxfoo"), /// Literal::inexact("bar"), /// Literal::exact("bazfoo"), /// ]); /// assert_eq!(expected, seq1); /// ``` /// /// This example shows the behavior of when `other` is an infinite /// sequence. /// /// ``` /// use regex_syntax::hir::literal::{Literal, Seq}; /// /// let mut seq1 = Seq::from_iter([ /// Literal::exact("foo"), /// Literal::inexact("bar"), /// ]); /// let mut seq2 = Seq::infinite(); /// seq1.cross_reverse(&mut seq2); /// /// // When seq2 is infinite, cross product doesn't add anything, but /// // ensures all members of seq1 are inexact. 
/// let expected = Seq::from_iter([ /// Literal::inexact("foo"), /// Literal::inexact("bar"), /// ]); /// assert_eq!(expected, seq1); /// ``` /// /// This example is like the one above, but shows what happens when this /// sequence contains an empty string. In this case, an infinite `other` /// sequence infects this sequence (because the empty string means that /// there are no finite suffixes): /// /// ``` /// use regex_syntax::hir::literal::{Literal, Seq}; /// /// let mut seq1 = Seq::from_iter([ /// Literal::exact("foo"), /// Literal::exact(""), // inexact provokes same behavior /// Literal::inexact("bar"), /// ]); /// let mut seq2 = Seq::infinite(); /// seq1.cross_reverse(&mut seq2); /// /// // seq1 is now infinite! /// assert!(!seq1.is_finite()); /// ``` /// /// This example shows the behavior when this sequence is infinite. /// /// ``` /// use regex_syntax::hir::literal::{Literal, Seq}; /// /// let mut seq1 = Seq::infinite(); /// let mut seq2 = Seq::from_iter([ /// Literal::exact("foo"), /// Literal::inexact("bar"), /// ]); /// seq1.cross_reverse(&mut seq2); /// /// // seq1 remains unchanged. /// assert!(!seq1.is_finite()); /// // Even though the literals in seq2 weren't used, it was still drained. /// assert_eq!(Some(0), seq2.len()); /// ``` #[inline] pub fn cross_reverse(&mut self, other: &mut Seq) { let (lits1, lits2) = match self.cross_preamble(other) { None => return, Some((lits1, lits2)) => (lits1, lits2), }; // We basically proceed as we do in 'cross_forward' at this point, // except that the outer loop is now 'other' and the inner loop is now // 'self'. That's because 'self' corresponds to suffixes and 'other' // corresponds to the sequence we want to *prepend* to the suffixes. 
let newcap = lits1.len().saturating_mul(lits2.len()); let selflits = mem::replace(lits1, Vec::with_capacity(newcap)); for (i, otherlit) in lits2.drain(..).enumerate() { for selflit in selflits.iter() { if !selflit.is_exact() { // If the suffix isn't exact, then we can't prepend // anything to it. However, we still want to keep it. But // we only want to keep one of them, to avoid duplication. // (The duplication is okay from a correctness perspective, // but wasteful.) if i == 0 { lits1.push(selflit.clone()); } continue; } let mut newlit = Literal::exact(Vec::with_capacity( otherlit.len() + selflit.len(), )); newlit.extend(&otherlit); newlit.extend(&selflit); if !otherlit.is_exact() { newlit.make_inexact(); } lits1.push(newlit); } } self.dedup(); } /// A helper function the corresponds to the subtle preamble for both /// `cross_forward` and `cross_reverse`. In effect, it handles the cases /// of infinite sequences for both `self` and `other`, as well as ensuring /// that literals from `other` are drained even if they aren't used. fn cross_preamble<'a>( &'a mut self, other: &'a mut Seq, ) -> Option<(&'a mut Vec, &'a mut Vec)> { let lits2 = match other.literals { None => { // If our current seq contains the empty string and the seq // we're adding matches any literal, then it follows that the // current seq must now also match any literal. // // Otherwise, we just have to make sure everything in this // sequence is inexact. if self.min_literal_len() == Some(0) { *self = Seq::infinite(); } else { self.make_inexact(); } return None; } Some(ref mut lits) => lits, }; let lits1 = match self.literals { None => { // If we aren't going to make it to the end of this routine // where lits2 is drained, then we need to do it now. lits2.drain(..); return None; } Some(ref mut lits) => lits, }; Some((lits1, lits2)) } /// Unions the `other` sequence into this one. 
    ///
    /// The literals are always drained out of the given `other` sequence,
    /// even if they are being unioned into an infinite sequence. This permits
    /// the caller to reuse the `other` sequence in another context.
    ///
    /// Some literal deduping may be performed. If any deduping happens,
    /// any leftmost-first or "preference" order match semantics will be
    /// preserved.
    ///
    /// # Example
    ///
    /// This example shows basic usage.
    ///
    /// ```
    /// use regex_syntax::hir::literal::Seq;
    ///
    /// let mut seq1 = Seq::new(&["foo", "bar"]);
    /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
    /// seq1.union(&mut seq2);
    ///
    /// // The literals are pulled out of seq2.
    /// assert_eq!(Some(0), seq2.len());
    ///
    /// // Adjacent literals are deduped, but non-adjacent literals may not be.
    /// assert_eq!(Seq::new(&["foo", "bar", "quux", "foo"]), seq1);
    /// ```
    ///
    /// This example shows that literals are drained from `other` even when
    /// they aren't necessarily used.
    ///
    /// ```
    /// use regex_syntax::hir::literal::Seq;
    ///
    /// let mut seq1 = Seq::infinite();
    /// // Infinite sequences have no finite length.
    /// assert_eq!(None, seq1.len());
    ///
    /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
    /// seq1.union(&mut seq2);
    ///
    /// // seq1 is still infinite and seq2 has been drained.
    /// assert_eq!(None, seq1.len());
    /// assert_eq!(Some(0), seq2.len());
    /// ```
    #[inline]
    pub fn union(&mut self, other: &mut Seq) {
        let lits2 = match other.literals {
            None => {
                // Unioning with an infinite sequence always results in an
                // infinite sequence.
                self.make_infinite();
                return;
            }
            // Draining here (rather than borrowing) guarantees `other` is
            // emptied even on the early-return path below.
            Some(ref mut lits) => lits.drain(..),
        };
        let lits1 = match self.literals {
            None => return,
            Some(ref mut lits) => lits,
        };
        lits1.extend(lits2);
        self.dedup();
    }

    /// Unions the `other` sequence into this one by splice the `other`
    /// sequence at the position of the first zero-length literal.
    ///
    /// This is useful for preserving preference order semantics when combining
    /// two literal sequences. For example, in the regex `(a||f)+foo`, the
    /// correct preference order prefix sequence is `[a, foo, f]`.
    ///
    /// The literals are always drained out of the given `other` sequence,
    /// even if they are being unioned into an infinite sequence. This permits
    /// the caller to reuse the `other` sequence in another context. Note that
    /// the literals are drained even if no union is performed as well, i.e.,
    /// when this sequence does not contain a zero-length literal.
    ///
    /// Some literal deduping may be performed. If any deduping happens,
    /// any leftmost-first or "preference" order match semantics will be
    /// preserved.
    ///
    /// # Example
    ///
    /// This example shows basic usage.
    ///
    /// ```
    /// use regex_syntax::hir::literal::Seq;
    ///
    /// let mut seq1 = Seq::new(&["a", "", "f", ""]);
    /// let mut seq2 = Seq::new(&["foo"]);
    /// seq1.union_into_empty(&mut seq2);
    ///
    /// // The literals are pulled out of seq2.
    /// assert_eq!(Some(0), seq2.len());
    /// // 'foo' gets spliced into seq1 where the first empty string occurs.
    /// assert_eq!(Seq::new(&["a", "foo", "f"]), seq1);
    /// ```
    ///
    /// This example shows that literals are drained from `other` even when
    /// they aren't necessarily used.
    ///
    /// ```
    /// use regex_syntax::hir::literal::Seq;
    ///
    /// let mut seq1 = Seq::new(&["foo", "bar"]);
    /// let mut seq2 = Seq::new(&["bar", "quux", "foo"]);
    /// seq1.union_into_empty(&mut seq2);
    ///
    /// // seq1 has no zero length literals, so no splicing happens.
    /// assert_eq!(Seq::new(&["foo", "bar"]), seq1);
    /// // Even though no splicing happens, seq2 is still drained.
    /// assert_eq!(Some(0), seq2.len());
    /// ```
    #[inline]
    pub fn union_into_empty(&mut self, other: &mut Seq) {
        let lits2 = other.literals.as_mut().map(|lits| lits.drain(..));
        let lits1 = match self.literals {
            None => return,
            Some(ref mut lits) => lits,
        };
        let first_empty = match lits1.iter().position(|m| m.is_empty()) {
            None => return,
            Some(i) => i,
        };
        let lits2 = match lits2 {
            None => {
                // Note that we are only here if we've found an empty literal,
                // which implies that an infinite sequence infects this seq and
                // also turns it into an infinite sequence.
                self.literals = None;
                return;
            }
            Some(lits) => lits,
        };
        // Clearing out the empties needs to come before the splice because
        // the splice might add more empties that we don't want to get rid
        // of. Since we're splicing into the position of the first empty, the
        // 'first_empty' position computed above is still correct.
        lits1.retain(|m| !m.is_empty());
        lits1.splice(first_empty..first_empty, lits2);
        self.dedup();
    }

    /// Deduplicate adjacent equivalent literals in this sequence.
    ///
    /// If adjacent literals are equivalent strings but one is exact and the
    /// other inexact, the inexact literal is kept and the exact one is
    /// removed.
    ///
    /// Deduping an infinite sequence is a no-op.
    ///
    /// # Example
    ///
    /// This example shows how literals that are duplicate byte strings but
    /// are not equivalent with respect to exactness are resolved.
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// let mut seq = Seq::from_iter([
    ///     Literal::exact("foo"),
    ///     Literal::inexact("foo"),
    /// ]);
    /// seq.dedup();
    ///
    /// assert_eq!(Seq::from_iter([Literal::inexact("foo")]), seq);
    /// ```
    #[inline]
    pub fn dedup(&mut self) {
        if let Some(ref mut lits) = self.literals {
            lits.dedup_by(|lit1, lit2| {
                if lit1.as_bytes() != lit2.as_bytes() {
                    return false;
                }
                // Equivalent bytes but differing exactness: keep the literal
                // but downgrade it to inexact before removing the duplicate.
                if lit1.is_exact() != lit2.is_exact() {
                    lit1.make_inexact();
                    lit2.make_inexact();
                }
                true
            });
        }
    }

    /// Sorts this sequence of literals lexicographically.
    ///
    /// Note that if, before sorting, if a literal that is a prefix of another
    /// literal appears after it, then after sorting, the sequence will not
    /// represent the same preference order match semantics. For example,
    /// sorting the sequence `[samwise, sam]` yields the sequence `[sam,
    /// samwise]`. Under preference order semantics, the latter sequence will
    /// never match `samwise` whereas the first sequence can.
    ///
    /// # Example
    ///
    /// This example shows basic usage.
    ///
    /// ```
    /// use regex_syntax::hir::literal::Seq;
    ///
    /// let mut seq = Seq::new(&["foo", "quux", "bar"]);
    /// seq.sort();
    ///
    /// assert_eq!(Seq::new(&["bar", "foo", "quux"]), seq);
    /// ```
    #[inline]
    pub fn sort(&mut self) {
        if let Some(ref mut lits) = self.literals {
            lits.sort();
        }
    }

    /// Reverses all of the literals in this sequence.
    ///
    /// The order of the sequence itself is preserved.
    ///
    /// # Example
    ///
    /// This example shows basic usage.
    ///
    /// ```
    /// use regex_syntax::hir::literal::Seq;
    ///
    /// let mut seq = Seq::new(&["oof", "rab"]);
    /// seq.reverse_literals();
    /// assert_eq!(Seq::new(&["foo", "bar"]), seq);
    /// ```
    #[inline]
    pub fn reverse_literals(&mut self) {
        if let Some(ref mut lits) = self.literals {
            for lit in lits.iter_mut() {
                lit.reverse();
            }
        }
    }

    /// Shrinks this seq to its minimal size while respecting the preference
    /// order of its literals.
    ///
    /// While this routine will remove duplicate literals from this seq, it
    /// will also remove literals that can never match in a leftmost-first or
    /// "preference order" search. Similar to [`Seq::dedup`], if a literal is
    /// deduped, then the one that remains is made inexact.
    ///
    /// This is a no-op on seqs that are empty or not finite.
    ///
    /// # Example
    ///
    /// This example shows the difference between `{sam, samwise}` and
    /// `{samwise, sam}`.
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// // If 'sam' comes before 'samwise' and a preference order search is
    /// // executed, then 'samwise' can never match.
    /// let mut seq = Seq::new(&["sam", "samwise"]);
    /// seq.minimize_by_preference();
    /// assert_eq!(Seq::from_iter([Literal::inexact("sam")]), seq);
    ///
    /// // But if they are reversed, then it's possible for 'samwise' to match
    /// // since it is given higher preference.
    /// let mut seq = Seq::new(&["samwise", "sam"]);
    /// seq.minimize_by_preference();
    /// assert_eq!(Seq::new(&["samwise", "sam"]), seq);
    /// ```
    ///
    /// This example shows that if an empty string is in this seq, then
    /// anything that comes after it can never match.
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// // An empty string is a prefix of all strings, so it automatically
    /// // inhibits any subsequent strings from matching.
    /// let mut seq = Seq::new(&["foo", "bar", "", "quux", "fox"]);
    /// seq.minimize_by_preference();
    /// let expected = Seq::from_iter([
    ///     Literal::exact("foo"),
    ///     Literal::exact("bar"),
    ///     Literal::inexact(""),
    /// ]);
    /// assert_eq!(expected, seq);
    ///
    /// // And of course, if it's at the beginning, then it makes it impossible
    /// // for anything else to match.
    /// let mut seq = Seq::new(&["", "foo", "quux", "fox"]);
    /// seq.minimize_by_preference();
    /// assert_eq!(Seq::from_iter([Literal::inexact("")]), seq);
    /// ```
    #[inline]
    pub fn minimize_by_preference(&mut self) {
        if let Some(ref mut lits) = self.literals {
            // 'false' disables the "keep exact" mode; deduped survivors are
            // made inexact so further extraction remains sound.
            PreferenceTrie::minimize(lits, false);
        }
    }

    /// Trims all literals in this seq such that only the first `len` bytes
    /// remain. If a literal has less than or equal to `len` bytes, then it
    /// remains unchanged. Otherwise, it is trimmed and made inexact.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// let mut seq = Seq::new(&["a", "foo", "quux"]);
    /// seq.keep_first_bytes(2);
    ///
    /// let expected = Seq::from_iter([
    ///     Literal::exact("a"),
    ///     Literal::inexact("fo"),
    ///     Literal::inexact("qu"),
    /// ]);
    /// assert_eq!(expected, seq);
    /// ```
    #[inline]
    pub fn keep_first_bytes(&mut self, len: usize) {
        if let Some(ref mut lits) = self.literals {
            for m in lits.iter_mut() {
                m.keep_first_bytes(len);
            }
        }
    }

    /// Trims all literals in this seq such that only the last `len` bytes
    /// remain. If a literal has less than or equal to `len` bytes, then it
    /// remains unchanged. Otherwise, it is trimmed and made inexact.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_syntax::hir::literal::{Literal, Seq};
    ///
    /// let mut seq = Seq::new(&["a", "foo", "quux"]);
    /// seq.keep_last_bytes(2);
    ///
    /// let expected = Seq::from_iter([
    ///     Literal::exact("a"),
    ///     Literal::inexact("oo"),
    ///     Literal::inexact("ux"),
    /// ]);
    /// assert_eq!(expected, seq);
    /// ```
    #[inline]
    pub fn keep_last_bytes(&mut self, len: usize) {
        if let Some(ref mut lits) = self.literals {
            for m in lits.iter_mut() {
                m.keep_last_bytes(len);
            }
        }
    }

    /// Returns true if this sequence is finite.
    ///
    /// When false, this sequence is infinite and must be treated as if it
    /// contains every possible literal.
    #[inline]
    pub fn is_finite(&self) -> bool {
        self.literals.is_some()
    }

    /// Returns true if and only if this sequence is finite and empty.
    ///
    /// An empty sequence never matches anything. It can only be produced by
    /// literal extraction when the corresponding regex itself cannot match.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len() == Some(0)
    }

    /// Returns the number of literals in this sequence if the sequence is
    /// finite. If the sequence is infinite, then `None` is returned.
#[inline] pub fn len(&self) -> Option { self.literals.as_ref().map(|lits| lits.len()) } /// Returns true if and only if all literals in this sequence are exact. /// /// This returns false if the sequence is infinite. #[inline] pub fn is_exact(&self) -> bool { self.literals().map_or(false, |lits| lits.iter().all(|x| x.is_exact())) } /// Returns true if and only if all literals in this sequence are inexact. /// /// This returns true if the sequence is infinite. #[inline] pub fn is_inexact(&self) -> bool { self.literals().map_or(true, |lits| lits.iter().all(|x| !x.is_exact())) } /// Return the maximum length of the sequence that would result from /// unioning `self` with `other`. If either set is infinite, then this /// returns `None`. #[inline] pub fn max_union_len(&self, other: &Seq) -> Option { let len1 = self.len()?; let len2 = other.len()?; Some(len1.saturating_add(len2)) } /// Return the maximum length of the sequence that would result from the /// cross product of `self` with `other`. If either set is infinite, then /// this returns `None`. #[inline] pub fn max_cross_len(&self, other: &Seq) -> Option { let len1 = self.len()?; let len2 = other.len()?; Some(len1.saturating_mul(len2)) } /// Returns the length of the shortest literal in this sequence. /// /// If the sequence is infinite or empty, then this returns `None`. #[inline] pub fn min_literal_len(&self) -> Option { self.literals.as_ref()?.iter().map(|x| x.len()).min() } /// Returns the length of the longest literal in this sequence. /// /// If the sequence is infinite or empty, then this returns `None`. #[inline] pub fn max_literal_len(&self) -> Option { self.literals.as_ref()?.iter().map(|x| x.len()).max() } /// Returns the longest common prefix from this seq. /// /// If the seq matches any literal or other contains no literals, then /// there is no meaningful prefix and this returns `None`. /// /// # Example /// /// This shows some example seqs and their longest common prefix. 
/// /// ``` /// use regex_syntax::hir::literal::Seq; /// /// let seq = Seq::new(&["foo", "foobar", "fo"]); /// assert_eq!(Some(&b"fo"[..]), seq.longest_common_prefix()); /// let seq = Seq::new(&["foo", "foo"]); /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_prefix()); /// let seq = Seq::new(&["foo", "bar"]); /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); /// let seq = Seq::new(&[""]); /// assert_eq!(Some(&b""[..]), seq.longest_common_prefix()); /// /// let seq = Seq::infinite(); /// assert_eq!(None, seq.longest_common_prefix()); /// let seq = Seq::empty(); /// assert_eq!(None, seq.longest_common_prefix()); /// ``` #[inline] pub fn longest_common_prefix(&self) -> Option<&[u8]> { // If we match everything or match nothing, then there's no meaningful // longest common prefix. let lits = match self.literals { None => return None, Some(ref lits) => lits, }; if lits.len() == 0 { return None; } let base = lits[0].as_bytes(); let mut len = base.len(); for m in lits.iter().skip(1) { len = m .as_bytes() .iter() .zip(base[..len].iter()) .take_while(|&(a, b)| a == b) .count(); if len == 0 { return Some(&[]); } } Some(&base[..len]) } /// Returns the longest common suffix from this seq. /// /// If the seq matches any literal or other contains no literals, then /// there is no meaningful suffix and this returns `None`. /// /// # Example /// /// This shows some example seqs and their longest common suffix. 
/// /// ``` /// use regex_syntax::hir::literal::Seq; /// /// let seq = Seq::new(&["oof", "raboof", "of"]); /// assert_eq!(Some(&b"of"[..]), seq.longest_common_suffix()); /// let seq = Seq::new(&["foo", "foo"]); /// assert_eq!(Some(&b"foo"[..]), seq.longest_common_suffix()); /// let seq = Seq::new(&["foo", "bar"]); /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); /// let seq = Seq::new(&[""]); /// assert_eq!(Some(&b""[..]), seq.longest_common_suffix()); /// /// let seq = Seq::infinite(); /// assert_eq!(None, seq.longest_common_suffix()); /// let seq = Seq::empty(); /// assert_eq!(None, seq.longest_common_suffix()); /// ``` #[inline] pub fn longest_common_suffix(&self) -> Option<&[u8]> { // If we match everything or match nothing, then there's no meaningful // longest common suffix. let lits = match self.literals { None => return None, Some(ref lits) => lits, }; if lits.len() == 0 { return None; } let base = lits[0].as_bytes(); let mut len = base.len(); for m in lits.iter().skip(1) { len = m .as_bytes() .iter() .rev() .zip(base[base.len() - len..].iter().rev()) .take_while(|&(a, b)| a == b) .count(); if len == 0 { return Some(&[]); } } Some(&base[base.len() - len..]) } /// Optimizes this seq while treating its literals as prefixes and /// respecting the preference order of its literals. /// /// The specific way "optimization" works is meant to be an implementation /// detail, as it essentially represents a set of heuristics. The goal /// that optimization tries to accomplish is to make the literals in this /// set reflect inputs that will result in a more effective prefilter. /// Principally by reducing the false positive rate of candidates found by /// the literals in this sequence. That is, when a match of a literal is /// found, we would like it to be a strong predictor of the overall match /// of the regex. 
/// If it isn't, then much time will be spent starting and
/// stopping the prefilter search and attempting to confirm the match only
/// to have it fail.
///
/// Some of those heuristics might be:
///
/// * Identifying a common prefix from a larger sequence of literals, and
/// shrinking the sequence down to that single common prefix.
/// * Rejecting the sequence entirely if it is believed to result in very
/// high false positive rate. When this happens, the sequence is made
/// infinite.
/// * Shrinking the sequence to a smaller number of literals representing
/// prefixes, but not shrinking it so much as to make literals too short.
/// (A sequence with very short literals, of 1 or 2 bytes, will typically
/// result in a higher false positive rate.)
///
/// Optimization should only be run once extraction is complete. Namely,
/// optimization may make assumptions that do not compose with other
/// operations in the middle of extraction. For example, optimization will
/// reduce `[E(sam), E(samwise)]` to `[E(sam)]`, but such a transformation
/// is only valid if no other extraction will occur. If other extraction
/// may occur, then the correct transformation would be to `[I(sam)]`.
///
/// The [`Seq::optimize_for_suffix_by_preference`] does the same thing, but
/// for suffixes.
///
/// # Example
///
/// This shows how optimization might transform a sequence. Note that
/// the specific behavior is not a documented guarantee. The heuristics
/// used are an implementation detail and may change over time in semver
/// compatible releases.
///
/// ```
/// use regex_syntax::hir::literal::{Seq, Literal};
///
/// let mut seq = Seq::new(&[
///     "samantha",
///     "sam",
///     "samwise",
///     "frodo",
/// ]);
/// seq.optimize_for_prefix_by_preference();
/// assert_eq!(Seq::from_iter([
///     Literal::exact("samantha"),
///     // Kept exact even though 'samwise' got pruned
///     // because optimization assumes literal extraction
///     // has finished.
///     Literal::exact("sam"),
///     Literal::exact("frodo"),
/// ]), seq);
/// ```
///
/// # Example: optimization may make the sequence infinite
///
/// If the heuristics deem that the sequence could cause a very high false
/// positive rate, then it may make the sequence infinite, effectively
/// disabling its use as a prefilter.
///
/// ```
/// use regex_syntax::hir::literal::{Seq, Literal};
///
/// let mut seq = Seq::new(&[
///     "samantha",
///     // An empty string matches at every position,
///     // thus rendering the prefilter completely
///     // ineffective.
///     "",
///     "sam",
///     "samwise",
///     "frodo",
/// ]);
/// seq.optimize_for_prefix_by_preference();
/// assert!(!seq.is_finite());
/// ```
///
/// Do note that just because there is a `" "` in the sequence, that
/// doesn't mean the sequence will always be made infinite after it is
/// optimized. Namely, if the sequence is considered exact (any match
/// corresponds to an overall match of the original regex), then any match
/// is an overall match, and so the false positive rate is always `0`.
///
/// To demonstrate this, we remove `samwise` from our sequence. This
/// results in no optimization happening and all literals remain exact.
/// Thus the entire sequence is exact, and it is kept as-is, even though
/// one is an ASCII space:
///
/// ```
/// use regex_syntax::hir::literal::{Seq, Literal};
///
/// let mut seq = Seq::new(&[
///     "samantha",
///     " ",
///     "sam",
///     "frodo",
/// ]);
/// seq.optimize_for_prefix_by_preference();
/// assert!(seq.is_finite());
/// ```
#[inline]
pub fn optimize_for_prefix_by_preference(&mut self) {
    self.optimize_by_preference(true);
}

/// Optimizes this seq while treating its literals as suffixes and
/// respecting the preference order of its literals.
///
/// Optimization should only be run once extraction is complete.
///
/// The [`Seq::optimize_for_prefix_by_preference`] does the same thing, but
/// for prefixes. See its documentation for more explanation.
#[inline] pub fn optimize_for_suffix_by_preference(&mut self) { self.optimize_by_preference(false); } fn optimize_by_preference(&mut self, prefix: bool) { let origlen = match self.len() { None => return, Some(len) => len, }; // Just give up now if our sequence contains an empty string. if self.min_literal_len().map_or(false, |len| len == 0) { // We squash the sequence so that nobody else gets any bright // ideas to try and use it. An empty string implies a match at // every position. A prefilter cannot help you here. self.make_infinite(); return; } // Make sure we start with the smallest sequence possible. We use a // special version of preference minimization that retains exactness. // This is legal because optimization is only expected to occur once // extraction is complete. if prefix { if let Some(ref mut lits) = self.literals { PreferenceTrie::minimize(lits, true); } } // Look for a common prefix (or suffix). If we found one of those and // it's long enough, then it's a good bet that it will be our fastest // possible prefilter since single-substring search is so fast. let fix = if prefix { self.longest_common_prefix() } else { self.longest_common_suffix() }; if let Some(fix) = fix { // As a special case, if we have a common prefix and the leading // byte of that prefix is one that we think probably occurs rarely, // then strip everything down to just that single byte. This should // promote the use of memchr. // // ... we only do this though if our sequence has more than one // literal. Otherwise, we'd rather just stick with a single literal // scan. That is, using memchr is probably better than looking // for 2 or more literals, but probably not as good as a straight // memmem search. // // ... and also only do this when the prefix is short and probably // not too discriminatory anyway. If it's longer, then it's // probably quite discriminatory and thus is likely to have a low // false positive rate. 
if prefix && origlen > 1 && fix.len() >= 1 && fix.len() <= 3 && rank(fix[0]) < 200 { self.keep_first_bytes(1); self.dedup(); return; } // We only strip down to the common prefix/suffix if we think // the existing set of literals isn't great, or if the common // prefix/suffix is expected to be particularly discriminatory. let isfast = self.is_exact() && self.len().map_or(false, |len| len <= 16); let usefix = fix.len() > 4 || (fix.len() > 1 && !isfast); if usefix { // If we keep exactly the number of bytes equal to the length // of the prefix (or suffix), then by the definition of a // prefix, every literal in the sequence will be equivalent. // Thus, 'dedup' will leave us with one literal. // // We do it this way to avoid an alloc, but also to make sure // the exactness of literals is kept (or not). if prefix { self.keep_first_bytes(fix.len()); } else { self.keep_last_bytes(fix.len()); } self.dedup(); assert_eq!(Some(1), self.len()); // We still fall through here. In particular, we want our // longest common prefix to be subject to the poison check. } } // If we have an exact sequence, we *probably* just want to keep it // as-is. But there are some cases where we don't. So we save a copy of // the exact sequence now, and then try to do some more optimizations // below. If those don't work out, we go back to this exact sequence. // // The specific motivation for this is that we sometimes wind up with // an exact sequence with a hefty number of literals. Say, 100. If we // stuck with that, it would be too big for Teddy and would result in // using Aho-Corasick. Which is fine... but the lazy DFA is plenty // suitable in such cases. The real issue is that we will wind up not // using a fast prefilter at all. So in cases like this, even though // we have an exact sequence, it would be better to try and shrink the // sequence (which we do below) and use it as a prefilter that can // produce false positive matches. 
// // But if the shrinking below results in a sequence that "sucks," then // we don't want to use that because we already have an exact sequence // in hand. let exact: Option = if self.is_exact() { Some(self.clone()) } else { None }; // Now we attempt to shorten the sequence. The idea here is that we // don't want to look for too many literals, but we want to shorten // our sequence enough to improve our odds of using better algorithms // downstream (such as Teddy). // // The pair of numbers in this list corresponds to the maximal prefix // (in bytes) to keep for all literals and the length of the sequence // at which to do it. // // So for example, the pair (3, 500) would mean, "if we have more than // 500 literals in our sequence, then truncate all of our literals // such that they are at most 3 bytes in length and the minimize the // sequence." const ATTEMPTS: [(usize, usize); 5] = [(5, 10), (4, 10), (3, 64), (2, 64), (1, 10)]; for (keep, limit) in ATTEMPTS { let len = match self.len() { None => break, Some(len) => len, }; if len <= limit { break; } if prefix { self.keep_first_bytes(keep); } else { self.keep_last_bytes(keep); } if prefix { if let Some(ref mut lits) = self.literals { PreferenceTrie::minimize(lits, true); } } } // Check for a poison literal. A poison literal is one that is short // and is believed to have a very high match count. These poisons // generally lead to a prefilter with a very high false positive rate, // and thus overall worse performance. // // We do this last because we could have gone from a non-poisonous // sequence to a poisonous one. Perhaps we should add some code to // prevent such transitions in the first place, but then again, we // likely only made the transition in the first place if the sequence // was itself huge. And huge sequences are themselves poisonous. So... 
if let Some(lits) = self.literals() { if lits.iter().any(|lit| lit.is_poisonous()) { self.make_infinite(); } } // OK, if we had an exact sequence before attempting more optimizations // above and our post-optimized sequence sucks for some reason or // another, then we go back to the exact sequence. if let Some(exact) = exact { // If optimizing resulted in dropping our literals, then certainly // backup and use the exact sequence that we had. if !self.is_finite() { *self = exact; return; } // If our optimized sequence contains a short literal, then it's // *probably* not so great. So throw it away and revert to the // exact sequence. if self.min_literal_len().map_or(true, |len| len <= 2) { *self = exact; return; } // Finally, if our optimized sequence is "big" (i.e., can't use // Teddy), then also don't use it and rely on the exact sequence. if self.len().map_or(true, |len| len > 64) { *self = exact; return; } } } } impl core::fmt::Debug for Seq { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "Seq")?; if let Some(lits) = self.literals() { f.debug_list().entries(lits.iter()).finish() } else { write!(f, "[∞]") } } } impl FromIterator for Seq { fn from_iter>(it: T) -> Seq { let mut seq = Seq::empty(); for literal in it { seq.push(literal); } seq } } /// A single literal extracted from an [`Hir`] expression. /// /// A literal is composed of two things: /// /// * A sequence of bytes. No guarantees with respect to UTF-8 are provided. /// In particular, even if the regex a literal is extracted from is UTF-8, the /// literal extracted may not be valid UTF-8. (For example, if an [`Extractor`] /// limit resulted in trimming a literal in a way that splits a codepoint.) /// * Whether the literal is "exact" or not. An "exact" literal means that it /// has not been trimmed, and may continue to be extended. If a literal is /// "exact" after visiting the entire `Hir` expression, then this implies that /// the literal leads to a match state. 
(Although it doesn't necessarily imply /// all occurrences of the literal correspond to a match of the regex, since /// literal extraction ignores look-around assertions.) #[derive(Clone, Eq, PartialEq, PartialOrd, Ord)] pub struct Literal { bytes: Vec, exact: bool, } impl Literal { /// Returns a new exact literal containing the bytes given. #[inline] pub fn exact>>(bytes: B) -> Literal { Literal { bytes: bytes.into(), exact: true } } /// Returns a new inexact literal containing the bytes given. #[inline] pub fn inexact>>(bytes: B) -> Literal { Literal { bytes: bytes.into(), exact: false } } /// Returns the bytes in this literal. #[inline] pub fn as_bytes(&self) -> &[u8] { &self.bytes } /// Yields ownership of the bytes inside this literal. /// /// Note that this throws away whether the literal is "exact" or not. #[inline] pub fn into_bytes(self) -> Vec { self.bytes } /// Returns the length of this literal in bytes. #[inline] pub fn len(&self) -> usize { self.as_bytes().len() } /// Returns true if and only if this literal has zero bytes. #[inline] pub fn is_empty(&self) -> bool { self.len() == 0 } /// Returns true if and only if this literal is exact. #[inline] pub fn is_exact(&self) -> bool { self.exact } /// Marks this literal as inexact. /// /// Inexact literals can never be extended. For example, /// [`Seq::cross_forward`] will not extend inexact literals. #[inline] pub fn make_inexact(&mut self) { self.exact = false; } /// Reverse the bytes in this literal. #[inline] pub fn reverse(&mut self) { self.bytes.reverse(); } /// Extend this literal with the literal given. /// /// If this literal is inexact, then this is a no-op. #[inline] pub fn extend(&mut self, lit: &Literal) { if !self.is_exact() { return; } self.bytes.extend_from_slice(&lit.bytes); } /// Trims this literal such that only the first `len` bytes remain. If /// this literal has fewer than `len` bytes, then it remains unchanged. /// Otherwise, the literal is marked as inexact. 
#[inline] pub fn keep_first_bytes(&mut self, len: usize) { if len >= self.len() { return; } self.make_inexact(); self.bytes.truncate(len); } /// Trims this literal such that only the last `len` bytes remain. If this /// literal has fewer than `len` bytes, then it remains unchanged. /// Otherwise, the literal is marked as inexact. #[inline] pub fn keep_last_bytes(&mut self, len: usize) { if len >= self.len() { return; } self.make_inexact(); self.bytes.drain(..self.len() - len); } /// Returns true if it is believe that this literal is likely to match very /// frequently, and is thus not a good candidate for a prefilter. fn is_poisonous(&self) -> bool { self.is_empty() || (self.len() == 1 && rank(self.as_bytes()[0]) >= 250) } } impl From for Literal { fn from(byte: u8) -> Literal { Literal::exact(vec![byte]) } } impl From for Literal { fn from(ch: char) -> Literal { use alloc::string::ToString; Literal::exact(ch.encode_utf8(&mut [0; 4]).to_string()) } } impl AsRef<[u8]> for Literal { fn as_ref(&self) -> &[u8] { self.as_bytes() } } impl core::fmt::Debug for Literal { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let tag = if self.exact { "E" } else { "I" }; f.debug_tuple(tag) .field(&crate::debug::Bytes(self.as_bytes())) .finish() } } /// A "preference" trie that rejects literals that will never match when /// executing a leftmost first or "preference" search. /// /// For example, if 'sam' is inserted, then trying to insert 'samwise' will be /// rejected because 'samwise' can never match since 'sam' will always take /// priority. However, if 'samwise' is inserted first, then inserting 'sam' /// after it is accepted. In this case, either 'samwise' or 'sam' can match in /// a "preference" search. /// /// Note that we only use this trie as a "set." That is, given a sequence of /// literals, we insert each one in order. An `insert` will reject a literal /// if a prefix of that literal already exists in the trie. 
Thus, to rebuild /// the "minimal" sequence, we simply only keep literals that were successfully /// inserted. (Since we don't need traversal, one wonders whether we can make /// some simplifications here, but I haven't given it a ton of thought and I've /// never seen this show up on a profile. Because of the heuristic limits /// imposed on literal extractions, the size of the inputs here is usually /// very small.) #[derive(Debug)] struct PreferenceTrie { /// The states in this trie. The index of a state in this vector is its ID. states: Vec, /// This vec indicates which states are match states. It always has /// the same length as `states` and is indexed by the same state ID. /// A state with identifier `sid` is a match state if and only if /// `matches[sid].is_some()`. The option contains the index of the literal /// corresponding to the match. The index is offset by 1 so that it fits in /// a NonZeroUsize. matches: Vec>, /// The index to allocate to the next literal added to this trie. Starts at /// 1 and increments by 1 for every literal successfully added to the trie. next_literal_index: usize, } /// A single state in a trie. Uses a sparse representation for its transitions. #[derive(Debug, Default)] struct State { /// Sparse representation of the transitions out of this state. Transitions /// are sorted by byte. There is at most one such transition for any /// particular byte. trans: Vec<(u8, usize)>, } impl PreferenceTrie { /// Minimizes the given sequence of literals while preserving preference /// order semantics. /// /// When `keep_exact` is true, the exactness of every literal retained is /// kept. This is useful when dealing with a fully extracted `Seq` that /// only contains exact literals. In that case, we can keep all retained /// literals as exact because we know we'll never need to match anything /// after them and because any removed literals are guaranteed to never /// match. 
fn minimize(literals: &mut Vec, keep_exact: bool) { let mut trie = PreferenceTrie { states: vec![], matches: vec![], next_literal_index: 1, }; let mut make_inexact = vec![]; literals.retain_mut(|lit| match trie.insert(lit.as_bytes()) { Ok(_) => true, Err(i) => { if !keep_exact { make_inexact.push(i.checked_sub(1).unwrap()); } false } }); for i in make_inexact { literals[i].make_inexact(); } } /// Returns `Ok` if the given byte string is accepted into this trie and /// `Err` otherwise. The index for the success case corresponds to the /// index of the literal added. The index for the error case corresponds to /// the index of the literal already in the trie that prevented the given /// byte string from being added. (Which implies it is a prefix of the one /// given.) /// /// In short, the byte string given is accepted into the trie if and only /// if it is possible for it to match when executing a preference order /// search. fn insert(&mut self, bytes: &[u8]) -> Result { let mut prev = self.root(); if let Some(idx) = self.matches[prev] { return Err(idx.get()); } for &b in bytes.iter() { match self.states[prev].trans.binary_search_by_key(&b, |t| t.0) { Ok(i) => { prev = self.states[prev].trans[i].1; if let Some(idx) = self.matches[prev] { return Err(idx.get()); } } Err(i) => { let next = self.create_state(); self.states[prev].trans.insert(i, (b, next)); prev = next; } } } let idx = self.next_literal_index; self.next_literal_index += 1; self.matches[prev] = NonZeroUsize::new(idx); Ok(idx) } /// Returns the root state ID, and if it doesn't exist, creates it. fn root(&mut self) -> usize { if !self.states.is_empty() { 0 } else { self.create_state() } } /// Creates a new empty state and returns its ID. fn create_state(&mut self) -> usize { let id = self.states.len(); self.states.push(State::default()); self.matches.push(None); id } } /// Returns the "rank" of the given byte. /// /// The minimum rank value is `0` and the maximum rank value is `255`. 
/// /// The rank of a byte is derived from a heuristic background distribution of /// relative frequencies of bytes. The heuristic says that lower the rank of a /// byte, the less likely that byte is to appear in any arbitrary haystack. pub fn rank(byte: u8) -> u8 { crate::rank::BYTE_FREQUENCIES[usize::from(byte)] } #[cfg(test)] mod tests { use super::*; fn parse(pattern: &str) -> Hir { crate::ParserBuilder::new().utf8(false).build().parse(pattern).unwrap() } fn prefixes(pattern: &str) -> Seq { Extractor::new().kind(ExtractKind::Prefix).extract(&parse(pattern)) } fn suffixes(pattern: &str) -> Seq { Extractor::new().kind(ExtractKind::Suffix).extract(&parse(pattern)) } fn e(pattern: &str) -> (Seq, Seq) { (prefixes(pattern), suffixes(pattern)) } #[allow(non_snake_case)] fn E(x: &str) -> Literal { Literal::exact(x.as_bytes()) } #[allow(non_snake_case)] fn I(x: &str) -> Literal { Literal::inexact(x.as_bytes()) } fn seq>(it: I) -> Seq { Seq::from_iter(it) } fn infinite() -> (Seq, Seq) { (Seq::infinite(), Seq::infinite()) } fn inexact(it1: I1, it2: I2) -> (Seq, Seq) where I1: IntoIterator, I2: IntoIterator, { (Seq::from_iter(it1), Seq::from_iter(it2)) } fn exact, I: IntoIterator>(it: I) -> (Seq, Seq) { let s1 = Seq::new(it); let s2 = s1.clone(); (s1, s2) } fn opt, I: IntoIterator>(it: I) -> (Seq, Seq) { let (mut p, mut s) = exact(it); p.optimize_for_prefix_by_preference(); s.optimize_for_suffix_by_preference(); (p, s) } #[test] fn literal() { assert_eq!(exact(["a"]), e("a")); assert_eq!(exact(["aaaaa"]), e("aaaaa")); assert_eq!(exact(["A", "a"]), e("(?i-u)a")); assert_eq!(exact(["AB", "Ab", "aB", "ab"]), e("(?i-u)ab")); assert_eq!(exact(["abC", "abc"]), e("ab(?i-u)c")); assert_eq!(exact([b"\xFF"]), e(r"(?-u:\xFF)")); #[cfg(feature = "unicode-case")] { assert_eq!(exact(["☃"]), e("☃")); assert_eq!(exact(["☃"]), e("(?i)☃")); assert_eq!(exact(["☃☃☃☃☃"]), e("☃☃☃☃☃")); assert_eq!(exact(["Δ"]), e("Δ")); assert_eq!(exact(["δ"]), e("δ")); assert_eq!(exact(["Δ", "δ"]), e("(?i)Δ")); 
            assert_eq!(exact(["Δ", "δ"]), e("(?i)δ"));

            assert_eq!(exact(["S", "s", "ſ"]), e("(?i)S"));
            assert_eq!(exact(["S", "s", "ſ"]), e("(?i)s"));
            assert_eq!(exact(["S", "s", "ſ"]), e("(?i)ſ"));
        }

        let letters = "ͱͳͷΐάέήίΰαβγδεζηθικλμνξοπρςστυφχψωϊϋ";
        assert_eq!(exact([letters]), e(letters));
    }

    #[test]
    fn class() {
        assert_eq!(exact(["a", "b", "c"]), e("[abc]"));
        assert_eq!(exact(["a1b", "a2b", "a3b"]), e("a[123]b"));
        assert_eq!(exact(["δ", "ε"]), e("[εδ]"));

        #[cfg(feature = "unicode-case")]
        {
            assert_eq!(exact(["Δ", "Ε", "δ", "ε", "ϵ"]), e(r"(?i)[εδ]"));
        }
    }

    // Look-around assertions are zero-width, so they contribute nothing to
    // extracted literals.
    #[test]
    fn look() {
        assert_eq!(exact(["ab"]), e(r"a\Ab"));
        assert_eq!(exact(["ab"]), e(r"a\zb"));
        assert_eq!(exact(["ab"]), e(r"a(?m:^)b"));
        assert_eq!(exact(["ab"]), e(r"a(?m:$)b"));
        assert_eq!(exact(["ab"]), e(r"a\bb"));
        assert_eq!(exact(["ab"]), e(r"a\Bb"));
        assert_eq!(exact(["ab"]), e(r"a(?-u:\b)b"));
        assert_eq!(exact(["ab"]), e(r"a(?-u:\B)b"));

        assert_eq!(exact(["ab"]), e(r"^ab"));
        assert_eq!(exact(["ab"]), e(r"$ab"));
        assert_eq!(exact(["ab"]), e(r"(?m:^)ab"));
        assert_eq!(exact(["ab"]), e(r"(?m:$)ab"));
        assert_eq!(exact(["ab"]), e(r"\bab"));
        assert_eq!(exact(["ab"]), e(r"\Bab"));
        assert_eq!(exact(["ab"]), e(r"(?-u:\b)ab"));
        assert_eq!(exact(["ab"]), e(r"(?-u:\B)ab"));

        assert_eq!(exact(["ab"]), e(r"ab^"));
        assert_eq!(exact(["ab"]), e(r"ab$"));
        assert_eq!(exact(["ab"]), e(r"ab(?m:^)"));
        assert_eq!(exact(["ab"]), e(r"ab(?m:$)"));
        assert_eq!(exact(["ab"]), e(r"ab\b"));
        assert_eq!(exact(["ab"]), e(r"ab\B"));
        assert_eq!(exact(["ab"]), e(r"ab(?-u:\b)"));
        assert_eq!(exact(["ab"]), e(r"ab(?-u:\B)"));

        let expected = (seq([I("aZ"), E("ab")]), seq([I("Zb"), E("ab")]));
        assert_eq!(expected, e(r"^aZ*b"));
    }

    #[test]
    fn repetition() {
        assert_eq!(exact(["a", ""]), e(r"a?"));
        assert_eq!(exact(["", "a"]), e(r"a??"));
        assert_eq!(inexact([I("a"), E("")], [I("a"), E("")]), e(r"a*"));
        assert_eq!(inexact([E(""), I("a")], [E(""), I("a")]), e(r"a*?"));
        assert_eq!(inexact([I("a")], [I("a")]), e(r"a+"));
        assert_eq!(inexact([I("a")], [I("a")]), e(r"(a+)+"));

        assert_eq!(exact(["ab"]), e(r"aZ{0}b"));
        assert_eq!(exact(["aZb", "ab"]), e(r"aZ?b"));
        assert_eq!(exact(["ab", "aZb"]), e(r"aZ??b"));
        assert_eq!(
            inexact([I("aZ"), E("ab")], [I("Zb"), E("ab")]),
            e(r"aZ*b")
        );
        assert_eq!(
            inexact([E("ab"), I("aZ")], [E("ab"), I("Zb")]),
            e(r"aZ*?b")
        );
        assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+b"));
        assert_eq!(inexact([I("aZ")], [I("Zb")]), e(r"aZ+?b"));

        assert_eq!(exact(["aZZb"]), e(r"aZ{2}b"));
        assert_eq!(inexact([I("aZZ")], [I("ZZb")]), e(r"aZ{2,3}b"));

        assert_eq!(exact(["abc", ""]), e(r"(abc)?"));
        assert_eq!(exact(["", "abc"]), e(r"(abc)??"));

        assert_eq!(inexact([I("a"), E("b")], [I("ab"), E("b")]), e(r"a*b"));
        assert_eq!(inexact([E("b"), I("a")], [E("b"), I("ab")]), e(r"a*?b"));
        assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+"));
        assert_eq!(inexact([I("a"), I("b")], [I("b")]), e(r"a*b+"));

        // FIXME: The suffixes for this don't look quite right to me. I think
        // the right suffixes would be: [I(ac), I(bc), E(c)]. The main issue I
        // think is that suffixes are computed by iterating over concatenations
        // in reverse, and then [bc, ac, c] ordering is indeed correct from
        // that perspective. We also test a few more equivalent regexes, and
        // we get the same result, so it is consistent at least I suppose.
        //
        // The reason why this isn't an issue is that it only messes up
        // preference order, and currently, suffixes are never used in a
        // context where preference order matters. For prefixes it matters
        // because we sometimes want to use prefilters without confirmation
        // when all of the literals are exact (and there's no look-around). But
        // we never do that for suffixes. Any time we use suffixes, we always
        // include a confirmation step. If that ever changes, then it's likely
        // this bug will need to be fixed, but last time I looked, it appears
        // hard to do so.
        assert_eq!(
            inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
            e(r"a*b*c")
        );
        assert_eq!(
            inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
            e(r"(a+)?(b+)?c")
        );
        assert_eq!(
            inexact([I("a"), I("b"), E("c")], [I("bc"), I("ac"), E("c")]),
            e(r"(a+|)(b+|)c")
        );
        // A few more similarish but not identical regexes. These may have a
        // similar problem as above.
        assert_eq!(
            inexact(
                [I("a"), I("b"), I("c"), E("")],
                [I("c"), I("b"), I("a"), E("")]
            ),
            e(r"a*b*c*")
        );
        assert_eq!(inexact([I("a"), I("b"), I("c")], [I("c")]), e(r"a*b*c+"));
        assert_eq!(inexact([I("a"), I("b")], [I("bc")]), e(r"a*b+c"));
        assert_eq!(inexact([I("a"), I("b")], [I("c"), I("b")]), e(r"a*b+c*"));
        assert_eq!(inexact([I("ab"), E("a")], [I("b"), E("a")]), e(r"ab*"));
        assert_eq!(
            inexact([I("ab"), E("ac")], [I("bc"), E("ac")]),
            e(r"ab*c")
        );
        assert_eq!(inexact([I("ab")], [I("b")]), e(r"ab+"));
        assert_eq!(inexact([I("ab")], [I("bc")]), e(r"ab+c"));

        assert_eq!(
            inexact([I("z"), E("azb")], [I("zazb"), E("azb")]),
            e(r"z*azb")
        );

        let expected =
            exact(["aaa", "aab", "aba", "abb", "baa", "bab", "bba", "bbb"]);
        assert_eq!(expected, e(r"[ab]{3}"));
        let expected = inexact(
            [
                I("aaa"),
                I("aab"),
                I("aba"),
                I("abb"),
                I("baa"),
                I("bab"),
                I("bba"),
                I("bbb"),
            ],
            [
                I("aaa"),
                I("aab"),
                I("aba"),
                I("abb"),
                I("baa"),
                I("bab"),
                I("bba"),
                I("bbb"),
            ],
        );
        assert_eq!(expected, e(r"[ab]{3,4}"));
    }

    #[test]
    fn concat() {
        let empty: [&str; 0] = [];

        assert_eq!(exact(["abcxyz"]), e(r"abc()xyz"));
        assert_eq!(exact(["abcxyz"]), e(r"(abc)(xyz)"));
        assert_eq!(exact(["abcmnoxyz"]), e(r"abc()mno()xyz"));
        assert_eq!(exact(empty), e(r"abc[a&&b]xyz"));
        assert_eq!(exact(["abcxyz"]), e(r"abc[a&&b]*xyz"));
    }

    #[test]
    fn alternation() {
        assert_eq!(exact(["abc", "mno", "xyz"]), e(r"abc|mno|xyz"));
        assert_eq!(
            inexact(
                [E("abc"), I("mZ"), E("mo"), E("xyz")],
                [E("abc"), I("Zo"), E("mo"), E("xyz")]
            ),
            e(r"abc|mZ*o|xyz")
        );
        assert_eq!(exact(["abc", "xyz"]), e(r"abc|M[a&&b]N|xyz"));
        assert_eq!(exact(["abc", "MN", "xyz"]), e(r"abc|M[a&&b]*N|xyz"));

        assert_eq!(exact(["aaa", "aaaaa"]), e(r"(?:|aa)aaa"));
        assert_eq!(
            inexact(
                [I("aaa"), E(""), I("aaaaa"), E("aa")],
                [I("aaa"), E(""), E("aa")]
            ),
            e(r"(?:|aa)(?:aaa)*")
        );
        assert_eq!(
            inexact(
                [E(""), I("aaa"), E("aa"), I("aaaaa")],
                [E(""), I("aaa"), E("aa")]
            ),
            e(r"(?:|aa)(?:aaa)*?")
        );

        assert_eq!(
            inexact([E("a"), I("b"), E("")], [E("a"), I("b"), E("")]),
            e(r"a|b*")
        );
        assert_eq!(inexact([E("a"), I("b")], [E("a"), I("b")]), e(r"a|b+"));
        assert_eq!(
            inexact([I("a"), E("b"), E("c")], [I("ab"), E("b"), E("c")]),
            e(r"a*b|c")
        );

        assert_eq!(
            inexact(
                [E("a"), E("b"), I("c"), E("")],
                [E("a"), E("b"), I("c"), E("")]
            ),
            e(r"a|(?:b|c*)")
        );

        assert_eq!(
            inexact(
                [I("a"), I("b"), E("c"), I("a"), I("ab"), E("c")],
                [I("ac"), I("bc"), E("c"), I("ac"), I("abc"), E("c")],
            ),
            e(r"(a|b)*c|(a|ab)*c")
        );

        assert_eq!(
            exact(["abef", "abgh", "cdef", "cdgh"]),
            e(r"(ab|cd)(ef|gh)")
        );
        assert_eq!(
            exact([
                "abefij", "abefkl", "abghij", "abghkl", "cdefij", "cdefkl",
                "cdghij", "cdghkl",
            ]),
            e(r"(ab|cd)(ef|gh)(ij|kl)")
        );

        assert_eq!(inexact([E("abab")], [E("abab")]), e(r"(ab){2}"));

        assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,3}"));

        assert_eq!(inexact([I("abab")], [I("abab")]), e(r"(ab){2,}"));
    }

    #[test]
    fn impossible() {
        let empty: [&str; 0] = [];

        assert_eq!(exact(empty), e(r"[a&&b]"));
        assert_eq!(exact(empty), e(r"a[a&&b]"));
        assert_eq!(exact(empty), e(r"[a&&b]b"));
        assert_eq!(exact(empty), e(r"a[a&&b]b"));
        assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]|b"));
        assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]|b"));
        assert_eq!(exact(["a", "b"]), e(r"a|[a&&b]d|b"));
        assert_eq!(exact(["a", "b"]), e(r"a|c[a&&b]d|b"));
        assert_eq!(exact([""]), e(r"[a&&b]*"));
        assert_eq!(exact(["MN"]), e(r"M[a&&b]*N"));
    }

    // This tests patterns that contain something that defeats literal
    // detection, usually because it would blow some limit on the total number
    // of literals that can be returned.
    //
    // The main idea is that when literal extraction sees something that
    // it knows will blow a limit, it replaces it with a marker that says
    // "any literal will match here." While not necessarily true, the
    // over-estimation is just fine for the purposes of literal extraction,
    // because the imprecision doesn't matter: too big is too big.
    //
    // This is one of the trickier parts of literal extraction, since we need
    // to make sure all of our literal extraction operations correctly compose
    // with the markers.
    #[test]
    fn anything() {
        assert_eq!(infinite(), e(r"."));
        assert_eq!(infinite(), e(r"(?s)."));
        assert_eq!(infinite(), e(r"[A-Za-z]"));
        assert_eq!(infinite(), e(r"[A-Z]"));
        assert_eq!(exact([""]), e(r"[A-Z]{0}"));
        assert_eq!(infinite(), e(r"[A-Z]?"));
        assert_eq!(infinite(), e(r"[A-Z]*"));
        assert_eq!(infinite(), e(r"[A-Z]+"));
        assert_eq!((seq([I("1")]), Seq::infinite()), e(r"1[A-Z]"));
        assert_eq!((seq([I("1")]), seq([I("2")])), e(r"1[A-Z]2"));
        assert_eq!((Seq::infinite(), seq([I("123")])), e(r"[A-Z]+123"));
        assert_eq!(infinite(), e(r"[A-Z]+123[A-Z]+"));
        assert_eq!(infinite(), e(r"1|[A-Z]|3"));
        assert_eq!(
            (seq([E("1"), I("2"), E("3")]), Seq::infinite()),
            e(r"1|2[A-Z]|3"),
        );
        assert_eq!(
            (Seq::infinite(), seq([E("1"), I("2"), E("3")])),
            e(r"1|[A-Z]2|3"),
        );
        assert_eq!(
            (seq([E("1"), I("2"), E("4")]), seq([E("1"), I("3"), E("4")])),
            e(r"1|2[A-Z]3|4"),
        );
        assert_eq!((Seq::infinite(), seq([I("2")])), e(r"(?:|1)[A-Z]2"));
        assert_eq!(inexact([I("a")], [I("z")]), e(r"a.z"));
    }

    // Like the 'anything' test, but it uses smaller limits in order to test
    // the logic for effectively aborting literal extraction when the seqs get
    // too big.
    #[test]
    fn anything_small_limits() {
        fn prefixes(pattern: &str) -> Seq {
            Extractor::new()
                .kind(ExtractKind::Prefix)
                .limit_total(10)
                .extract(&parse(pattern))
        }

        fn suffixes(pattern: &str) -> Seq {
            Extractor::new()
                .kind(ExtractKind::Suffix)
                .limit_total(10)
                .extract(&parse(pattern))
        }

        fn e(pattern: &str) -> (Seq, Seq) {
            (prefixes(pattern), suffixes(pattern))
        }

        assert_eq!(
            (
                seq([
                    I("aaa"),
                    I("aab"),
                    I("aba"),
                    I("abb"),
                    I("baa"),
                    I("bab"),
                    I("bba"),
                    I("bbb")
                ]),
                seq([
                    I("aaa"),
                    I("aab"),
                    I("aba"),
                    I("abb"),
                    I("baa"),
                    I("bab"),
                    I("bba"),
                    I("bbb")
                ])
            ),
            e(r"[ab]{3}{3}")
        );

        assert_eq!(infinite(), e(r"ab|cd|ef|gh|ij|kl|mn|op|qr|st|uv|wx|yz"));
    }

    // Patterns that can only match the empty string extract a single exact
    // empty literal.
    #[test]
    fn empty() {
        assert_eq!(exact([""]), e(r""));
        assert_eq!(exact([""]), e(r"^"));
        assert_eq!(exact([""]), e(r"$"));
        assert_eq!(exact([""]), e(r"(?m:^)"));
        assert_eq!(exact([""]), e(r"(?m:$)"));
        assert_eq!(exact([""]), e(r"\b"));
        assert_eq!(exact([""]), e(r"\B"));
        assert_eq!(exact([""]), e(r"(?-u:\b)"));
        assert_eq!(exact([""]), e(r"(?-u:\B)"));
    }

    #[test]
    fn odds_and_ends() {
        assert_eq!((Seq::infinite(), seq([I("a")])), e(r".a"));
        assert_eq!((seq([I("a")]), Seq::infinite()), e(r"a."));
        assert_eq!(infinite(), e(r"a|."));
        assert_eq!(infinite(), e(r".|a"));

        let pat = r"M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]";
        let expected = inexact(
            ["Mo'am", "Moam", "Mu'am", "Muam"].map(I),
            [
                "ddafi", "ddafy", "dhafi", "dhafy", "dzafi", "dzafy", "dafi",
                "dafy", "tdafi", "tdafy", "thafi", "thafy", "tzafi", "tzafy",
                "tafi", "tafy", "zdafi", "zdafy", "zhafi", "zhafy", "zzafi",
                "zzafy", "zafi", "zafy",
            ]
            .map(I),
        );
        assert_eq!(expected, e(pat));

        assert_eq!(
            (seq(["fn is_", "fn as_"].map(I)), Seq::infinite()),
            e(r"fn is_([A-Z]+)|fn as_([A-Z]+)"),
        );
        assert_eq!(
            inexact([I("foo")], [I("quux")]),
            e(r"foo[A-Z]+bar[A-Z]+quux")
        );
        assert_eq!(infinite(), e(r"[A-Z]+bar[A-Z]+"));
        assert_eq!(
            exact(["Sherlock Holmes"]),
            e(r"(?m)^Sherlock Holmes|Sherlock Holmes$")
        );

        assert_eq!(exact(["sa", "sb"]), e(r"\bs(?:[ab])"));
    }

    // This tests a specific regex along with some heuristic steps to reduce
    // the sequences extracted. This is meant to roughly correspond to the
    // types of heuristics used to shrink literal sets in practice. (Shrinking
    // is done because you want to balance "spend too much work looking for
    // too many literals" and "spend too much work processing false positive
    // matches from short literals.")
    #[test]
    #[cfg(feature = "unicode-case")]
    fn holmes() {
        let expected = inexact(
            ["HOL", "HOl", "HoL", "Hol", "hOL", "hOl", "hoL", "hol"].map(I),
            [
                "MES", "MEs", "Eſ", "MeS", "Mes", "eſ", "mES", "mEs",
                "meS", "mes",
            ]
            .map(I),
        );
        let (mut prefixes, mut suffixes) = e(r"(?i)Holmes");
        prefixes.keep_first_bytes(3);
        suffixes.keep_last_bytes(3);
        prefixes.minimize_by_preference();
        suffixes.minimize_by_preference();
        assert_eq!(expected, (prefixes, suffixes));
    }

    // This tests that we get some kind of literals extracted for a beefier
    // alternation with case insensitive mode enabled. At one point during
    // development, this returned nothing, and motivated some special case
    // code in Extractor::union to try and trim down the literal sequences
    // if the union would blow the limits set.
    #[test]
    #[cfg(feature = "unicode-case")]
    fn holmes_alt() {
        let mut pre =
            prefixes(r"(?i)Sherlock|Holmes|Watson|Irene|Adler|John|Baker");
        assert!(pre.len().unwrap() > 0);
        pre.optimize_for_prefix_by_preference();
        assert!(pre.len().unwrap() > 0);
    }

    // See: https://github.com/rust-lang/regex/security/advisories/GHSA-m5pq-gvj9-9vr8
    // See: CVE-2022-24713
    //
    // We test this here to ensure literal extraction completes in reasonable
    // time and isn't materially impacted by these sorts of pathological
    // repeats.
#[test] fn crazy_repeats() { assert_eq!(inexact([E("")], [E("")]), e(r"(?:){4294967295}")); assert_eq!( inexact([E("")], [E("")]), e(r"(?:){64}{64}{64}{64}{64}{64}") ); assert_eq!(inexact([E("")], [E("")]), e(r"x{0}{4294967295}")); assert_eq!(inexact([E("")], [E("")]), e(r"(?:|){4294967295}")); assert_eq!( inexact([E("")], [E("")]), e(r"(?:){8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") ); let repa = "a".repeat(100); assert_eq!( inexact([I(&repa)], [I(&repa)]), e(r"a{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}{8}") ); } #[test] fn huge() { let pat = r#"(?-u) 2(?: [45]\d{3}| 7(?: 1[0-267]| 2[0-289]| 3[0-29]| 4[01]| 5[1-3]| 6[013]| 7[0178]| 91 )| 8(?: 0[125]| [139][1-6]| 2[0157-9]| 41| 6[1-35]| 7[1-5]| 8[1-8]| 90 )| 9(?: 0[0-2]| 1[0-4]| 2[568]| 3[3-6]| 5[5-7]| 6[0167]| 7[15]| 8[0146-9] ) )\d{4}| 3(?: 12?[5-7]\d{2}| 0(?: 2(?: [025-79]\d| [348]\d{1,2} )| 3(?: [2-4]\d| [56]\d? ) )| 2(?: 1\d{2}| 2(?: [12]\d| [35]\d{1,2}| 4\d? ) )| 3(?: 1\d{2}| 2(?: [2356]\d| 4\d{1,2} ) )| 4(?: 1\d{2}| 2(?: 2\d{1,2}| [47]| 5\d{2} ) )| 5(?: 1\d{2}| 29 )| [67]1\d{2}| 8(?: 1\d{2}| 2(?: 2\d{2}| 3| 4\d ) ) )\d{3}| 4(?: 0(?: 2(?: [09]\d| 7 )| 33\d{2} )| 1\d{3}| 2(?: 1\d{2}| 2(?: [25]\d?| [348]\d| [67]\d{1,2} ) )| 3(?: 1\d{2}(?: \d{2} )?| 2(?: [045]\d| [236-9]\d{1,2} )| 32\d{2} )| 4(?: [18]\d{2}| 2(?: [2-46]\d{2}| 3 )| 5[25]\d{2} )| 5(?: 1\d{2}| 2(?: 3\d| 5 ) )| 6(?: [18]\d{2}| 2(?: 3(?: \d{2} )?| [46]\d{1,2}| 5\d{2}| 7\d )| 5(?: 3\d?| 4\d| [57]\d{1,2}| 6\d{2}| 8 ) )| 71\d{2}| 8(?: [18]\d{2}| 23\d{2}| 54\d{2} )| 9(?: [18]\d{2}| 2[2-5]\d{2}| 53\d{1,2} ) )\d{3}| 5(?: 02[03489]\d{2}| 1\d{2}| 2(?: 1\d{2}| 2(?: 2(?: \d{2} )?| [457]\d{2} ) )| 3(?: 1\d{2}| 2(?: [37](?: \d{2} )?| [569]\d{2} ) )| 4(?: 1\d{2}| 2[46]\d{2} )| 5(?: 1\d{2}| 26\d{1,2} )| 6(?: [18]\d{2}| 2| 53\d{2} )| 7(?: 1| 24 )\d{2}| 8(?: 1| 26 )\d{2}| 91\d{2} )\d{3}| 6(?: 0(?: 1\d{2}| 2(?: 3\d{2}| 4\d{1,2} ) )| 2(?: 2[2-5]\d{2}| 5(?: [3-5]\d{2}| 7 )| 8\d{2} )| 3(?: 1| 2[3478] )\d{2}| 4(?: 1| 2[34] )\d{2}| 5(?: 1| 2[47] )\d{2}| 6(?: [18]\d{2}| 6(?: 
2(?: 2\d| [34]\d{2} )| 5(?: [24]\d{2}| 3\d| 5\d{1,2} ) ) )| 72[2-5]\d{2}| 8(?: 1\d{2}| 2[2-5]\d{2} )| 9(?: 1\d{2}| 2[2-6]\d{2} ) )\d{3}| 7(?: (?: 02| [3-589]1| 6[12]| 72[24] )\d{2}| 21\d{3}| 32 )\d{3}| 8(?: (?: 4[12]| [5-7]2| 1\d? )| (?: 0| 3[12]| [5-7]1| 217 )\d )\d{4}| 9(?: [35]1| (?: [024]2| 81 )\d| (?: 1| [24]1 )\d{2} )\d{3} "#; // TODO: This is a good candidate of a seq of literals that could be // shrunk quite a bit and still be very productive with respect to // literal optimizations. let (prefixes, suffixes) = e(pat); assert!(!suffixes.is_finite()); assert_eq!(Some(243), prefixes.len()); } #[test] fn optimize() { // This gets a common prefix that isn't too short. let (p, s) = opt(["foobarfoobar", "foobar", "foobarzfoobar", "foobarfoobar"]); assert_eq!(seq([I("foobar")]), p); assert_eq!(seq([I("foobar")]), s); // This also finds a common prefix, but since it's only one byte, it // prefers the multiple literals. let (p, s) = opt(["abba", "akka", "abccba"]); assert_eq!(exact(["abba", "akka", "abccba"]), (p, s)); let (p, s) = opt(["sam", "samwise"]); assert_eq!((seq([E("sam")]), seq([E("sam"), E("samwise")])), (p, s)); // The empty string is poisonous, so our seq becomes infinite, even // though all literals are exact. let (p, s) = opt(["foobarfoo", "foo", "", "foozfoo", "foofoo"]); assert!(!p.is_finite()); assert!(!s.is_finite()); // A space is also poisonous, so our seq becomes infinite. But this // only gets triggered when we don't have a completely exact sequence. // When the sequence is exact, spaces are okay, since we presume that // any prefilter will match a space more quickly than the regex engine. // (When the sequence is exact, there's a chance of the prefilter being // used without needing the regex engine at all.) 
let mut p = seq([E("foobarfoo"), I("foo"), E(" "), E("foofoo")]); p.optimize_for_prefix_by_preference(); assert!(!p.is_finite()); } } regex-syntax-0.8.2/src/hir/mod.rs000064400000000000000000004361441046102023000150030ustar 00000000000000/*! Defines a high-level intermediate (HIR) representation for regular expressions. The HIR is represented by the [`Hir`] type, and it principally constructed via [translation](translate) from an [`Ast`](crate::ast::Ast). Alternatively, users may use the smart constructors defined on `Hir` to build their own by hand. The smart constructors simultaneously simplify and "optimize" the HIR, and are also the same routines used by translation. Most regex engines only have an HIR like this, and usually construct it directly from the concrete syntax. This crate however first parses the concrete syntax into an `Ast`, and only then creates the HIR from the `Ast`, as mentioned above. It's done this way to facilitate better error reporting, and to have a structured representation of a regex that faithfully represents its concrete syntax. Namely, while an `Hir` value can be converted back to an equivalent regex pattern string, it is unlikely to look like the original due to its simplified structure. */ use core::{char, cmp}; use alloc::{ boxed::Box, format, string::{String, ToString}, vec, vec::Vec, }; use crate::{ ast::Span, hir::interval::{Interval, IntervalSet, IntervalSetIter}, unicode, }; pub use crate::{ hir::visitor::{visit, Visitor}, unicode::CaseFoldError, }; mod interval; pub mod literal; pub mod print; pub mod translate; mod visitor; /// An error that can occur while translating an `Ast` to a `Hir`. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Error { /// The kind of error. kind: ErrorKind, /// The original pattern that the translator's Ast was parsed from. Every /// span in an error is a valid range into this string. pattern: String, /// The span of this error, derived from the Ast given to the translator. 
span: Span, } impl Error { /// Return the type of this error. pub fn kind(&self) -> &ErrorKind { &self.kind } /// The original pattern string in which this error occurred. /// /// Every span reported by this error is reported in terms of this string. pub fn pattern(&self) -> &str { &self.pattern } /// Return the span at which this error occurred. pub fn span(&self) -> &Span { &self.span } } /// The type of an error that occurred while building an `Hir`. /// /// This error type is marked as `non_exhaustive`. This means that adding a /// new variant is not considered a breaking change. #[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum ErrorKind { /// This error occurs when a Unicode feature is used when Unicode /// support is disabled. For example `(?-u:\pL)` would trigger this error. UnicodeNotAllowed, /// This error occurs when translating a pattern that could match a byte /// sequence that isn't UTF-8 and `utf8` was enabled. InvalidUtf8, /// This error occurs when one uses a non-ASCII byte for a line terminator, /// but where Unicode mode is enabled and UTF-8 mode is disabled. InvalidLineTerminator, /// This occurs when an unrecognized Unicode property name could not /// be found. UnicodePropertyNotFound, /// This occurs when an unrecognized Unicode property value could not /// be found. UnicodePropertyValueNotFound, /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or /// `\d`) could not be found. This can occur when the `unicode-perl` /// crate feature is not enabled. UnicodePerlClassNotFound, /// This occurs when the Unicode simple case mapping tables are not /// available, and the regular expression required Unicode aware case /// insensitivity. 
UnicodeCaseUnavailable, } #[cfg(feature = "std")] impl std::error::Error for Error {} impl core::fmt::Display for Error { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { crate::error::Formatter::from(self).fmt(f) } } impl core::fmt::Display for ErrorKind { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::ErrorKind::*; let msg = match *self { UnicodeNotAllowed => "Unicode not allowed here", InvalidUtf8 => "pattern can match invalid UTF-8", InvalidLineTerminator => "invalid line terminator, must be ASCII", UnicodePropertyNotFound => "Unicode property not found", UnicodePropertyValueNotFound => "Unicode property value not found", UnicodePerlClassNotFound => { "Unicode-aware Perl class not found \ (make sure the unicode-perl feature is enabled)" } UnicodeCaseUnavailable => { "Unicode-aware case insensitivity matching is not available \ (make sure the unicode-case feature is enabled)" } }; f.write_str(msg) } } /// A high-level intermediate representation (HIR) for a regular expression. /// /// An HIR value is a combination of a [`HirKind`] and a set of [`Properties`]. /// An `HirKind` indicates what kind of regular expression it is (a literal, /// a repetition, a look-around assertion, etc.), where as a `Properties` /// describes various facts about the regular expression. For example, whether /// it matches UTF-8 or if it matches the empty string. /// /// The HIR of a regular expression represents an intermediate step between /// its abstract syntax (a structured description of the concrete syntax) and /// an actual regex matcher. The purpose of HIR is to make regular expressions /// easier to analyze. In particular, the AST is much more complex than the /// HIR. For example, while an AST supports arbitrarily nested character /// classes, the HIR will flatten all nested classes into a single set. The HIR /// will also "compile away" every flag present in the concrete syntax. 
For /// example, users of HIR expressions never need to worry about case folding; /// it is handled automatically by the translator (e.g., by translating /// `(?i:A)` to `[aA]`). /// /// The specific type of an HIR expression can be accessed via its `kind` /// or `into_kind` methods. This extra level of indirection exists for two /// reasons: /// /// 1. Construction of an HIR expression *must* use the constructor methods on /// this `Hir` type instead of building the `HirKind` values directly. This /// permits construction to enforce invariants like "concatenations always /// consist of two or more sub-expressions." /// 2. Every HIR expression contains attributes that are defined inductively, /// and can be computed cheaply during the construction process. For example, /// one such attribute is whether the expression must match at the beginning of /// the haystack. /// /// In particular, if you have an `HirKind` value, then there is intentionally /// no way to build an `Hir` value from it. You instead need to do case /// analysis on the `HirKind` value and build the `Hir` value using its smart /// constructors. /// /// # UTF-8 /// /// If the HIR was produced by a translator with /// [`TranslatorBuilder::utf8`](translate::TranslatorBuilder::utf8) enabled, /// then the HIR is guaranteed to match UTF-8 exclusively for all non-empty /// matches. /// /// For empty matches, those can occur at any position. It is the /// responsibility of the regex engine to determine whether empty matches are /// permitted between the code units of a single codepoint. /// /// # Stack space /// /// This type defines its own destructor that uses constant stack space and /// heap space proportional to the size of the HIR. /// /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular /// expression pattern string, and uses constant stack space and heap space /// proportional to the size of the `Hir`. 
The regex it prints is guaranteed to /// be _semantically_ equivalent to the original concrete syntax, but it may /// look very different. (And potentially not practically readable by a human.) /// /// An `Hir`'s `fmt::Debug` implementation currently does not use constant /// stack space. The implementation will also suppress some details (such as /// the `Properties` inlined into every `Hir` value to make it less noisy). #[derive(Clone, Eq, PartialEq)] pub struct Hir { /// The underlying HIR kind. kind: HirKind, /// Analysis info about this HIR, computed during construction. props: Properties, } /// Methods for accessing the underlying `HirKind` and `Properties`. impl Hir { /// Returns a reference to the underlying HIR kind. pub fn kind(&self) -> &HirKind { &self.kind } /// Consumes ownership of this HIR expression and returns its underlying /// `HirKind`. pub fn into_kind(mut self) -> HirKind { core::mem::replace(&mut self.kind, HirKind::Empty) } /// Returns the properties computed for this `Hir`. pub fn properties(&self) -> &Properties { &self.props } /// Splits this HIR into its constituent parts. /// /// This is useful because `let Hir { kind, props } = hir;` does not work /// because of `Hir`'s custom `Drop` implementation. fn into_parts(mut self) -> (HirKind, Properties) { ( core::mem::replace(&mut self.kind, HirKind::Empty), core::mem::replace(&mut self.props, Properties::empty()), ) } } /// Smart constructors for HIR values. /// /// These constructors are called "smart" because they do inductive work or /// simplifications. For example, calling `Hir::repetition` with a repetition /// like `a{0}` will actually return a `Hir` with a `HirKind::Empty` kind /// since it is equivalent to an empty regex. Another example is calling /// `Hir::concat(vec![expr])`. Instead of getting a `HirKind::Concat`, you'll /// just get back the original `expr` since it's precisely equivalent. 
/// /// Smart constructors enable maintaining invariants about the HIR data type /// while also simulanteously keeping the representation as simple as possible. impl Hir { /// Returns an empty HIR expression. /// /// An empty HIR expression always matches, including the empty string. #[inline] pub fn empty() -> Hir { let props = Properties::empty(); Hir { kind: HirKind::Empty, props } } /// Returns an HIR expression that can never match anything. That is, /// the size of the set of strings in the language described by the HIR /// returned is `0`. /// /// This is distinct from [`Hir::empty`] in that the empty string matches /// the HIR returned by `Hir::empty`. That is, the set of strings in the /// language describe described by `Hir::empty` is non-empty. /// /// Note that currently, the HIR returned uses an empty character class to /// indicate that nothing can match. An equivalent expression that cannot /// match is an empty alternation, but all such "fail" expressions are /// normalized (via smart constructors) to empty character classes. This is /// because empty character classes can be spelled in the concrete syntax /// of a regex (e.g., `\P{any}` or `(?-u:[^\x00-\xFF])` or `[a&&b]`), but /// empty alternations cannot. #[inline] pub fn fail() -> Hir { let class = Class::Bytes(ClassBytes::empty()); let props = Properties::class(&class); // We can't just call Hir::class here because it defers to Hir::fail // in order to canonicalize the Hir value used to represent "cannot // match." Hir { kind: HirKind::Class(class), props } } /// Creates a literal HIR expression. /// /// This accepts anything that can be converted into a `Box<[u8]>`. /// /// Note that there is no mechanism for storing a `char` or a `Box` /// in an HIR. Everything is "just bytes." Whether a `Literal` (or /// any HIR node) matches valid UTF-8 exclusively can be queried via /// [`Properties::is_utf8`]. 
/// /// # Example /// /// This example shows that concatenations of `Literal` HIR values will /// automatically get flattened and combined together. So for example, even /// if you concat multiple `Literal` values that are themselves not valid /// UTF-8, they might add up to valid UTF-8. This also demonstrates just /// how "smart" Hir's smart constructors are. /// /// ``` /// use regex_syntax::hir::{Hir, HirKind, Literal}; /// /// let literals = vec![ /// Hir::literal([0xE2]), /// Hir::literal([0x98]), /// Hir::literal([0x83]), /// ]; /// // Each literal, on its own, is invalid UTF-8. /// assert!(literals.iter().all(|hir| !hir.properties().is_utf8())); /// /// let concat = Hir::concat(literals); /// // But the concatenation is valid UTF-8! /// assert!(concat.properties().is_utf8()); /// /// // And also notice that the literals have been concatenated into a /// // single `Literal`, to the point where there is no explicit `Concat`! /// let expected = HirKind::Literal(Literal(Box::from("☃".as_bytes()))); /// assert_eq!(&expected, concat.kind()); /// ``` #[inline] pub fn literal>>(lit: B) -> Hir { let bytes = lit.into(); if bytes.is_empty() { return Hir::empty(); } let lit = Literal(bytes); let props = Properties::literal(&lit); Hir { kind: HirKind::Literal(lit), props } } /// Creates a class HIR expression. The class may either be defined over /// ranges of Unicode codepoints or ranges of raw byte values. /// /// Note that an empty class is permitted. An empty class is equivalent to /// `Hir::fail()`. #[inline] pub fn class(class: Class) -> Hir { if class.is_empty() { return Hir::fail(); } else if let Some(bytes) = class.literal() { return Hir::literal(bytes); } let props = Properties::class(&class); Hir { kind: HirKind::Class(class), props } } /// Creates a look-around assertion HIR expression. #[inline] pub fn look(look: Look) -> Hir { let props = Properties::look(look); Hir { kind: HirKind::Look(look), props } } /// Creates a repetition HIR expression. 
#[inline] pub fn repetition(mut rep: Repetition) -> Hir { // If the sub-expression of a repetition can only match the empty // string, then we force its maximum to be at most 1. if rep.sub.properties().maximum_len() == Some(0) { rep.min = cmp::min(rep.min, 1); rep.max = rep.max.map(|n| cmp::min(n, 1)).or(Some(1)); } // The regex 'a{0}' is always equivalent to the empty regex. This is // true even when 'a' is an expression that never matches anything // (like '\P{any}'). // // Additionally, the regex 'a{1}' is always equivalent to 'a'. if rep.min == 0 && rep.max == Some(0) { return Hir::empty(); } else if rep.min == 1 && rep.max == Some(1) { return *rep.sub; } let props = Properties::repetition(&rep); Hir { kind: HirKind::Repetition(rep), props } } /// Creates a capture HIR expression. /// /// Note that there is no explicit HIR value for a non-capturing group. /// Since a non-capturing group only exists to override precedence in the /// concrete syntax and since an HIR already does its own grouping based on /// what is parsed, there is no need to explicitly represent non-capturing /// groups in the HIR. #[inline] pub fn capture(capture: Capture) -> Hir { let props = Properties::capture(&capture); Hir { kind: HirKind::Capture(capture), props } } /// Returns the concatenation of the given expressions. /// /// This attempts to flatten and simplify the concatenation as appropriate. /// /// # Example /// /// This shows a simple example of basic flattening of both concatenations /// and literals. /// /// ``` /// use regex_syntax::hir::Hir; /// /// let hir = Hir::concat(vec![ /// Hir::concat(vec![ /// Hir::literal([b'a']), /// Hir::literal([b'b']), /// Hir::literal([b'c']), /// ]), /// Hir::concat(vec![ /// Hir::literal([b'x']), /// Hir::literal([b'y']), /// Hir::literal([b'z']), /// ]), /// ]); /// let expected = Hir::literal("abcxyz".as_bytes()); /// assert_eq!(expected, hir); /// ``` pub fn concat(subs: Vec) -> Hir { // We rebuild the concatenation by simplifying it. 
Would be nice to do // it in place, but that seems a little tricky? let mut new = vec![]; // This gobbles up any adjacent literals in a concatenation and smushes // them together. Basically, when we see a literal, we add its bytes // to 'prior_lit', and whenever we see anything else, we first take // any bytes in 'prior_lit' and add it to the 'new' concatenation. let mut prior_lit: Option> = None; for sub in subs { let (kind, props) = sub.into_parts(); match kind { HirKind::Literal(Literal(bytes)) => { if let Some(ref mut prior_bytes) = prior_lit { prior_bytes.extend_from_slice(&bytes); } else { prior_lit = Some(bytes.to_vec()); } } // We also flatten concats that are direct children of another // concat. We only need to do this one level deep since // Hir::concat is the only way to build concatenations, and so // flattening happens inductively. HirKind::Concat(subs2) => { for sub2 in subs2 { let (kind2, props2) = sub2.into_parts(); match kind2 { HirKind::Literal(Literal(bytes)) => { if let Some(ref mut prior_bytes) = prior_lit { prior_bytes.extend_from_slice(&bytes); } else { prior_lit = Some(bytes.to_vec()); } } kind2 => { if let Some(prior_bytes) = prior_lit.take() { new.push(Hir::literal(prior_bytes)); } new.push(Hir { kind: kind2, props: props2 }); } } } } // We can just skip empty HIRs. HirKind::Empty => {} kind => { if let Some(prior_bytes) = prior_lit.take() { new.push(Hir::literal(prior_bytes)); } new.push(Hir { kind, props }); } } } if let Some(prior_bytes) = prior_lit.take() { new.push(Hir::literal(prior_bytes)); } if new.is_empty() { return Hir::empty(); } else if new.len() == 1 { return new.pop().unwrap(); } let props = Properties::concat(&new); Hir { kind: HirKind::Concat(new), props } } /// Returns the alternation of the given expressions. /// /// This flattens and simplifies the alternation as appropriate. This may /// include factoring out common prefixes or even rewriting the alternation /// as a character class. 
/// /// Note that an empty alternation is equivalent to `Hir::fail()`. (It /// is not possible for one to write an empty alternation, or even an /// alternation with a single sub-expression, in the concrete syntax of a /// regex.) /// /// # Example /// /// This is a simple example showing how an alternation might get /// simplified. /// /// ``` /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; /// /// let hir = Hir::alternation(vec![ /// Hir::literal([b'a']), /// Hir::literal([b'b']), /// Hir::literal([b'c']), /// Hir::literal([b'd']), /// Hir::literal([b'e']), /// Hir::literal([b'f']), /// ]); /// let expected = Hir::class(Class::Unicode(ClassUnicode::new([ /// ClassUnicodeRange::new('a', 'f'), /// ]))); /// assert_eq!(expected, hir); /// ``` /// /// And another example showing how common prefixes might get factored /// out. /// /// ``` /// use regex_syntax::hir::{Hir, Class, ClassUnicode, ClassUnicodeRange}; /// /// let hir = Hir::alternation(vec![ /// Hir::concat(vec![ /// Hir::literal("abc".as_bytes()), /// Hir::class(Class::Unicode(ClassUnicode::new([ /// ClassUnicodeRange::new('A', 'Z'), /// ]))), /// ]), /// Hir::concat(vec![ /// Hir::literal("abc".as_bytes()), /// Hir::class(Class::Unicode(ClassUnicode::new([ /// ClassUnicodeRange::new('a', 'z'), /// ]))), /// ]), /// ]); /// let expected = Hir::concat(vec![ /// Hir::literal("abc".as_bytes()), /// Hir::alternation(vec![ /// Hir::class(Class::Unicode(ClassUnicode::new([ /// ClassUnicodeRange::new('A', 'Z'), /// ]))), /// Hir::class(Class::Unicode(ClassUnicode::new([ /// ClassUnicodeRange::new('a', 'z'), /// ]))), /// ]), /// ]); /// assert_eq!(expected, hir); /// ``` /// /// Note that these sorts of simplifications are not guaranteed. pub fn alternation(subs: Vec) -> Hir { // We rebuild the alternation by simplifying it. We proceed similarly // as the concatenation case. But in this case, there's no literal // simplification happening. We're just flattening alternations. 
let mut new = Vec::with_capacity(subs.len()); for sub in subs { let (kind, props) = sub.into_parts(); match kind { HirKind::Alternation(subs2) => { new.extend(subs2); } kind => { new.push(Hir { kind, props }); } } } if new.is_empty() { return Hir::fail(); } else if new.len() == 1 { return new.pop().unwrap(); } // Now that it's completely flattened, look for the special case of // 'char1|char2|...|charN' and collapse that into a class. Note that // we look for 'char' first and then bytes. The issue here is that if // we find both non-ASCII codepoints and non-ASCII singleton bytes, // then it isn't actually possible to smush them into a single class. // (Because classes are either "all codepoints" or "all bytes." You // can have a class that both matches non-ASCII but valid UTF-8 and // invalid UTF-8.) So we look for all chars and then all bytes, and // don't handle anything else. if let Some(singletons) = singleton_chars(&new) { let it = singletons .into_iter() .map(|ch| ClassUnicodeRange { start: ch, end: ch }); return Hir::class(Class::Unicode(ClassUnicode::new(it))); } if let Some(singletons) = singleton_bytes(&new) { let it = singletons .into_iter() .map(|b| ClassBytesRange { start: b, end: b }); return Hir::class(Class::Bytes(ClassBytes::new(it))); } // Similar to singleton chars, we can also look for alternations of // classes. Those can be smushed into a single class. if let Some(cls) = class_chars(&new) { return Hir::class(cls); } if let Some(cls) = class_bytes(&new) { return Hir::class(cls); } // Factor out a common prefix if we can, which might potentially // simplify the expression and unlock other optimizations downstream. // It also might generally make NFA matching and DFA construction // faster by reducing the scope of branching in the regex. 
new = match lift_common_prefix(new) { Ok(hir) => return hir, Err(unchanged) => unchanged, }; let props = Properties::alternation(&new); Hir { kind: HirKind::Alternation(new), props } } /// Returns an HIR expression for `.`. /// /// * [`Dot::AnyChar`] maps to `(?su-R:.)`. /// * [`Dot::AnyByte`] maps to `(?s-Ru:.)`. /// * [`Dot::AnyCharExceptLF`] maps to `(?u-Rs:.)`. /// * [`Dot::AnyCharExceptCRLF`] maps to `(?Ru-s:.)`. /// * [`Dot::AnyByteExceptLF`] maps to `(?-Rsu:.)`. /// * [`Dot::AnyByteExceptCRLF`] maps to `(?R-su:.)`. /// /// # Example /// /// Note that this is a convenience routine for constructing the correct /// character class based on the value of `Dot`. There is no explicit "dot" /// HIR value. It is just an abbreviation for a common character class. /// /// ``` /// use regex_syntax::hir::{Hir, Dot, Class, ClassBytes, ClassBytesRange}; /// /// let hir = Hir::dot(Dot::AnyByte); /// let expected = Hir::class(Class::Bytes(ClassBytes::new([ /// ClassBytesRange::new(0x00, 0xFF), /// ]))); /// assert_eq!(expected, hir); /// ``` #[inline] pub fn dot(dot: Dot) -> Hir { match dot { Dot::AnyChar => { let mut cls = ClassUnicode::empty(); cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); Hir::class(Class::Unicode(cls)) } Dot::AnyByte => { let mut cls = ClassBytes::empty(); cls.push(ClassBytesRange::new(b'\0', b'\xFF')); Hir::class(Class::Bytes(cls)) } Dot::AnyCharExcept(ch) => { let mut cls = ClassUnicode::new([ClassUnicodeRange::new(ch, ch)]); cls.negate(); Hir::class(Class::Unicode(cls)) } Dot::AnyCharExceptLF => { let mut cls = ClassUnicode::empty(); cls.push(ClassUnicodeRange::new('\0', '\x09')); cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); Hir::class(Class::Unicode(cls)) } Dot::AnyCharExceptCRLF => { let mut cls = ClassUnicode::empty(); cls.push(ClassUnicodeRange::new('\0', '\x09')); cls.push(ClassUnicodeRange::new('\x0B', '\x0C')); cls.push(ClassUnicodeRange::new('\x0E', '\u{10FFFF}')); Hir::class(Class::Unicode(cls)) } Dot::AnyByteExcept(byte) 
=> { let mut cls = ClassBytes::new([ClassBytesRange::new(byte, byte)]); cls.negate(); Hir::class(Class::Bytes(cls)) } Dot::AnyByteExceptLF => { let mut cls = ClassBytes::empty(); cls.push(ClassBytesRange::new(b'\0', b'\x09')); cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); Hir::class(Class::Bytes(cls)) } Dot::AnyByteExceptCRLF => { let mut cls = ClassBytes::empty(); cls.push(ClassBytesRange::new(b'\0', b'\x09')); cls.push(ClassBytesRange::new(b'\x0B', b'\x0C')); cls.push(ClassBytesRange::new(b'\x0E', b'\xFF')); Hir::class(Class::Bytes(cls)) } } } } /// The underlying kind of an arbitrary [`Hir`] expression. /// /// An `HirKind` is principally useful for doing case analysis on the type /// of a regular expression. If you're looking to build new `Hir` values, /// then you _must_ use the smart constructors defined on `Hir`, like /// [`Hir::repetition`], to build new `Hir` values. The API intentionally does /// not expose any way of building an `Hir` directly from an `HirKind`. #[derive(Clone, Debug, Eq, PartialEq)] pub enum HirKind { /// The empty regular expression, which matches everything, including the /// empty string. Empty, /// A literalstring that matches exactly these bytes. Literal(Literal), /// A single character class that matches any of the characters in the /// class. A class can either consist of Unicode scalar values as /// characters, or it can use bytes. /// /// A class may be empty. In which case, it matches nothing. Class(Class), /// A look-around assertion. A look-around match always has zero length. Look(Look), /// A repetition operation applied to a sub-expression. Repetition(Repetition), /// A capturing group, which contains a sub-expression. Capture(Capture), /// A concatenation of expressions. /// /// A concatenation matches only if each of its sub-expressions match one /// after the other. /// /// Concatenations are guaranteed by `Hir`'s smart constructors to always /// have at least two sub-expressions. 
Concat(Vec), /// An alternation of expressions. /// /// An alternation matches only if at least one of its sub-expressions /// match. If multiple sub-expressions match, then the leftmost is /// preferred. /// /// Alternations are guaranteed by `Hir`'s smart constructors to always /// have at least two sub-expressions. Alternation(Vec), } impl HirKind { /// Returns a slice of this kind's sub-expressions, if any. pub fn subs(&self) -> &[Hir] { use core::slice::from_ref; match *self { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => &[], HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub), HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub), HirKind::Concat(ref subs) => subs, HirKind::Alternation(ref subs) => subs, } } } impl core::fmt::Debug for Hir { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { self.kind.fmt(f) } } /// Print a display representation of this Hir. /// /// The result of this is a valid regular expression pattern string. /// /// This implementation uses constant stack space and heap space proportional /// to the size of the `Hir`. impl core::fmt::Display for Hir { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { crate::hir::print::Printer::new().print(self, f) } } /// The high-level intermediate representation of a literal. /// /// A literal corresponds to `0` or more bytes that should be matched /// literally. The smart constructors defined on `Hir` will automatically /// concatenate adjacent literals into one literal, and will even automatically /// replace empty literals with `Hir::empty()`. /// /// Note that despite a literal being represented by a sequence of bytes, its /// `Debug` implementation will attempt to print it as a normal string. (That /// is, not a sequence of decimal numbers.) 
#[derive(Clone, Eq, PartialEq)] pub struct Literal(pub Box<[u8]>); impl core::fmt::Debug for Literal { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { crate::debug::Bytes(&self.0).fmt(f) } } /// The high-level intermediate representation of a character class. /// /// A character class corresponds to a set of characters. A character is either /// defined by a Unicode scalar value or a byte. /// /// A character class, regardless of its character type, is represented by a /// sequence of non-overlapping non-adjacent ranges of characters. /// /// There are no guarantees about which class variant is used. Generally /// speaking, the Unicode variat is used whenever a class needs to contain /// non-ASCII Unicode scalar values. But the Unicode variant can be used even /// when Unicode mode is disabled. For example, at the time of writing, the /// regex `(?-u:a|\xc2\xa0)` will compile down to HIR for the Unicode class /// `[a\u00A0]` due to optimizations. /// /// Note that `Bytes` variant may be produced even when it exclusively matches /// valid UTF-8. This is because a `Bytes` variant represents an intention by /// the author of the regular expression to disable Unicode mode, which in turn /// impacts the semantics of case insensitive matching. For example, `(?i)k` /// and `(?i-u)k` will not match the same set of strings. #[derive(Clone, Eq, PartialEq)] pub enum Class { /// A set of characters represented by Unicode scalar values. Unicode(ClassUnicode), /// A set of characters represented by arbitrary bytes (one byte per /// character). Bytes(ClassBytes), } impl Class { /// Apply Unicode simple case folding to this character class, in place. /// The character class will be expanded to include all simple case folded /// character variants. /// /// If this is a byte oriented character class, then this will be limited /// to the ASCII ranges `A-Z` and `a-z`. 
/// /// # Panics /// /// This routine panics when the case mapping data necessary for this /// routine to complete is unavailable. This occurs when the `unicode-case` /// feature is not enabled and the underlying class is Unicode oriented. /// /// Callers should prefer using `try_case_fold_simple` instead, which will /// return an error instead of panicking. pub fn case_fold_simple(&mut self) { match *self { Class::Unicode(ref mut x) => x.case_fold_simple(), Class::Bytes(ref mut x) => x.case_fold_simple(), } } /// Apply Unicode simple case folding to this character class, in place. /// The character class will be expanded to include all simple case folded /// character variants. /// /// If this is a byte oriented character class, then this will be limited /// to the ASCII ranges `A-Z` and `a-z`. /// /// # Error /// /// This routine returns an error when the case mapping data necessary /// for this routine to complete is unavailable. This occurs when the /// `unicode-case` feature is not enabled and the underlying class is /// Unicode oriented. pub fn try_case_fold_simple( &mut self, ) -> core::result::Result<(), CaseFoldError> { match *self { Class::Unicode(ref mut x) => x.try_case_fold_simple()?, Class::Bytes(ref mut x) => x.case_fold_simple(), } Ok(()) } /// Negate this character class in place. /// /// After completion, this character class will contain precisely the /// characters that weren't previously in the class. pub fn negate(&mut self) { match *self { Class::Unicode(ref mut x) => x.negate(), Class::Bytes(ref mut x) => x.negate(), } } /// Returns true if and only if this character class will only ever match /// valid UTF-8. /// /// A character class can match invalid UTF-8 only when the following /// conditions are met: /// /// 1. The translator was configured to permit generating an expression /// that can match invalid UTF-8. (By default, this is disabled.) /// 2. 
Unicode mode (via the `u` flag) was disabled either in the concrete /// syntax or in the parser builder. By default, Unicode mode is /// enabled. pub fn is_utf8(&self) -> bool { match *self { Class::Unicode(_) => true, Class::Bytes(ref x) => x.is_ascii(), } } /// Returns the length, in bytes, of the smallest string matched by this /// character class. /// /// For non-empty byte oriented classes, this always returns `1`. For /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or /// `4`. For empty classes, `None` is returned. It is impossible for `0` to /// be returned. /// /// # Example /// /// This example shows some examples of regexes and their corresponding /// minimum length, if any. /// /// ``` /// use regex_syntax::{hir::Properties, parse}; /// /// // The empty string has a min length of 0. /// let hir = parse(r"")?; /// assert_eq!(Some(0), hir.properties().minimum_len()); /// // As do other types of regexes that only match the empty string. /// let hir = parse(r"^$\b\B")?; /// assert_eq!(Some(0), hir.properties().minimum_len()); /// // A regex that can match the empty string but match more is still 0. /// let hir = parse(r"a*")?; /// assert_eq!(Some(0), hir.properties().minimum_len()); /// // A regex that matches nothing has no minimum defined. /// let hir = parse(r"[a&&b]")?; /// assert_eq!(None, hir.properties().minimum_len()); /// // Character classes usually have a minimum length of 1. /// let hir = parse(r"\w")?; /// assert_eq!(Some(1), hir.properties().minimum_len()); /// // But sometimes Unicode classes might be bigger! /// let hir = parse(r"\p{Cyrillic}")?; /// assert_eq!(Some(2), hir.properties().minimum_len()); /// /// # Ok::<(), Box>(()) /// ``` pub fn minimum_len(&self) -> Option { match *self { Class::Unicode(ref x) => x.minimum_len(), Class::Bytes(ref x) => x.minimum_len(), } } /// Returns the length, in bytes, of the longest string matched by this /// character class. 
/// /// For non-empty byte oriented classes, this always returns `1`. For /// non-empty Unicode oriented classes, this can return `1`, `2`, `3` or /// `4`. For empty classes, `None` is returned. It is impossible for `0` to /// be returned. /// /// # Example /// /// This example shows some examples of regexes and their corresponding /// maximum length, if any. /// /// ``` /// use regex_syntax::{hir::Properties, parse}; /// /// // The empty string has a max length of 0. /// let hir = parse(r"")?; /// assert_eq!(Some(0), hir.properties().maximum_len()); /// // As do other types of regexes that only match the empty string. /// let hir = parse(r"^$\b\B")?; /// assert_eq!(Some(0), hir.properties().maximum_len()); /// // A regex that matches nothing has no maximum defined. /// let hir = parse(r"[a&&b]")?; /// assert_eq!(None, hir.properties().maximum_len()); /// // Bounded repeats work as you expect. /// let hir = parse(r"x{2,10}")?; /// assert_eq!(Some(10), hir.properties().maximum_len()); /// // An unbounded repeat means there is no maximum. /// let hir = parse(r"x{2,}")?; /// assert_eq!(None, hir.properties().maximum_len()); /// // With Unicode enabled, \w can match up to 4 bytes! /// let hir = parse(r"\w")?; /// assert_eq!(Some(4), hir.properties().maximum_len()); /// // Without Unicode enabled, \w matches at most 1 byte. /// let hir = parse(r"(?-u)\w")?; /// assert_eq!(Some(1), hir.properties().maximum_len()); /// /// # Ok::<(), Box>(()) /// ``` pub fn maximum_len(&self) -> Option { match *self { Class::Unicode(ref x) => x.maximum_len(), Class::Bytes(ref x) => x.maximum_len(), } } /// Returns true if and only if this character class is empty. That is, /// it has no elements. /// /// An empty character can never match anything, including an empty string. 
pub fn is_empty(&self) -> bool { match *self { Class::Unicode(ref x) => x.ranges().is_empty(), Class::Bytes(ref x) => x.ranges().is_empty(), } } /// If this class consists of exactly one element (whether a codepoint or a /// byte), then return it as a literal byte string. /// /// If this class is empty or contains more than one element, then `None` /// is returned. pub fn literal(&self) -> Option> { match *self { Class::Unicode(ref x) => x.literal(), Class::Bytes(ref x) => x.literal(), } } } impl core::fmt::Debug for Class { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { use crate::debug::Byte; let mut fmter = f.debug_set(); match *self { Class::Unicode(ref cls) => { for r in cls.ranges().iter() { fmter.entry(&(r.start..=r.end)); } } Class::Bytes(ref cls) => { for r in cls.ranges().iter() { fmter.entry(&(Byte(r.start)..=Byte(r.end))); } } } fmter.finish() } } /// A set of characters represented by Unicode scalar values. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassUnicode { set: IntervalSet, } impl ClassUnicode { /// Create a new class from a sequence of ranges. /// /// The given ranges do not need to be in any specific order, and ranges /// may overlap. Ranges will automatically be sorted into a canonical /// non-overlapping order. pub fn new(ranges: I) -> ClassUnicode where I: IntoIterator, { ClassUnicode { set: IntervalSet::new(ranges) } } /// Create a new class with no ranges. /// /// An empty class matches nothing. That is, it is equivalent to /// [`Hir::fail`]. pub fn empty() -> ClassUnicode { ClassUnicode::new(vec![]) } /// Add a new range to this set. pub fn push(&mut self, range: ClassUnicodeRange) { self.set.push(range); } /// Return an iterator over all ranges in this class. /// /// The iterator yields ranges in ascending order. pub fn iter(&self) -> ClassUnicodeIter<'_> { ClassUnicodeIter(self.set.iter()) } /// Return the underlying ranges as a slice. 
pub fn ranges(&self) -> &[ClassUnicodeRange] { self.set.intervals() } /// Expand this character class such that it contains all case folded /// characters, according to Unicode's "simple" mapping. For example, if /// this class consists of the range `a-z`, then applying case folding will /// result in the class containing both the ranges `a-z` and `A-Z`. /// /// # Panics /// /// This routine panics when the case mapping data necessary for this /// routine to complete is unavailable. This occurs when the `unicode-case` /// feature is not enabled. /// /// Callers should prefer using `try_case_fold_simple` instead, which will /// return an error instead of panicking. pub fn case_fold_simple(&mut self) { self.set .case_fold_simple() .expect("unicode-case feature must be enabled"); } /// Expand this character class such that it contains all case folded /// characters, according to Unicode's "simple" mapping. For example, if /// this class consists of the range `a-z`, then applying case folding will /// result in the class containing both the ranges `a-z` and `A-Z`. /// /// # Error /// /// This routine returns an error when the case mapping data necessary /// for this routine to complete is unavailable. This occurs when the /// `unicode-case` feature is not enabled. pub fn try_case_fold_simple( &mut self, ) -> core::result::Result<(), CaseFoldError> { self.set.case_fold_simple() } /// Negate this character class. /// /// For all `c` where `c` is a Unicode scalar value, if `c` was in this /// set, then it will not be in this set after negation. pub fn negate(&mut self) { self.set.negate(); } /// Union this character class with the given character class, in place. pub fn union(&mut self, other: &ClassUnicode) { self.set.union(&other.set); } /// Intersect this character class with the given character class, in /// place. 
pub fn intersect(&mut self, other: &ClassUnicode) { self.set.intersect(&other.set); } /// Subtract the given character class from this character class, in place. pub fn difference(&mut self, other: &ClassUnicode) { self.set.difference(&other.set); } /// Compute the symmetric difference of the given character classes, in /// place. /// /// This computes the symmetric difference of two character classes. This /// removes all elements in this class that are also in the given class, /// but all adds all elements from the given class that aren't in this /// class. That is, the class will contain all elements in either class, /// but will not contain any elements that are in both classes. pub fn symmetric_difference(&mut self, other: &ClassUnicode) { self.set.symmetric_difference(&other.set); } /// Returns true if and only if this character class will either match /// nothing or only ASCII bytes. Stated differently, this returns false /// if and only if this class contains a non-ASCII codepoint. pub fn is_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') } /// Returns the length, in bytes, of the smallest string matched by this /// character class. /// /// Returns `None` when the class is empty. pub fn minimum_len(&self) -> Option { let first = self.ranges().get(0)?; // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). Some(first.start.len_utf8()) } /// Returns the length, in bytes, of the longest string matched by this /// character class. /// /// Returns `None` when the class is empty. pub fn maximum_len(&self) -> Option { let last = self.ranges().last()?; // Correct because c1 < c2 implies c1.len_utf8() < c2.len_utf8(). Some(last.end.len_utf8()) } /// If this class consists of exactly one codepoint, then return it as /// a literal byte string. /// /// If this class is empty or contains more than one codepoint, then `None` /// is returned. 
pub fn literal(&self) -> Option> { let rs = self.ranges(); if rs.len() == 1 && rs[0].start == rs[0].end { Some(rs[0].start.encode_utf8(&mut [0; 4]).to_string().into_bytes()) } else { None } } /// If this class consists of only ASCII ranges, then return its /// corresponding and equivalent byte class. pub fn to_byte_class(&self) -> Option { if !self.is_ascii() { return None; } Some(ClassBytes::new(self.ranges().iter().map(|r| { // Since we are guaranteed that our codepoint range is ASCII, the // 'u8::try_from' calls below are guaranteed to be correct. ClassBytesRange { start: u8::try_from(r.start).unwrap(), end: u8::try_from(r.end).unwrap(), } }))) } } /// An iterator over all ranges in a Unicode character class. /// /// The lifetime `'a` refers to the lifetime of the underlying class. #[derive(Debug)] pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); impl<'a> Iterator for ClassUnicodeIter<'a> { type Item = &'a ClassUnicodeRange; fn next(&mut self) -> Option<&'a ClassUnicodeRange> { self.0.next() } } /// A single range of characters represented by Unicode scalar values. /// /// The range is closed. That is, the start and end of the range are included /// in the range. 
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
pub struct ClassUnicodeRange {
    start: char,
    end: char,
}

impl core::fmt::Debug for ClassUnicodeRange {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        // Print printable characters as themselves, and whitespace/control
        // characters as hex escapes, to keep the output readable.
        let start = if !self.start.is_whitespace() && !self.start.is_control()
        {
            self.start.to_string()
        } else {
            format!("0x{:X}", u32::from(self.start))
        };
        let end = if !self.end.is_whitespace() && !self.end.is_control() {
            self.end.to_string()
        } else {
            format!("0x{:X}", u32::from(self.end))
        };
        f.debug_struct("ClassUnicodeRange")
            .field("start", &start)
            .field("end", &end)
            .finish()
    }
}

impl Interval for ClassUnicodeRange {
    type Bound = char;

    #[inline]
    fn lower(&self) -> char {
        self.start
    }
    #[inline]
    fn upper(&self) -> char {
        self.end
    }
    #[inline]
    fn set_lower(&mut self, bound: char) {
        self.start = bound;
    }
    #[inline]
    fn set_upper(&mut self, bound: char) {
        self.end = bound;
    }

    /// Apply simple case folding to this Unicode scalar value range.
    ///
    /// Additional ranges are appended to the given vector. Canonical ordering
    /// is *not* maintained in the given vector.
    fn case_fold_simple(
        &self,
        ranges: &mut Vec<ClassUnicodeRange>,
    ) -> Result<(), unicode::CaseFoldError> {
        let mut folder = unicode::SimpleCaseFolder::new()?;
        if !folder.overlaps(self.start, self.end) {
            return Ok(());
        }
        let (start, end) = (u32::from(self.start), u32::from(self.end));
        for cp in (start..=end).filter_map(char::from_u32) {
            for &cp_folded in folder.mapping(cp) {
                ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
            }
        }
        Ok(())
    }
}

impl ClassUnicodeRange {
    /// Create a new Unicode scalar value range for a character class.
    ///
    /// The returned range is always in a canonical form. That is, the range
    /// returned always satisfies the invariant that `start <= end`.
    pub fn new(start: char, end: char) -> ClassUnicodeRange {
        ClassUnicodeRange::create(start, end)
    }

    /// Return the start of this range.
    ///
    /// The start of a range is always less than or equal to the end of the
    /// range.
pub fn start(&self) -> char { self.start } /// Return the end of this range. /// /// The end of a range is always greater than or equal to the start of the /// range. pub fn end(&self) -> char { self.end } /// Returns the number of codepoints in this range. pub fn len(&self) -> usize { let diff = 1 + u32::from(self.end) - u32::from(self.start); // This is likely to panic in 16-bit targets since a usize can only fit // 2^16. It's not clear what to do here, other than to return an error // when building a Unicode class that contains a range whose length // overflows usize. (Which, to be honest, is probably quite common on // 16-bit targets. For example, this would imply that '.' and '\p{any}' // would be impossible to build.) usize::try_from(diff).expect("char class len fits in usize") } } /// A set of characters represented by arbitrary bytes. /// /// Each byte corresponds to one character. #[derive(Clone, Debug, Eq, PartialEq)] pub struct ClassBytes { set: IntervalSet, } impl ClassBytes { /// Create a new class from a sequence of ranges. /// /// The given ranges do not need to be in any specific order, and ranges /// may overlap. Ranges will automatically be sorted into a canonical /// non-overlapping order. pub fn new(ranges: I) -> ClassBytes where I: IntoIterator, { ClassBytes { set: IntervalSet::new(ranges) } } /// Create a new class with no ranges. /// /// An empty class matches nothing. That is, it is equivalent to /// [`Hir::fail`]. pub fn empty() -> ClassBytes { ClassBytes::new(vec![]) } /// Add a new range to this set. pub fn push(&mut self, range: ClassBytesRange) { self.set.push(range); } /// Return an iterator over all ranges in this class. /// /// The iterator yields ranges in ascending order. pub fn iter(&self) -> ClassBytesIter<'_> { ClassBytesIter(self.set.iter()) } /// Return the underlying ranges as a slice. 
pub fn ranges(&self) -> &[ClassBytesRange] { self.set.intervals() } /// Expand this character class such that it contains all case folded /// characters. For example, if this class consists of the range `a-z`, /// then applying case folding will result in the class containing both the /// ranges `a-z` and `A-Z`. /// /// Note that this only applies ASCII case folding, which is limited to the /// characters `a-z` and `A-Z`. pub fn case_fold_simple(&mut self) { self.set.case_fold_simple().expect("ASCII case folding never fails"); } /// Negate this byte class. /// /// For all `b` where `b` is a any byte, if `b` was in this set, then it /// will not be in this set after negation. pub fn negate(&mut self) { self.set.negate(); } /// Union this byte class with the given byte class, in place. pub fn union(&mut self, other: &ClassBytes) { self.set.union(&other.set); } /// Intersect this byte class with the given byte class, in place. pub fn intersect(&mut self, other: &ClassBytes) { self.set.intersect(&other.set); } /// Subtract the given byte class from this byte class, in place. pub fn difference(&mut self, other: &ClassBytes) { self.set.difference(&other.set); } /// Compute the symmetric difference of the given byte classes, in place. /// /// This computes the symmetric difference of two byte classes. This /// removes all elements in this class that are also in the given class, /// but all adds all elements from the given class that aren't in this /// class. That is, the class will contain all elements in either class, /// but will not contain any elements that are in both classes. pub fn symmetric_difference(&mut self, other: &ClassBytes) { self.set.symmetric_difference(&other.set); } /// Returns true if and only if this character class will either match /// nothing or only ASCII bytes. Stated differently, this returns false /// if and only if this class contains a non-ASCII byte. 
pub fn is_ascii(&self) -> bool { self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) } /// Returns the length, in bytes, of the smallest string matched by this /// character class. /// /// Returns `None` when the class is empty. pub fn minimum_len(&self) -> Option { if self.ranges().is_empty() { None } else { Some(1) } } /// Returns the length, in bytes, of the longest string matched by this /// character class. /// /// Returns `None` when the class is empty. pub fn maximum_len(&self) -> Option { if self.ranges().is_empty() { None } else { Some(1) } } /// If this class consists of exactly one byte, then return it as /// a literal byte string. /// /// If this class is empty or contains more than one byte, then `None` /// is returned. pub fn literal(&self) -> Option> { let rs = self.ranges(); if rs.len() == 1 && rs[0].start == rs[0].end { Some(vec![rs[0].start]) } else { None } } /// If this class consists of only ASCII ranges, then return its /// corresponding and equivalent Unicode class. pub fn to_unicode_class(&self) -> Option { if !self.is_ascii() { return None; } Some(ClassUnicode::new(self.ranges().iter().map(|r| { // Since we are guaranteed that our byte range is ASCII, the // 'char::from' calls below are correct and will not erroneously // convert a raw byte value into its corresponding codepoint. ClassUnicodeRange { start: char::from(r.start), end: char::from(r.end), } }))) } } /// An iterator over all ranges in a byte character class. /// /// The lifetime `'a` refers to the lifetime of the underlying class. #[derive(Debug)] pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); impl<'a> Iterator for ClassBytesIter<'a> { type Item = &'a ClassBytesRange; fn next(&mut self) -> Option<&'a ClassBytesRange> { self.0.next() } } /// A single range of characters represented by arbitrary bytes. /// /// The range is closed. That is, the start and end of the range are included /// in the range. 
#[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
pub struct ClassBytesRange {
    start: u8,
    end: u8,
}

impl Interval for ClassBytesRange {
    type Bound = u8;

    #[inline]
    fn lower(&self) -> u8 {
        self.start
    }
    #[inline]
    fn upper(&self) -> u8 {
        self.end
    }
    #[inline]
    fn set_lower(&mut self, bound: u8) {
        self.start = bound;
    }
    #[inline]
    fn set_upper(&mut self, bound: u8) {
        self.end = bound;
    }

    /// Apply simple case folding to this byte range. Only ASCII case mappings
    /// (for a-z) are applied.
    ///
    /// Additional ranges are appended to the given vector. Canonical ordering
    /// is *not* maintained in the given vector.
    fn case_fold_simple(
        &self,
        ranges: &mut Vec<ClassBytesRange>,
    ) -> Result<(), unicode::CaseFoldError> {
        // 'a'..'z' and 'A'..'Z' differ by exactly 32 in ASCII, so each
        // overlapping sub-range maps to its counterpart by a +/-32 shift.
        if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
            let lower = cmp::max(self.start, b'a');
            let upper = cmp::min(self.end, b'z');
            ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
        }
        if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
            let lower = cmp::max(self.start, b'A');
            let upper = cmp::min(self.end, b'Z');
            ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
        }
        Ok(())
    }
}

impl ClassBytesRange {
    /// Create a new byte range for a character class.
    ///
    /// The returned range is always in a canonical form. That is, the range
    /// returned always satisfies the invariant that `start <= end`.
    pub fn new(start: u8, end: u8) -> ClassBytesRange {
        ClassBytesRange::create(start, end)
    }

    /// Return the start of this range.
    ///
    /// The start of a range is always less than or equal to the end of the
    /// range.
    pub fn start(&self) -> u8 {
        self.start
    }

    /// Return the end of this range.
    ///
    /// The end of a range is always greater than or equal to the start of the
    /// range.
    pub fn end(&self) -> u8 {
        self.end
    }

    /// Returns the number of bytes in this range.
pub fn len(&self) -> usize { usize::from(self.end.checked_sub(self.start).unwrap()) .checked_add(1) .unwrap() } } impl core::fmt::Debug for ClassBytesRange { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("ClassBytesRange") .field("start", &crate::debug::Byte(self.start)) .field("end", &crate::debug::Byte(self.end)) .finish() } } /// The high-level intermediate representation for a look-around assertion. /// /// An assertion match is always zero-length. Also called an "empty match." #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum Look { /// Match the beginning of text. Specifically, this matches at the starting /// position of the input. Start = 1 << 0, /// Match the end of text. Specifically, this matches at the ending /// position of the input. End = 1 << 1, /// Match the beginning of a line or the beginning of text. Specifically, /// this matches at the starting position of the input, or at the position /// immediately following a `\n` character. StartLF = 1 << 2, /// Match the end of a line or the end of text. Specifically, this matches /// at the end position of the input, or at the position immediately /// preceding a `\n` character. EndLF = 1 << 3, /// Match the beginning of a line or the beginning of text. Specifically, /// this matches at the starting position of the input, or at the position /// immediately following either a `\r` or `\n` character, but never after /// a `\r` when a `\n` follows. StartCRLF = 1 << 4, /// Match the end of a line or the end of text. Specifically, this matches /// at the end position of the input, or at the position immediately /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r` /// precedes it. EndCRLF = 1 << 5, /// Match an ASCII-only word boundary. That is, this matches a position /// where the left adjacent character and right adjacent character /// correspond to a word and non-word or a non-word and word character. 
WordAscii = 1 << 6, /// Match an ASCII-only negation of a word boundary. WordAsciiNegate = 1 << 7, /// Match a Unicode-aware word boundary. That is, this matches a position /// where the left adjacent character and right adjacent character /// correspond to a word and non-word or a non-word and word character. WordUnicode = 1 << 8, /// Match a Unicode-aware negation of a word boundary. WordUnicodeNegate = 1 << 9, /// Match the start of an ASCII-only word boundary. That is, this matches a /// position at either the beginning of the haystack or where the previous /// character is not a word character and the following character is a word /// character. WordStartAscii = 1 << 10, /// Match the end of an ASCII-only word boundary. That is, this matches /// a position at either the end of the haystack or where the previous /// character is a word character and the following character is not a word /// character. WordEndAscii = 1 << 11, /// Match the start of a Unicode word boundary. That is, this matches a /// position at either the beginning of the haystack or where the previous /// character is not a word character and the following character is a word /// character. WordStartUnicode = 1 << 12, /// Match the end of a Unicode word boundary. That is, this matches a /// position at either the end of the haystack or where the previous /// character is a word character and the following character is not a word /// character. WordEndUnicode = 1 << 13, /// Match the start half of an ASCII-only word boundary. That is, this /// matches a position at either the beginning of the haystack or where the /// previous character is not a word character. WordStartHalfAscii = 1 << 14, /// Match the end half of an ASCII-only word boundary. That is, this /// matches a position at either the end of the haystack or where the /// following character is not a word character. WordEndHalfAscii = 1 << 15, /// Match the start half of a Unicode word boundary. 
That is, this matches /// a position at either the beginning of the haystack or where the /// previous character is not a word character. WordStartHalfUnicode = 1 << 16, /// Match the end half of a Unicode word boundary. That is, this matches /// a position at either the end of the haystack or where the following /// character is not a word character. WordEndHalfUnicode = 1 << 17, } impl Look { /// Flip the look-around assertion to its equivalent for reverse searches. /// For example, `StartLF` gets translated to `EndLF`. /// /// Some assertions, such as `WordUnicode`, remain the same since they /// match the same positions regardless of the direction of the search. #[inline] pub const fn reversed(self) -> Look { match self { Look::Start => Look::End, Look::End => Look::Start, Look::StartLF => Look::EndLF, Look::EndLF => Look::StartLF, Look::StartCRLF => Look::EndCRLF, Look::EndCRLF => Look::StartCRLF, Look::WordAscii => Look::WordAscii, Look::WordAsciiNegate => Look::WordAsciiNegate, Look::WordUnicode => Look::WordUnicode, Look::WordUnicodeNegate => Look::WordUnicodeNegate, Look::WordStartAscii => Look::WordEndAscii, Look::WordEndAscii => Look::WordStartAscii, Look::WordStartUnicode => Look::WordEndUnicode, Look::WordEndUnicode => Look::WordStartUnicode, Look::WordStartHalfAscii => Look::WordEndHalfAscii, Look::WordEndHalfAscii => Look::WordStartHalfAscii, Look::WordStartHalfUnicode => Look::WordEndHalfUnicode, Look::WordEndHalfUnicode => Look::WordStartHalfUnicode, } } /// Return the underlying representation of this look-around enumeration /// as an integer. Giving the return value to the [`Look::from_repr`] /// constructor is guaranteed to return the same look-around variant that /// one started with within a semver compatible release of this crate. #[inline] pub const fn as_repr(self) -> u32 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. 
self as u32 } /// Given the underlying representation of a `Look` value, return the /// corresponding `Look` value if the representation is valid. Otherwise /// `None` is returned. #[inline] pub const fn from_repr(repr: u32) -> Option { match repr { 0b00_0000_0000_0000_0001 => Some(Look::Start), 0b00_0000_0000_0000_0010 => Some(Look::End), 0b00_0000_0000_0000_0100 => Some(Look::StartLF), 0b00_0000_0000_0000_1000 => Some(Look::EndLF), 0b00_0000_0000_0001_0000 => Some(Look::StartCRLF), 0b00_0000_0000_0010_0000 => Some(Look::EndCRLF), 0b00_0000_0000_0100_0000 => Some(Look::WordAscii), 0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate), 0b00_0000_0001_0000_0000 => Some(Look::WordUnicode), 0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate), 0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii), 0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii), 0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode), 0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode), 0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii), 0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii), 0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode), 0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode), _ => None, } } /// Returns a convenient single codepoint representation of this /// look-around assertion. Each assertion is guaranteed to be represented /// by a distinct character. /// /// This is useful for succinctly representing a look-around assertion in /// human friendly but succinct output intended for a programmer working on /// regex internals. 
#[inline] pub const fn as_char(self) -> char { match self { Look::Start => 'A', Look::End => 'z', Look::StartLF => '^', Look::EndLF => '$', Look::StartCRLF => 'r', Look::EndCRLF => 'R', Look::WordAscii => 'b', Look::WordAsciiNegate => 'B', Look::WordUnicode => '𝛃', Look::WordUnicodeNegate => '𝚩', Look::WordStartAscii => '<', Look::WordEndAscii => '>', Look::WordStartUnicode => '〈', Look::WordEndUnicode => '〉', Look::WordStartHalfAscii => '◁', Look::WordEndHalfAscii => '▷', Look::WordStartHalfUnicode => '◀', Look::WordEndHalfUnicode => '▶', } } } /// The high-level intermediate representation for a capturing group. /// /// A capturing group always has an index and a child expression. It may /// also have a name associated with it (e.g., `(?P\w)`), but it's not /// necessary. /// /// Note that there is no explicit representation of a non-capturing group /// in a `Hir`. Instead, non-capturing grouping is handled automatically by /// the recursive structure of the `Hir` itself. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Capture { /// The capture index of the capture. pub index: u32, /// The name of the capture, if it exists. pub name: Option>, /// The expression inside the capturing group, which may be empty. pub sub: Box, } /// The high-level intermediate representation of a repetition operator. /// /// A repetition operator permits the repetition of an arbitrary /// sub-expression. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Repetition { /// The minimum range of the repetition. /// /// Note that special cases like `?`, `+` and `*` all get translated into /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively. /// /// When `min` is zero, this expression can match the empty string /// regardless of what its sub-expression is. pub min: u32, /// The maximum range of the repetition. /// /// Note that when `max` is `None`, `min` acts as a lower bound but where /// there is no upper bound. 
For something like `x{5}` where the min and /// max are equivalent, `min` will be set to `5` and `max` will be set to /// `Some(5)`. pub max: Option, /// Whether this repetition operator is greedy or not. A greedy operator /// will match as much as it can. A non-greedy operator will match as /// little as it can. /// /// Typically, operators are greedy by default and are only non-greedy when /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is /// not. However, this can be inverted via the `U` "ungreedy" flag. pub greedy: bool, /// The expression being repeated. pub sub: Box, } impl Repetition { /// Returns a new repetition with the same `min`, `max` and `greedy` /// values, but with its sub-expression replaced with the one given. pub fn with(&self, sub: Hir) -> Repetition { Repetition { min: self.min, max: self.max, greedy: self.greedy, sub: Box::new(sub), } } } /// A type describing the different flavors of `.`. /// /// This type is meant to be used with [`Hir::dot`], which is a convenience /// routine for building HIR values derived from the `.` regex. #[non_exhaustive] #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub enum Dot { /// Matches the UTF-8 encoding of any Unicode scalar value. /// /// This is equivalent to `(?su:.)` and also `\p{any}`. AnyChar, /// Matches any byte value. /// /// This is equivalent to `(?s-u:.)` and also `(?-u:[\x00-\xFF])`. AnyByte, /// Matches the UTF-8 encoding of any Unicode scalar value except for the /// `char` given. /// /// This is equivalent to using `(?u-s:.)` with the line terminator set /// to a particular ASCII byte. (Because of peculiarities in the regex /// engines, a line terminator must be a single byte. It follows that when /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar /// value. That is, ti must be ASCII.) /// /// (This and `AnyCharExceptLF` both exist because of legacy reasons. /// `AnyCharExceptLF` will be dropped in the next breaking change release.) 
AnyCharExcept(char), /// Matches the UTF-8 encoding of any Unicode scalar value except for `\n`. /// /// This is equivalent to `(?u-s:.)` and also `[\p{any}--\n]`. AnyCharExceptLF, /// Matches the UTF-8 encoding of any Unicode scalar value except for `\r` /// and `\n`. /// /// This is equivalent to `(?uR-s:.)` and also `[\p{any}--\r\n]`. AnyCharExceptCRLF, /// Matches any byte value except for the `u8` given. /// /// This is equivalent to using `(?-us:.)` with the line terminator set /// to a particular ASCII byte. (Because of peculiarities in the regex /// engines, a line terminator must be a single byte. It follows that when /// UTF-8 mode is enabled, this single byte must also be a Unicode scalar /// value. That is, ti must be ASCII.) /// /// (This and `AnyByteExceptLF` both exist because of legacy reasons. /// `AnyByteExceptLF` will be dropped in the next breaking change release.) AnyByteExcept(u8), /// Matches any byte value except for `\n`. /// /// This is equivalent to `(?-su:.)` and also `(?-u:[[\x00-\xFF]--\n])`. AnyByteExceptLF, /// Matches any byte value except for `\r` and `\n`. /// /// This is equivalent to `(?R-su:.)` and also `(?-u:[[\x00-\xFF]--\r\n])`. AnyByteExceptCRLF, } /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack /// space but heap space proportional to the depth of the total `Hir`. 
impl Drop for Hir { fn drop(&mut self) { use core::mem; match *self.kind() { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => return, HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return, HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => { return } HirKind::Concat(ref x) if x.is_empty() => return, HirKind::Alternation(ref x) if x.is_empty() => return, _ => {} } let mut stack = vec![mem::replace(self, Hir::empty())]; while let Some(mut expr) = stack.pop() { match expr.kind { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {} HirKind::Capture(ref mut x) => { stack.push(mem::replace(&mut x.sub, Hir::empty())); } HirKind::Repetition(ref mut x) => { stack.push(mem::replace(&mut x.sub, Hir::empty())); } HirKind::Concat(ref mut x) => { stack.extend(x.drain(..)); } HirKind::Alternation(ref mut x) => { stack.extend(x.drain(..)); } } } } } /// A type that collects various properties of an HIR value. /// /// Properties are always scalar values and represent meta data that is /// computed inductively on an HIR value. Properties are defined for all /// HIR values. /// /// All methods on a `Properties` value take constant time and are meant to /// be cheap to call. #[derive(Clone, Debug, Eq, PartialEq)] pub struct Properties(Box); /// The property definition. It is split out so that we can box it, and /// there by make `Properties` use less stack size. This is kind-of important /// because every HIR value has a `Properties` attached to it. /// /// This does have the unfortunate consequence that creating any HIR value /// always leads to at least one alloc for properties, but this is generally /// true anyway (for pretty much all HirKinds except for look-arounds). 
#[derive(Clone, Debug, Eq, PartialEq)]
struct PropertiesI {
    minimum_len: Option<usize>,
    maximum_len: Option<usize>,
    look_set: LookSet,
    look_set_prefix: LookSet,
    look_set_suffix: LookSet,
    look_set_prefix_any: LookSet,
    look_set_suffix_any: LookSet,
    utf8: bool,
    explicit_captures_len: usize,
    static_explicit_captures_len: Option<usize>,
    literal: bool,
    alternation_literal: bool,
}

impl Properties {
    /// Returns the length (in bytes) of the smallest string matched by this
    /// HIR.
    ///
    /// A return value of `0` is possible and occurs when the HIR can match an
    /// empty string.
    ///
    /// `None` is returned when there is no minimum length. This occurs in
    /// precisely the cases where the HIR matches nothing. i.e., The language
    /// the regex matches is empty. An example of such a regex is `\P{any}`.
    #[inline]
    pub fn minimum_len(&self) -> Option<usize> {
        self.0.minimum_len
    }

    /// Returns the length (in bytes) of the longest string matched by this
    /// HIR.
    ///
    /// A return value of `0` is possible and occurs when nothing longer than
    /// the empty string is in the language described by this HIR.
    ///
    /// `None` is returned when there is no longest matching string. This
    /// occurs when the HIR matches nothing or when there is no upper bound on
    /// the length of matching strings. Example of such regexes are `\P{any}`
    /// (matches nothing) and `a+` (has no upper bound).
    #[inline]
    pub fn maximum_len(&self) -> Option<usize> {
        self.0.maximum_len
    }

    /// Returns a set of all look-around assertions that appear at least once
    /// in this HIR value.
    #[inline]
    pub fn look_set(&self) -> LookSet {
        self.0.look_set
    }

    /// Returns a set of all look-around assertions that appear as a prefix for
    /// this HIR value. That is, the set returned corresponds to the set of
    /// assertions that must be passed before matching any bytes in a haystack.
    ///
    /// For example, `hir.look_set_prefix().contains(Look::Start)` returns true
    /// if and only if the HIR is fully anchored at the start.
    #[inline]
    pub fn look_set_prefix(&self) -> LookSet {
        self.0.look_set_prefix
    }

    /// Returns a set of all look-around assertions that appear as a _possible_
    /// prefix for this HIR value. That is, the set returned corresponds to the
    /// set of assertions that _may_ be passed before matching any bytes in a
    /// haystack.
    ///
    /// For example, `hir.look_set_prefix_any().contains(Look::Start)` returns
    /// true if and only if it's possible for the regex to match through an
    /// anchored assertion before consuming any input.
    #[inline]
    pub fn look_set_prefix_any(&self) -> LookSet {
        self.0.look_set_prefix_any
    }

    /// Returns a set of all look-around assertions that appear as a suffix for
    /// this HIR value. That is, the set returned corresponds to the set of
    /// assertions that must be passed in order to be considered a match after
    /// all other consuming HIR expressions.
    ///
    /// For example, `hir.look_set_suffix().contains(Look::End)` returns true
    /// if and only if the HIR is fully anchored at the end.
    #[inline]
    pub fn look_set_suffix(&self) -> LookSet {
        self.0.look_set_suffix
    }

    /// Returns a set of all look-around assertions that appear as a _possible_
    /// suffix for this HIR value. That is, the set returned corresponds to the
    /// set of assertions that _may_ be passed before matching any bytes in a
    /// haystack.
    ///
    /// For example, `hir.look_set_suffix_any().contains(Look::End)` returns
    /// true if and only if it's possible for the regex to match through an
    /// anchored assertion at the end of a match without consuming any input.
    #[inline]
    pub fn look_set_suffix_any(&self) -> LookSet {
        self.0.look_set_suffix_any
    }

    /// Return true if and only if the corresponding HIR will always match
    /// valid UTF-8.
    ///
    /// When this returns false, then it is possible for this HIR expression to
    /// match invalid UTF-8, including by matching between the code units of
    /// a single UTF-8 encoded codepoint.
    ///
    /// Note that this returns true even when the corresponding HIR can match
    /// the empty string. Since an empty string can technically appear between
    /// UTF-8 code units, it is possible for a match to be reported that splits
    /// a codepoint which could in turn be considered matching invalid UTF-8.
    /// However, it is generally assumed that such empty matches are handled
    /// specially by the search routine if it is absolutely required that
    /// matches not split a codepoint.
    ///
    /// # Example
    ///
    /// This code example shows the UTF-8 property of a variety of patterns.
    ///
    /// ```
    /// use regex_syntax::{ParserBuilder, parse};
    ///
    /// // Examples of 'is_utf8() == true'.
    /// assert!(parse(r"a")?.properties().is_utf8());
    /// assert!(parse(r"[^a]")?.properties().is_utf8());
    /// assert!(parse(r".")?.properties().is_utf8());
    /// assert!(parse(r"\W")?.properties().is_utf8());
    /// assert!(parse(r"\b")?.properties().is_utf8());
    /// assert!(parse(r"\B")?.properties().is_utf8());
    /// assert!(parse(r"(?-u)\b")?.properties().is_utf8());
    /// assert!(parse(r"(?-u)\B")?.properties().is_utf8());
    /// // Unicode mode is enabled by default, and in
    /// // that mode, all \x hex escapes are treated as
    /// // codepoints. So this actually matches the UTF-8
    /// // encoding of U+00FF.
    /// assert!(parse(r"\xFF")?.properties().is_utf8());
    ///
    /// // Now we show examples of 'is_utf8() == false'.
    /// // The only way to do this is to force the parser
    /// // to permit invalid UTF-8, otherwise all of these
    /// // would fail to parse!
    /// let parse = |pattern| {
    ///     ParserBuilder::new().utf8(false).build().parse(pattern)
    /// };
    /// assert!(!parse(r"(?-u)[^a]")?.properties().is_utf8());
    /// assert!(!parse(r"(?-u).")?.properties().is_utf8());
    /// assert!(!parse(r"(?-u)\W")?.properties().is_utf8());
    /// // Conversely to the equivalent example above,
    /// // when Unicode mode is disabled, \x hex escapes
    /// // are treated as their raw byte values.
    /// assert!(!parse(r"(?-u)\xFF")?.properties().is_utf8());
    /// // Note that just because we disabled UTF-8 in the
    /// // parser doesn't mean we still can't use Unicode.
    /// // It is enabled by default, so \xFF is still
    /// // equivalent to matching the UTF-8 encoding of
    /// // U+00FF by default.
    /// assert!(parse(r"\xFF")?.properties().is_utf8());
    /// // Even though we use raw bytes that individually
    /// // are not valid UTF-8, when combined together, the
    /// // overall expression *does* match valid UTF-8!
    /// assert!(parse(r"(?-u)\xE2\x98\x83")?.properties().is_utf8());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn is_utf8(&self) -> bool {
        self.0.utf8
    }

    /// Returns the total number of explicit capturing groups in the
    /// corresponding HIR.
    ///
    /// Note that this does not include the implicit capturing group
    /// corresponding to the entire match that is typically included by regex
    /// engines.
    ///
    /// # Example
    ///
    /// This method will return `0` for `a` and `1` for `(a)`:
    ///
    /// ```
    /// use regex_syntax::parse;
    ///
    /// assert_eq!(0, parse("a")?.properties().explicit_captures_len());
    /// assert_eq!(1, parse("(a)")?.properties().explicit_captures_len());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn explicit_captures_len(&self) -> usize {
        self.0.explicit_captures_len
    }

    /// Returns the total number of explicit capturing groups that appear in
    /// every possible match.
    ///
    /// If the number of capture groups can vary depending on the match, then
    /// this returns `None`. That is, a value is only returned when the number
    /// of matching groups is invariant or "static."
    ///
    /// Note that this does not include the implicit capturing group
    /// corresponding to the entire match.
    ///
    /// # Example
    ///
    /// This shows a few cases where a static number of capture groups is
    /// available and a few cases where it is not.
    ///
    /// ```
    /// use regex_syntax::parse;
    ///
    /// let len = |pattern| {
    ///     parse(pattern).map(|h| {
    ///         h.properties().static_explicit_captures_len()
    ///     })
    /// };
    ///
    /// assert_eq!(Some(0), len("a")?);
    /// assert_eq!(Some(1), len("(a)")?);
    /// assert_eq!(Some(1), len("(a)|(b)")?);
    /// assert_eq!(Some(2), len("(a)(b)|(c)(d)")?);
    /// assert_eq!(None, len("(a)|b")?);
    /// assert_eq!(None, len("a|(b)")?);
    /// assert_eq!(None, len("(b)*")?);
    /// assert_eq!(Some(1), len("(b)+")?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn static_explicit_captures_len(&self) -> Option<usize> {
        self.0.static_explicit_captures_len
    }

    /// Return true if and only if this HIR is a simple literal. This is
    /// only true when this HIR expression is either itself a `Literal` or a
    /// concatenation of only `Literal`s.
    ///
    /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()` and
    /// the empty string are not (even though they contain sub-expressions that
    /// are literals).
    #[inline]
    pub fn is_literal(&self) -> bool {
        self.0.literal
    }

    /// Return true if and only if this HIR is either a simple literal or an
    /// alternation of simple literals. This is only
    /// true when this HIR expression is either itself a `Literal` or a
    /// concatenation of only `Literal`s or an alternation of only `Literal`s.
    ///
    /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
    /// literals, but `f+`, `(foo)`, `foo()`, and the empty pattern are not
    /// (even though they contain sub-expressions that are literals).
    #[inline]
    pub fn is_alternation_literal(&self) -> bool {
        self.0.alternation_literal
    }

    /// Returns the total amount of heap memory usage, in bytes, used by this
    /// `Properties` value.
    #[inline]
    pub fn memory_usage(&self) -> usize {
        core::mem::size_of::<PropertiesI>()
    }

    /// Returns a new set of properties that corresponds to the union of the
    /// iterator of properties given.
    ///
    /// This is useful when one has multiple `Hir` expressions and wants
    /// to combine them into a single alternation without constructing the
    /// corresponding `Hir`. This routine provides a way of combining the
    /// properties of each `Hir` expression into one set of properties
    /// representing the union of those expressions.
    ///
    /// # Example: union with HIRs that never match
    ///
    /// This example shows that unioning properties together with one that
    /// represents a regex that never matches will "poison" certain attributes,
    /// like the minimum and maximum lengths.
    ///
    /// ```
    /// use regex_syntax::{hir::Properties, parse};
    ///
    /// let hir1 = parse("ab?c?")?;
    /// assert_eq!(Some(1), hir1.properties().minimum_len());
    /// assert_eq!(Some(3), hir1.properties().maximum_len());
    ///
    /// let hir2 = parse(r"[a&&b]")?;
    /// assert_eq!(None, hir2.properties().minimum_len());
    /// assert_eq!(None, hir2.properties().maximum_len());
    ///
    /// let hir3 = parse(r"wxy?z?")?;
    /// assert_eq!(Some(2), hir3.properties().minimum_len());
    /// assert_eq!(Some(4), hir3.properties().maximum_len());
    ///
    /// let unioned = Properties::union([
    ///     hir1.properties(),
    ///     hir2.properties(),
    ///     hir3.properties(),
    /// ]);
    /// assert_eq!(None, unioned.minimum_len());
    /// assert_eq!(None, unioned.maximum_len());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// The maximum length can also be "poisoned" by a pattern that has no
    /// upper bound on the length of a match. The minimum length remains
    /// unaffected:
    ///
    /// ```
    /// use regex_syntax::{hir::Properties, parse};
    ///
    /// let hir1 = parse("ab?c?")?;
    /// assert_eq!(Some(1), hir1.properties().minimum_len());
    /// assert_eq!(Some(3), hir1.properties().maximum_len());
    ///
    /// let hir2 = parse(r"a+")?;
    /// assert_eq!(Some(1), hir2.properties().minimum_len());
    /// assert_eq!(None, hir2.properties().maximum_len());
    ///
    /// let hir3 = parse(r"wxy?z?")?;
    /// assert_eq!(Some(2), hir3.properties().minimum_len());
    /// assert_eq!(Some(4), hir3.properties().maximum_len());
    ///
    /// let unioned = Properties::union([
    ///     hir1.properties(),
    ///     hir2.properties(),
    ///     hir3.properties(),
    /// ]);
    /// assert_eq!(Some(1), unioned.minimum_len());
    /// assert_eq!(None, unioned.maximum_len());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn union<I, P>(props: I) -> Properties
    where
        I: IntoIterator<Item = P>,
        P: core::borrow::Borrow<Properties>,
    {
        let mut it = props.into_iter().peekable();
        // While empty alternations aren't possible, we still behave as if they
        // are. When we have an empty alternate, then clearly the look-around
        // prefix and suffix is empty. Otherwise, it is the intersection of all
        // prefixes and suffixes (respectively) of the branches.
        let fix = if it.peek().is_none() {
            LookSet::empty()
        } else {
            LookSet::full()
        };
        // And also, an empty alternate means we have 0 static capture groups,
        // but we otherwise start with the number corresponding to the first
        // alternate. If any subsequent alternate has a different number of
        // static capture groups, then we overall have a variation and not a
        // static number of groups.
        let static_explicit_captures_len =
            it.peek().and_then(|p| p.borrow().static_explicit_captures_len());
        // The base case is an empty alternation, which matches nothing.
        // Note though that empty alternations aren't possible, because the
        // Hir::alternation smart constructor rewrites those as empty character
        // classes.
        let mut props = PropertiesI {
            minimum_len: None,
            maximum_len: None,
            look_set: LookSet::empty(),
            look_set_prefix: fix,
            look_set_suffix: fix,
            look_set_prefix_any: LookSet::empty(),
            look_set_suffix_any: LookSet::empty(),
            utf8: true,
            explicit_captures_len: 0,
            static_explicit_captures_len,
            literal: false,
            alternation_literal: true,
        };
        // Tracks whether min/max have been "poisoned" by a branch with no
        // minimum/maximum length, so a later branch can't resurrect them.
        let (mut min_poisoned, mut max_poisoned) = (false, false);
        // Handle properties that need to visit every child hir.
        for prop in it {
            let p = prop.borrow();
            props.look_set.set_union(p.look_set());
            props.look_set_prefix.set_intersect(p.look_set_prefix());
            props.look_set_suffix.set_intersect(p.look_set_suffix());
            props.look_set_prefix_any.set_union(p.look_set_prefix_any());
            props.look_set_suffix_any.set_union(p.look_set_suffix_any());
            props.utf8 = props.utf8 && p.is_utf8();
            props.explicit_captures_len = props
                .explicit_captures_len
                .saturating_add(p.explicit_captures_len());
            if props.static_explicit_captures_len
                != p.static_explicit_captures_len()
            {
                props.static_explicit_captures_len = None;
            }
            props.alternation_literal =
                props.alternation_literal && p.is_literal();
            if !min_poisoned {
                if let Some(xmin) = p.minimum_len() {
                    if props.minimum_len.map_or(true, |pmin| xmin < pmin) {
                        props.minimum_len = Some(xmin);
                    }
                } else {
                    props.minimum_len = None;
                    min_poisoned = true;
                }
            }
            if !max_poisoned {
                if let Some(xmax) = p.maximum_len() {
                    if props.maximum_len.map_or(true, |pmax| xmax > pmax) {
                        props.maximum_len = Some(xmax);
                    }
                } else {
                    props.maximum_len = None;
                    max_poisoned = true;
                }
            }
        }
        Properties(Box::new(props))
    }
}

impl Properties {
    /// Create a new set of HIR properties for an empty regex.
    fn empty() -> Properties {
        let inner = PropertiesI {
            minimum_len: Some(0),
            maximum_len: Some(0),
            look_set: LookSet::empty(),
            look_set_prefix: LookSet::empty(),
            look_set_suffix: LookSet::empty(),
            look_set_prefix_any: LookSet::empty(),
            look_set_suffix_any: LookSet::empty(),
            // It is debatable whether an empty regex always matches at valid
            // UTF-8 boundaries. Strictly speaking, at a byte oriented view,
            // it is clearly false. There are, for example, many empty strings
            // between the bytes encoding a '☃'.
            //
            // However, when Unicode mode is enabled, the fundamental atom
            // of matching is really a codepoint. And in that scenario, an
            // empty regex is defined to only match at valid UTF-8 boundaries
            // and to never split a codepoint. It just so happens that this
            // enforcement is somewhat tricky to do for regexes that match
            // the empty string inside regex engines themselves. It usually
            // requires some layer above the regex engine to filter out such
            // matches.
            //
            // In any case, 'true' is really the only coherent option. If it
            // were false, for example, then 'a*' would also need to be false
            // since it too can match the empty string.
            utf8: true,
            explicit_captures_len: 0,
            static_explicit_captures_len: Some(0),
            literal: false,
            alternation_literal: false,
        };
        Properties(Box::new(inner))
    }

    /// Create a new set of HIR properties for a literal regex.
    fn literal(lit: &Literal) -> Properties {
        let inner = PropertiesI {
            minimum_len: Some(lit.0.len()),
            maximum_len: Some(lit.0.len()),
            look_set: LookSet::empty(),
            look_set_prefix: LookSet::empty(),
            look_set_suffix: LookSet::empty(),
            look_set_prefix_any: LookSet::empty(),
            look_set_suffix_any: LookSet::empty(),
            utf8: core::str::from_utf8(&lit.0).is_ok(),
            explicit_captures_len: 0,
            static_explicit_captures_len: Some(0),
            literal: true,
            alternation_literal: true,
        };
        Properties(Box::new(inner))
    }

    /// Create a new set of HIR properties for a character class.
    fn class(class: &Class) -> Properties {
        let inner = PropertiesI {
            minimum_len: class.minimum_len(),
            maximum_len: class.maximum_len(),
            look_set: LookSet::empty(),
            look_set_prefix: LookSet::empty(),
            look_set_suffix: LookSet::empty(),
            look_set_prefix_any: LookSet::empty(),
            look_set_suffix_any: LookSet::empty(),
            utf8: class.is_utf8(),
            explicit_captures_len: 0,
            static_explicit_captures_len: Some(0),
            literal: false,
            alternation_literal: false,
        };
        Properties(Box::new(inner))
    }

    /// Create a new set of HIR properties for a look-around assertion.
    fn look(look: Look) -> Properties {
        let inner = PropertiesI {
            minimum_len: Some(0),
            maximum_len: Some(0),
            look_set: LookSet::singleton(look),
            look_set_prefix: LookSet::singleton(look),
            look_set_suffix: LookSet::singleton(look),
            look_set_prefix_any: LookSet::singleton(look),
            look_set_suffix_any: LookSet::singleton(look),
            // This requires a little explanation. Basically, we don't consider
            // matching an empty string to be equivalent to matching invalid
            // UTF-8, even though technically matching every empty string will
            // split the UTF-8 encoding of a single codepoint when treating a
            // UTF-8 encoded string as a sequence of bytes. Our defense here is
            // that in such a case, a codepoint should logically be treated as
            // the fundamental atom for matching, and thus the only valid match
            // points are between codepoints and not bytes.
            //
            // More practically, this is true here because it's also true
            // for 'Hir::empty()', otherwise something like 'a*' would be
            // considered to match invalid UTF-8. That in turn makes this
            // property borderline useless.
            utf8: true,
            explicit_captures_len: 0,
            static_explicit_captures_len: Some(0),
            literal: false,
            alternation_literal: false,
        };
        Properties(Box::new(inner))
    }

    /// Create a new set of HIR properties for a repetition.
    fn repetition(rep: &Repetition) -> Properties {
        let p = rep.sub.properties();
        let minimum_len = p.minimum_len().map(|child_min| {
            let rep_min = usize::try_from(rep.min).unwrap_or(usize::MAX);
            child_min.saturating_mul(rep_min)
        });
        let maximum_len = rep.max.and_then(|rep_max| {
            let rep_max = usize::try_from(rep_max).ok()?;
            let child_max = p.maximum_len()?;
            child_max.checked_mul(rep_max)
        });
        let mut inner = PropertiesI {
            minimum_len,
            maximum_len,
            look_set: p.look_set(),
            look_set_prefix: LookSet::empty(),
            look_set_suffix: LookSet::empty(),
            look_set_prefix_any: p.look_set_prefix_any(),
            look_set_suffix_any: p.look_set_suffix_any(),
            utf8: p.is_utf8(),
            explicit_captures_len: p.explicit_captures_len(),
            static_explicit_captures_len: p.static_explicit_captures_len(),
            literal: false,
            alternation_literal: false,
        };
        // If the repetition operator can match the empty string, then its
        // lookset prefix and suffixes themselves remain empty since they are
        // no longer required to match.
        if rep.min > 0 {
            inner.look_set_prefix = p.look_set_prefix();
            inner.look_set_suffix = p.look_set_suffix();
        }
        // If the static captures len of the sub-expression is not known or
        // is greater than zero, then it automatically propagates to the
        // repetition, regardless of the repetition. Otherwise, it might
        // change, but only when the repetition can match 0 times.
        if rep.min == 0
            && inner.static_explicit_captures_len.map_or(false, |len| len > 0)
        {
            // If we require a match 0 times, then our captures len is
            // guaranteed to be zero. Otherwise, if we *can* match the empty
            // string, then it's impossible to know how many captures will be
            // in the resulting match.
            if rep.max == Some(0) {
                inner.static_explicit_captures_len = Some(0);
            } else {
                inner.static_explicit_captures_len = None;
            }
        }
        Properties(Box::new(inner))
    }

    /// Create a new set of HIR properties for a capture.
    fn capture(capture: &Capture) -> Properties {
        let p = capture.sub.properties();
        Properties(Box::new(PropertiesI {
            explicit_captures_len: p.explicit_captures_len().saturating_add(1),
            static_explicit_captures_len: p
                .static_explicit_captures_len()
                .map(|len| len.saturating_add(1)),
            literal: false,
            alternation_literal: false,
            ..*p.0.clone()
        }))
    }

    /// Create a new set of HIR properties for a concatenation.
    fn concat(concat: &[Hir]) -> Properties {
        // The base case is an empty concatenation, which matches the empty
        // string. Note though that empty concatenations aren't possible,
        // because the Hir::concat smart constructor rewrites those as
        // Hir::empty.
        let mut props = PropertiesI {
            minimum_len: Some(0),
            maximum_len: Some(0),
            look_set: LookSet::empty(),
            look_set_prefix: LookSet::empty(),
            look_set_suffix: LookSet::empty(),
            look_set_prefix_any: LookSet::empty(),
            look_set_suffix_any: LookSet::empty(),
            utf8: true,
            explicit_captures_len: 0,
            static_explicit_captures_len: Some(0),
            literal: true,
            alternation_literal: true,
        };
        // Handle properties that need to visit every child hir.
        for x in concat.iter() {
            let p = x.properties();
            props.look_set.set_union(p.look_set());
            props.utf8 = props.utf8 && p.is_utf8();
            props.explicit_captures_len = props
                .explicit_captures_len
                .saturating_add(p.explicit_captures_len());
            props.static_explicit_captures_len = p
                .static_explicit_captures_len()
                .and_then(|len1| {
                    Some((len1, props.static_explicit_captures_len?))
                })
                .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
            props.literal = props.literal && p.is_literal();
            props.alternation_literal =
                props.alternation_literal && p.is_alternation_literal();
            if let Some(minimum_len) = props.minimum_len {
                match p.minimum_len() {
                    None => props.minimum_len = None,
                    Some(len) => {
                        // We use saturating arithmetic here because the
                        // minimum is just a lower bound. We can't go any
                        // higher than what our number types permit.
                        props.minimum_len =
                            Some(minimum_len.saturating_add(len));
                    }
                }
            }
            if let Some(maximum_len) = props.maximum_len {
                match p.maximum_len() {
                    None => props.maximum_len = None,
                    Some(len) => {
                        props.maximum_len = maximum_len.checked_add(len)
                    }
                }
            }
        }
        // Handle the prefix properties, which only requires visiting
        // child exprs until one matches more than the empty string.
        let mut it = concat.iter();
        while let Some(x) = it.next() {
            props.look_set_prefix.set_union(x.properties().look_set_prefix());
            props
                .look_set_prefix_any
                .set_union(x.properties().look_set_prefix_any());
            if x.properties().maximum_len().map_or(true, |x| x > 0) {
                break;
            }
        }
        // Same thing for the suffix properties, but in reverse.
        let mut it = concat.iter().rev();
        while let Some(x) = it.next() {
            props.look_set_suffix.set_union(x.properties().look_set_suffix());
            props
                .look_set_suffix_any
                .set_union(x.properties().look_set_suffix_any());
            if x.properties().maximum_len().map_or(true, |x| x > 0) {
                break;
            }
        }
        Properties(Box::new(props))
    }

    /// Create a new set of HIR properties for an alternation.
    fn alternation(alts: &[Hir]) -> Properties {
        Properties::union(alts.iter().map(|hir| hir.properties()))
    }
}

/// A set of look-around assertions.
///
/// This is useful for efficiently tracking look-around assertions. For
/// example, an [`Hir`] provides properties that return `LookSet`s.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
pub struct LookSet {
    /// The underlying representation this set is exposed to make it possible
    /// to store it somewhere efficiently. The representation is that
    /// of a bitset, where each assertion occupies bit `i` where `i =
    /// Look::as_repr()`.
    ///
    /// Note that users of this internal representation must permit the full
    /// range of `u16` values to be represented. For example, even if the
    /// current implementation only makes use of the 10 least significant bits,
    /// it may use more bits in a future semver compatible release.
    pub bits: u32,
}

impl LookSet {
    /// Create an empty set of look-around assertions.
    #[inline]
    pub fn empty() -> LookSet {
        LookSet { bits: 0 }
    }

    /// Create a full set of look-around assertions.
    ///
    /// This set contains all possible look-around assertions.
    #[inline]
    pub fn full() -> LookSet {
        LookSet { bits: !0 }
    }

    /// Create a look-around set containing the look-around assertion given.
    ///
    /// This is a convenience routine for creating an empty set and inserting
    /// one look-around assertion.
    #[inline]
    pub fn singleton(look: Look) -> LookSet {
        LookSet::empty().insert(look)
    }

    /// Returns the total number of look-around assertions in this set.
    #[inline]
    pub fn len(self) -> usize {
        // OK because max value always fits in a u8, which in turn always
        // fits in a usize, regardless of target.
        usize::try_from(self.bits.count_ones()).unwrap()
    }

    /// Returns true if and only if this set is empty.
    #[inline]
    pub fn is_empty(self) -> bool {
        self.len() == 0
    }

    /// Returns true if and only if the given look-around assertion is in this
    /// set.
    #[inline]
    pub fn contains(self, look: Look) -> bool {
        self.bits & look.as_repr() != 0
    }

    /// Returns true if and only if this set contains any anchor assertions.
    /// This includes both "start/end of haystack" and "start/end of line."
    #[inline]
    pub fn contains_anchor(&self) -> bool {
        self.contains_anchor_haystack() || self.contains_anchor_line()
    }

    /// Returns true if and only if this set contains any "start/end of
    /// haystack" anchors. This doesn't include "start/end of line" anchors.
    #[inline]
    pub fn contains_anchor_haystack(&self) -> bool {
        self.contains(Look::Start) || self.contains(Look::End)
    }

    /// Returns true if and only if this set contains any "start/end of line"
    /// anchors. This doesn't include "start/end of haystack" anchors. This
    /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors.
    #[inline]
    pub fn contains_anchor_line(&self) -> bool {
        self.contains(Look::StartLF)
            || self.contains(Look::EndLF)
            || self.contains(Look::StartCRLF)
            || self.contains(Look::EndCRLF)
    }

    /// Returns true if and only if this set contains any "start/end of line"
    /// anchors that only treat `\n` as line terminators. This does not include
    /// haystack anchors or CRLF aware line anchors.
    #[inline]
    pub fn contains_anchor_lf(&self) -> bool {
        self.contains(Look::StartLF) || self.contains(Look::EndLF)
    }

    /// Returns true if and only if this set contains any "start/end of line"
    /// anchors that are CRLF-aware. This doesn't include "start/end of
    /// haystack" or "start/end of line-feed" anchors.
    #[inline]
    pub fn contains_anchor_crlf(&self) -> bool {
        self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF)
    }

    /// Returns true if and only if this set contains any word boundary or
    /// negated word boundary assertions. This include both Unicode and ASCII
    /// word boundaries.
    #[inline]
    pub fn contains_word(self) -> bool {
        self.contains_word_unicode() || self.contains_word_ascii()
    }

    /// Returns true if and only if this set contains any Unicode word boundary
    /// or negated Unicode word boundary assertions.
    #[inline]
    pub fn contains_word_unicode(self) -> bool {
        self.contains(Look::WordUnicode)
            || self.contains(Look::WordUnicodeNegate)
            || self.contains(Look::WordStartUnicode)
            || self.contains(Look::WordEndUnicode)
            || self.contains(Look::WordStartHalfUnicode)
            || self.contains(Look::WordEndHalfUnicode)
    }

    /// Returns true if and only if this set contains any ASCII word boundary
    /// or negated ASCII word boundary assertions.
    #[inline]
    pub fn contains_word_ascii(self) -> bool {
        self.contains(Look::WordAscii)
            || self.contains(Look::WordAsciiNegate)
            || self.contains(Look::WordStartAscii)
            || self.contains(Look::WordEndAscii)
            || self.contains(Look::WordStartHalfAscii)
            || self.contains(Look::WordEndHalfAscii)
    }

    /// Returns an iterator over all of the look-around assertions in this set.
    #[inline]
    pub fn iter(self) -> LookSetIter {
        LookSetIter { set: self }
    }

    /// Return a new set that is equivalent to the original, but with the given
    /// assertion added to it. If the assertion is already in the set, then the
    /// returned set is equivalent to the original.
    #[inline]
    pub fn insert(self, look: Look) -> LookSet {
        LookSet { bits: self.bits | look.as_repr() }
    }

    /// Updates this set in place with the result of inserting the given
    /// assertion into this set.
    #[inline]
    pub fn set_insert(&mut self, look: Look) {
        *self = self.insert(look);
    }

    /// Return a new set that is equivalent to the original, but with the given
    /// assertion removed from it. If the assertion is not in the set, then the
    /// returned set is equivalent to the original.
    #[inline]
    pub fn remove(self, look: Look) -> LookSet {
        LookSet { bits: self.bits & !look.as_repr() }
    }

    /// Updates this set in place with the result of removing the given
    /// assertion from this set.
    #[inline]
    pub fn set_remove(&mut self, look: Look) {
        *self = self.remove(look);
    }

    /// Returns a new set that is the result of subtracting the given set from
    /// this set.
    #[inline]
    pub fn subtract(self, other: LookSet) -> LookSet {
        LookSet { bits: self.bits & !other.bits }
    }

    /// Updates this set in place with the result of subtracting the given set
    /// from this set.
    #[inline]
    pub fn set_subtract(&mut self, other: LookSet) {
        *self = self.subtract(other);
    }

    /// Returns a new set that is the union of this and the one given.
    #[inline]
    pub fn union(self, other: LookSet) -> LookSet {
        LookSet { bits: self.bits | other.bits }
    }

    /// Updates this set in place with the result of unioning it with the one
    /// given.
    #[inline]
    pub fn set_union(&mut self, other: LookSet) {
        *self = self.union(other);
    }

    /// Returns a new set that is the intersection of this and the one given.
    #[inline]
    pub fn intersect(self, other: LookSet) -> LookSet {
        LookSet { bits: self.bits & other.bits }
    }

    /// Updates this set in place with the result of intersecting it with the
    /// one given.
    #[inline]
    pub fn set_intersect(&mut self, other: LookSet) {
        *self = self.intersect(other);
    }

    /// Return a `LookSet` from the slice given as a native endian 32-bit
    /// integer.
    ///
    /// # Panics
    ///
    /// This panics if `slice.len() < 4`.
    #[inline]
    pub fn read_repr(slice: &[u8]) -> LookSet {
        let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
        LookSet { bits }
    }

    /// Write a `LookSet` as a native endian 32-bit integer to the beginning
    /// of the slice given.
    ///
    /// # Panics
    ///
    /// This panics if `slice.len() < 4`.
    #[inline]
    pub fn write_repr(self, slice: &mut [u8]) {
        let raw = self.bits.to_ne_bytes();
        slice[0] = raw[0];
        slice[1] = raw[1];
        slice[2] = raw[2];
        slice[3] = raw[3];
    }
}

impl core::fmt::Debug for LookSet {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        if self.is_empty() {
            return write!(f, "∅");
        }
        for look in self.iter() {
            write!(f, "{}", look.as_char())?;
        }
        Ok(())
    }
}

/// An iterator over all look-around assertions in a [`LookSet`].
///
/// This iterator is created by [`LookSet::iter`].
#[derive(Clone, Debug)]
pub struct LookSetIter {
    set: LookSet,
}

impl Iterator for LookSetIter {
    type Item = Look;

    #[inline]
    fn next(&mut self) -> Option<Look> {
        if self.set.is_empty() {
            return None;
        }
        // We'll never have more than u8::MAX distinct look-around assertions,
        // so 'bit' will always fit into a u16.
        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
        let look = Look::from_repr(1 << bit)?;
        self.set = self.set.remove(look);
        Some(look)
    }
}

/// Given a sequence of HIR values where each value corresponds to a Unicode
/// class (or an all-ASCII byte class), return a single Unicode class
/// corresponding to the union of the classes found.
fn class_chars(hirs: &[Hir]) -> Option<Class> {
    let mut cls = ClassUnicode::new(vec![]);
    for hir in hirs.iter() {
        match *hir.kind() {
            HirKind::Class(Class::Unicode(ref cls2)) => {
                cls.union(cls2);
            }
            HirKind::Class(Class::Bytes(ref cls2)) => {
                // A byte class only unions into a Unicode class when it is
                // all-ASCII; otherwise the conversion fails and so do we.
                cls.union(&cls2.to_unicode_class()?);
            }
            _ => return None,
        };
    }
    Some(Class::Unicode(cls))
}

/// Given a sequence of HIR values where each value corresponds to a byte class
/// (or an all-ASCII Unicode class), return a single byte class corresponding
/// to the union of the classes found.
fn class_bytes(hirs: &[Hir]) -> Option<Class> {
    let mut cls = ClassBytes::new(vec![]);
    for hir in hirs.iter() {
        match *hir.kind() {
            HirKind::Class(Class::Unicode(ref cls2)) => {
                // A Unicode class only unions into a byte class when it is
                // all-ASCII; otherwise the conversion fails and so do we.
                cls.union(&cls2.to_byte_class()?);
            }
            HirKind::Class(Class::Bytes(ref cls2)) => {
                cls.union(cls2);
            }
            _ => return None,
        };
    }
    Some(Class::Bytes(cls))
}

/// Given a sequence of HIR values where each value corresponds to a literal
/// that is a single `char`, return that sequence of `char`s. Otherwise return
/// None. No deduplication is done.
fn singleton_chars(hirs: &[Hir]) -> Option<Vec<char>> {
    let mut singletons = vec![];
    for hir in hirs.iter() {
        let literal = match *hir.kind() {
            HirKind::Literal(Literal(ref bytes)) => bytes,
            _ => return None,
        };
        let ch = match crate::debug::utf8_decode(literal) {
            None => return None,
            Some(Err(_)) => return None,
            Some(Ok(ch)) => ch,
        };
        // The literal must be exactly one codepoint long: decoding the first
        // char and then finding leftover bytes means it isn't a singleton.
        if literal.len() != ch.len_utf8() {
            return None;
        }
        singletons.push(ch);
    }
    Some(singletons)
}

/// Given a sequence of HIR values where each value corresponds to a literal
/// that is a single byte, return that sequence of bytes. Otherwise return
/// None. No deduplication is done.
fn singleton_bytes(hirs: &[Hir]) -> Option> { let mut singletons = vec![]; for hir in hirs.iter() { let literal = match *hir.kind() { HirKind::Literal(Literal(ref bytes)) => bytes, _ => return None, }; if literal.len() != 1 { return None; } singletons.push(literal[0]); } Some(singletons) } /// Looks for a common prefix in the list of alternation branches given. If one /// is found, then an equivalent but (hopefully) simplified Hir is returned. /// Otherwise, the original given list of branches is returned unmodified. /// /// This is not quite as good as it could be. Right now, it requires that /// all branches are 'Concat' expressions. It also doesn't do well with /// literals. For example, given 'foofoo|foobar', it will not refactor it to /// 'foo(?:foo|bar)' because literals are flattened into their own special /// concatenation. (One wonders if perhaps 'Literal' should be a single atom /// instead of a string of bytes because of this. Otherwise, handling the /// current representation in this routine will be pretty gnarly. Sigh.) fn lift_common_prefix(hirs: Vec) -> Result> { if hirs.len() <= 1 { return Err(hirs); } let mut prefix = match hirs[0].kind() { HirKind::Concat(ref xs) => &**xs, _ => return Err(hirs), }; if prefix.is_empty() { return Err(hirs); } for h in hirs.iter().skip(1) { let concat = match h.kind() { HirKind::Concat(ref xs) => xs, _ => return Err(hirs), }; let common_len = prefix .iter() .zip(concat.iter()) .take_while(|(x, y)| x == y) .count(); prefix = &prefix[..common_len]; if prefix.is_empty() { return Err(hirs); } } let len = prefix.len(); assert_ne!(0, len); let mut prefix_concat = vec![]; let mut suffix_alts = vec![]; for h in hirs { let mut concat = match h.into_kind() { HirKind::Concat(xs) => xs, // We required all sub-expressions to be // concats above, so we're only here if we // have a concat. 
_ => unreachable!(), }; suffix_alts.push(Hir::concat(concat.split_off(len))); if prefix_concat.is_empty() { prefix_concat = concat; } } let mut concat = prefix_concat; concat.push(Hir::alternation(suffix_alts)); Ok(Hir::concat(concat)) } #[cfg(test)] mod tests { use super::*; fn uclass(ranges: &[(char, char)]) -> ClassUnicode { let ranges: Vec = ranges .iter() .map(|&(s, e)| ClassUnicodeRange::new(s, e)) .collect(); ClassUnicode::new(ranges) } fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { let ranges: Vec = ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect(); ClassBytes::new(ranges) } fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { cls.iter().map(|x| (x.start(), x.end())).collect() } #[cfg(feature = "unicode-case")] fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { let mut cls_ = cls.clone(); cls_.case_fold_simple(); cls_ } fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { let mut cls_ = cls1.clone(); cls_.union(cls2); cls_ } fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { let mut cls_ = cls1.clone(); cls_.intersect(cls2); cls_ } fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { let mut cls_ = cls1.clone(); cls_.difference(cls2); cls_ } fn usymdifference( cls1: &ClassUnicode, cls2: &ClassUnicode, ) -> ClassUnicode { let mut cls_ = cls1.clone(); cls_.symmetric_difference(cls2); cls_ } fn unegate(cls: &ClassUnicode) -> ClassUnicode { let mut cls_ = cls.clone(); cls_.negate(); cls_ } fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { cls.iter().map(|x| (x.start(), x.end())).collect() } fn bcasefold(cls: &ClassBytes) -> ClassBytes { let mut cls_ = cls.clone(); cls_.case_fold_simple(); cls_ } fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { let mut cls_ = cls1.clone(); cls_.union(cls2); cls_ } fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { let mut cls_ = cls1.clone(); cls_.intersect(cls2); cls_ } fn bdifference(cls1: &ClassBytes, cls2: 
&ClassBytes) -> ClassBytes { let mut cls_ = cls1.clone(); cls_.difference(cls2); cls_ } fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { let mut cls_ = cls1.clone(); cls_.symmetric_difference(cls2); cls_ } fn bnegate(cls: &ClassBytes) -> ClassBytes { let mut cls_ = cls.clone(); cls_.negate(); cls_ } #[test] fn class_range_canonical_unicode() { let range = ClassUnicodeRange::new('\u{00FF}', '\0'); assert_eq!('\0', range.start()); assert_eq!('\u{00FF}', range.end()); } #[test] fn class_range_canonical_bytes() { let range = ClassBytesRange::new(b'\xFF', b'\0'); assert_eq!(b'\0', range.start()); assert_eq!(b'\xFF', range.end()); } #[test] fn class_canonicalize_unicode() { let cls = uclass(&[('a', 'c'), ('x', 'z')]); let expected = vec![('a', 'c'), ('x', 'z')]; assert_eq!(expected, uranges(&cls)); let cls = uclass(&[('x', 'z'), ('a', 'c')]); let expected = vec![('a', 'c'), ('x', 'z')]; assert_eq!(expected, uranges(&cls)); let cls = uclass(&[('x', 'z'), ('w', 'y')]); let expected = vec![('w', 'z')]; assert_eq!(expected, uranges(&cls)); let cls = uclass(&[ ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'), ('m', 'p'), ('l', 's'), ]); let expected = vec![('a', 'j'), ('l', 's')]; assert_eq!(expected, uranges(&cls)); let cls = uclass(&[('x', 'z'), ('u', 'w')]); let expected = vec![('u', 'z')]; assert_eq!(expected, uranges(&cls)); let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); let expected = vec![('\x00', '\u{10FFFF}')]; assert_eq!(expected, uranges(&cls)); let cls = uclass(&[('a', 'a'), ('b', 'b')]); let expected = vec![('a', 'b')]; assert_eq!(expected, uranges(&cls)); } #[test] fn class_canonicalize_bytes() { let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); let expected = vec![(b'a', b'c'), (b'x', b'z')]; assert_eq!(expected, branges(&cls)); let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); let expected = vec![(b'a', b'c'), (b'x', b'z')]; assert_eq!(expected, branges(&cls)); let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); let 
expected = vec![(b'w', b'z')]; assert_eq!(expected, branges(&cls)); let cls = bclass(&[ (b'c', b'f'), (b'a', b'g'), (b'd', b'j'), (b'a', b'c'), (b'm', b'p'), (b'l', b's'), ]); let expected = vec![(b'a', b'j'), (b'l', b's')]; assert_eq!(expected, branges(&cls)); let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); let expected = vec![(b'u', b'z')]; assert_eq!(expected, branges(&cls)); let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); let expected = vec![(b'\x00', b'\xFF')]; assert_eq!(expected, branges(&cls)); let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); let expected = vec![(b'a', b'b')]; assert_eq!(expected, branges(&cls)); } #[test] #[cfg(feature = "unicode-case")] fn class_case_fold_unicode() { let cls = uclass(&[ ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), ('M', 'P'), ('L', 'S'), ('c', 'f'), ]); let expected = uclass(&[ ('A', 'J'), ('L', 'S'), ('a', 'j'), ('l', 's'), ('\u{17F}', '\u{17F}'), ]); assert_eq!(expected, ucasefold(&cls)); let cls = uclass(&[('A', 'Z')]); let expected = uclass(&[ ('A', 'Z'), ('a', 'z'), ('\u{17F}', '\u{17F}'), ('\u{212A}', '\u{212A}'), ]); assert_eq!(expected, ucasefold(&cls)); let cls = uclass(&[('a', 'z')]); let expected = uclass(&[ ('A', 'Z'), ('a', 'z'), ('\u{17F}', '\u{17F}'), ('\u{212A}', '\u{212A}'), ]); assert_eq!(expected, ucasefold(&cls)); let cls = uclass(&[('A', 'A'), ('_', '_')]); let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); assert_eq!(expected, ucasefold(&cls)); let cls = uclass(&[('A', 'A'), ('=', '=')]); let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); assert_eq!(expected, ucasefold(&cls)); let cls = uclass(&[('\x00', '\x10')]); assert_eq!(cls, ucasefold(&cls)); let cls = uclass(&[('k', 'k')]); let expected = uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]); assert_eq!(expected, ucasefold(&cls)); let cls = uclass(&[('@', '@')]); assert_eq!(cls, ucasefold(&cls)); } #[test] #[cfg(not(feature = "unicode-case"))] fn class_case_fold_unicode_disabled() { let mut cls = 
uclass(&[ ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), ('M', 'P'), ('L', 'S'), ('c', 'f'), ]); assert!(cls.try_case_fold_simple().is_err()); } #[test] #[should_panic] #[cfg(not(feature = "unicode-case"))] fn class_case_fold_unicode_disabled_panics() { let mut cls = uclass(&[ ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'), ('M', 'P'), ('L', 'S'), ('c', 'f'), ]); cls.case_fold_simple(); } #[test] fn class_case_fold_bytes() { let cls = bclass(&[ (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'), (b'M', b'P'), (b'L', b'S'), (b'c', b'f'), ]); let expected = bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]); assert_eq!(expected, bcasefold(&cls)); let cls = bclass(&[(b'A', b'Z')]); let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); assert_eq!(expected, bcasefold(&cls)); let cls = bclass(&[(b'a', b'z')]); let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); assert_eq!(expected, bcasefold(&cls)); let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); assert_eq!(expected, bcasefold(&cls)); let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); assert_eq!(expected, bcasefold(&cls)); let cls = bclass(&[(b'\x00', b'\x10')]); assert_eq!(cls, bcasefold(&cls)); let cls = bclass(&[(b'k', b'k')]); let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); assert_eq!(expected, bcasefold(&cls)); let cls = bclass(&[(b'@', b'@')]); assert_eq!(cls, bcasefold(&cls)); } #[test] fn class_negate_unicode() { let cls = uclass(&[('a', 'a')]); let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('a', 'a'), ('b', 'b')]); let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('a', 'c'), ('x', 'z')]); let expected = uclass(&[ ('\x00', '\x60'), ('\x64', '\x77'), ('\x7B', '\u{10FFFF}'), ]); assert_eq!(expected, 
unegate(&cls)); let cls = uclass(&[('\x00', 'a')]); let expected = uclass(&[('\x62', '\u{10FFFF}')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('a', '\u{10FFFF}')]); let expected = uclass(&[('\x00', '\x60')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('\x00', '\u{10FFFF}')]); let expected = uclass(&[]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[]); let expected = uclass(&[('\x00', '\u{10FFFF}')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]); let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('\x00', '\u{D7FF}')]); let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('\x00', '\u{D7FE}')]); let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); let expected = uclass(&[('\x00', '\u{D7FF}')]); assert_eq!(expected, unegate(&cls)); let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); let expected = uclass(&[('\x00', '\u{E000}')]); assert_eq!(expected, unegate(&cls)); } #[test] fn class_negate_bytes() { let cls = bclass(&[(b'a', b'a')]); let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); assert_eq!(expected, bnegate(&cls)); let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); assert_eq!(expected, bnegate(&cls)); let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); let expected = bclass(&[ (b'\x00', b'\x60'), (b'\x64', b'\x77'), (b'\x7B', b'\xFF'), ]); assert_eq!(expected, bnegate(&cls)); let cls = bclass(&[(b'\x00', b'a')]); let expected = bclass(&[(b'\x62', b'\xFF')]); assert_eq!(expected, bnegate(&cls)); let cls = bclass(&[(b'a', b'\xFF')]); let expected = bclass(&[(b'\x00', b'\x60')]); assert_eq!(expected, bnegate(&cls)); let cls = bclass(&[(b'\x00', b'\xFF')]); let expected = bclass(&[]); 
assert_eq!(expected, bnegate(&cls)); let cls = bclass(&[]); let expected = bclass(&[(b'\x00', b'\xFF')]); assert_eq!(expected, bnegate(&cls)); let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); let expected = bclass(&[(b'\xFE', b'\xFE')]); assert_eq!(expected, bnegate(&cls)); } #[test] fn class_union_unicode() { let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); let cls2 = uclass(&[('a', 'z')]); let expected = uclass(&[('a', 'z'), ('A', 'C')]); assert_eq!(expected, uunion(&cls1, &cls2)); } #[test] fn class_union_bytes() { let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); let cls2 = bclass(&[(b'a', b'z')]); let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); assert_eq!(expected, bunion(&cls1, &cls2)); } #[test] fn class_intersect_unicode() { let cls1 = uclass(&[]); let cls2 = uclass(&[('a', 'a')]); let expected = uclass(&[]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'a')]); let cls2 = uclass(&[('a', 'a')]); let expected = uclass(&[('a', 'a')]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'a')]); let cls2 = uclass(&[('b', 'b')]); let expected = uclass(&[]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'a')]); let cls2 = uclass(&[('a', 'c')]); let expected = uclass(&[('a', 'a')]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'b')]); let cls2 = uclass(&[('a', 'c')]); let expected = uclass(&[('a', 'b')]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'b')]); let cls2 = uclass(&[('b', 'c')]); let expected = uclass(&[('b', 'b')]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'b')]); let cls2 = uclass(&[('c', 'd')]); let expected = uclass(&[]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('b', 'c')]); let cls2 = uclass(&[('a', 'd')]); let expected = uclass(&[('b', 'c')]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = 
uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); let cls2 = uclass(&[('a', 'h')]); let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); let expected = uclass(&[]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); let cls2 = uclass(&[('h', 'h')]); let expected = uclass(&[('h', 'h')]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); let expected = uclass(&[]); assert_eq!(expected, uintersect(&cls1, &cls2)); let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); let expected = uclass(&[('b', 'f')]); assert_eq!(expected, uintersect(&cls1, &cls2)); } #[test] fn class_intersect_bytes() { let cls1 = bclass(&[]); let cls2 = bclass(&[(b'a', b'a')]); let expected = bclass(&[]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'a')]); let cls2 = bclass(&[(b'a', b'a')]); let expected = bclass(&[(b'a', b'a')]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'a')]); let cls2 = bclass(&[(b'b', b'b')]); let expected = bclass(&[]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'a')]); let cls2 = bclass(&[(b'a', b'c')]); let expected = bclass(&[(b'a', b'a')]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b')]); let cls2 = bclass(&[(b'a', b'c')]); let expected = bclass(&[(b'a', b'b')]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b')]); let 
cls2 = bclass(&[(b'b', b'c')]); let expected = bclass(&[(b'b', b'b')]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b')]); let cls2 = bclass(&[(b'c', b'd')]); let expected = bclass(&[]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'b', b'c')]); let cls2 = bclass(&[(b'a', b'd')]); let expected = bclass(&[(b'b', b'c')]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); let cls2 = bclass(&[(b'a', b'h')]); let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); let expected = bclass(&[]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); let cls2 = bclass(&[(b'h', b'h')]); let expected = bclass(&[(b'h', b'h')]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); let expected = bclass(&[]); assert_eq!(expected, bintersect(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); let expected = bclass(&[(b'b', b'f')]); assert_eq!(expected, bintersect(&cls1, &cls2)); } #[test] fn class_difference_unicode() { let cls1 = uclass(&[('a', 'a')]); let cls2 = uclass(&[('a', 'a')]); let expected = uclass(&[]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'a')]); let cls2 = uclass(&[]); let expected = uclass(&[('a', 'a')]); assert_eq!(expected, udifference(&cls1, 
&cls2)); let cls1 = uclass(&[]); let cls2 = uclass(&[('a', 'a')]); let expected = uclass(&[]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'z')]); let cls2 = uclass(&[('a', 'a')]); let expected = uclass(&[('b', 'z')]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'z')]); let cls2 = uclass(&[('z', 'z')]); let expected = uclass(&[('a', 'y')]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'z')]); let cls2 = uclass(&[('m', 'm')]); let expected = uclass(&[('a', 'l'), ('n', 'z')]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); let cls2 = uclass(&[('a', 'z')]); let expected = uclass(&[]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); let cls2 = uclass(&[('d', 'v')]); let expected = uclass(&[('a', 'c')]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('x', 'z')]); let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); let expected = uclass(&[('x', 'z')]); assert_eq!(expected, udifference(&cls1, &cls2)); let cls1 = uclass(&[('a', 'z')]); let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); assert_eq!(expected, udifference(&cls1, &cls2)); } #[test] fn class_difference_bytes() { let cls1 = bclass(&[(b'a', b'a')]); let cls2 = bclass(&[(b'a', b'a')]); let expected = bclass(&[]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = 
bclass(&[(b'a', b'a')]); let cls2 = bclass(&[]); let expected = bclass(&[(b'a', b'a')]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[]); let cls2 = bclass(&[(b'a', b'a')]); let expected = bclass(&[]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'z')]); let cls2 = bclass(&[(b'a', b'a')]); let expected = bclass(&[(b'b', b'z')]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'z')]); let cls2 = bclass(&[(b'z', b'z')]); let expected = bclass(&[(b'a', b'y')]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'z')]); let cls2 = bclass(&[(b'm', b'm')]); let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); let cls2 = bclass(&[(b'a', b'z')]); let expected = bclass(&[]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); let cls2 = bclass(&[(b'd', b'v')]); let expected = bclass(&[(b'a', b'c')]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'x', b'z')]); let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); let expected = bclass(&[(b'x', b'z')]); assert_eq!(expected, bdifference(&cls1, &cls2)); let cls1 = bclass(&[(b'a', b'z')]); let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); assert_eq!(expected, 
bdifference(&cls1, &cls2)); } #[test] fn class_symmetric_difference_unicode() { let cls1 = uclass(&[('a', 'm')]); let cls2 = uclass(&[('g', 't')]); let expected = uclass(&[('a', 'f'), ('n', 't')]); assert_eq!(expected, usymdifference(&cls1, &cls2)); } #[test] fn class_symmetric_difference_bytes() { let cls1 = bclass(&[(b'a', b'm')]); let cls2 = bclass(&[(b'g', b't')]); let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); assert_eq!(expected, bsymdifference(&cls1, &cls2)); } // We use a thread with an explicit stack size to test that our destructor // for Hir can handle arbitrarily sized expressions in constant stack // space. In case we run on a platform without threads (WASM?), we limit // this test to Windows/Unix. #[test] #[cfg(any(unix, windows))] fn no_stack_overflow_on_drop() { use std::thread; let run = || { let mut expr = Hir::empty(); for _ in 0..100 { expr = Hir::capture(Capture { index: 1, name: None, sub: Box::new(expr), }); expr = Hir::repetition(Repetition { min: 0, max: Some(1), greedy: true, sub: Box::new(expr), }); expr = Hir { kind: HirKind::Concat(vec![expr]), props: Properties::empty(), }; expr = Hir { kind: HirKind::Alternation(vec![expr]), props: Properties::empty(), }; } assert!(!matches!(*expr.kind(), HirKind::Empty)); }; // We run our test on a thread with a small stack size so we can // force the issue more easily. // // NOTE(2023-03-21): See the corresponding test in 'crate::ast::tests' // for context on the specific stack size chosen here. 
thread::Builder::new() .stack_size(16 << 10) .spawn(run) .unwrap() .join() .unwrap(); } #[test] fn look_set_iter() { let set = LookSet::empty(); assert_eq!(0, set.iter().count()); let set = LookSet::full(); assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); assert_eq!(2, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF); assert_eq!(1, set.iter().count()); let set = LookSet::empty().insert(Look::WordAsciiNegate); assert_eq!(1, set.iter().count()); } #[test] fn look_set_debug() { let res = format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = format!("{:?}", LookSet::full()); assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } regex-syntax-0.8.2/src/hir/print.rs000064400000000000000000000526311046102023000153530ustar 00000000000000/*! This module provides a regular expression printer for `Hir`. */ use core::fmt; use crate::{ hir::{ self, visitor::{self, Visitor}, Hir, HirKind, }, is_meta_character, }; /// A builder for constructing a printer. /// /// Note that since a printer doesn't have any configuration knobs, this type /// remains unexported. #[derive(Clone, Debug)] struct PrinterBuilder { _priv: (), } impl Default for PrinterBuilder { fn default() -> PrinterBuilder { PrinterBuilder::new() } } impl PrinterBuilder { fn new() -> PrinterBuilder { PrinterBuilder { _priv: () } } fn build(&self) -> Printer { Printer { _priv: () } } } /// A printer for a regular expression's high-level intermediate /// representation. /// /// A printer converts a high-level intermediate representation (HIR) to a /// regular expression pattern string. This particular printer uses constant /// stack space and heap space proportional to the size of the HIR. /// /// Since this printer is only using the HIR, the pattern it prints will likely /// not resemble the original pattern at all. For example, a pattern like /// `\pL` will have its entire class written out. 
/// /// The purpose of this printer is to provide a means to mutate an HIR and then /// build a regular expression from the result of that mutation. (A regex /// library could provide a constructor from this HIR explicitly, but that /// creates an unnecessary public coupling between the regex library and this /// specific HIR representation.) #[derive(Debug)] pub struct Printer { _priv: (), } impl Printer { /// Create a new printer. pub fn new() -> Printer { PrinterBuilder::new().build() } /// Print the given `Ast` to the given writer. The writer must implement /// `fmt::Write`. Typical implementations of `fmt::Write` that can be used /// here are a `fmt::Formatter` (which is available in `fmt::Display` /// implementations) or a `&mut String`. pub fn print(&mut self, hir: &Hir, wtr: W) -> fmt::Result { visitor::visit(hir, Writer { wtr }) } } #[derive(Debug)] struct Writer { wtr: W, } impl Visitor for Writer { type Output = (); type Err = fmt::Error; fn finish(self) -> fmt::Result { Ok(()) } fn visit_pre(&mut self, hir: &Hir) -> fmt::Result { match *hir.kind() { HirKind::Empty => { // Technically an empty sub-expression could be "printed" by // just ignoring it, but in practice, you could have a // repetition operator attached to an empty expression, and you // really need something in the concrete syntax to make that // work as you'd expect. self.wtr.write_str(r"(?:)")?; } // Repetition operators are strictly suffix oriented. HirKind::Repetition(_) => {} HirKind::Literal(hir::Literal(ref bytes)) => { // See the comment on the 'Concat' and 'Alternation' case below // for why we put parens here. Literals are, conceptually, // a special case of concatenation where each element is a // character. The HIR flattens this into a Box<[u8]>, but we // still need to treat it like a concatenation for correct // printing. As a special case, we don't write parens if there // is only one character. One character means there is no // concat so we don't need parens. 
Adding parens would still be // correct, but we drop them here because it tends to create // rather noisy regexes even in simple cases. let result = core::str::from_utf8(bytes); let len = result.map_or(bytes.len(), |s| s.chars().count()); if len > 1 { self.wtr.write_str(r"(?:")?; } match result { Ok(string) => { for c in string.chars() { self.write_literal_char(c)?; } } Err(_) => { for &b in bytes.iter() { self.write_literal_byte(b)?; } } } if len > 1 { self.wtr.write_str(r")")?; } } HirKind::Class(hir::Class::Unicode(ref cls)) => { if cls.ranges().is_empty() { return self.wtr.write_str("[a&&b]"); } self.wtr.write_str("[")?; for range in cls.iter() { if range.start() == range.end() { self.write_literal_char(range.start())?; } else if u32::from(range.start()) + 1 == u32::from(range.end()) { self.write_literal_char(range.start())?; self.write_literal_char(range.end())?; } else { self.write_literal_char(range.start())?; self.wtr.write_str("-")?; self.write_literal_char(range.end())?; } } self.wtr.write_str("]")?; } HirKind::Class(hir::Class::Bytes(ref cls)) => { if cls.ranges().is_empty() { return self.wtr.write_str("[a&&b]"); } self.wtr.write_str("(?-u:[")?; for range in cls.iter() { if range.start() == range.end() { self.write_literal_class_byte(range.start())?; } else if range.start() + 1 == range.end() { self.write_literal_class_byte(range.start())?; self.write_literal_class_byte(range.end())?; } else { self.write_literal_class_byte(range.start())?; self.wtr.write_str("-")?; self.write_literal_class_byte(range.end())?; } } self.wtr.write_str("])")?; } HirKind::Look(ref look) => match *look { hir::Look::Start => { self.wtr.write_str(r"\A")?; } hir::Look::End => { self.wtr.write_str(r"\z")?; } hir::Look::StartLF => { self.wtr.write_str("(?m:^)")?; } hir::Look::EndLF => { self.wtr.write_str("(?m:$)")?; } hir::Look::StartCRLF => { self.wtr.write_str("(?mR:^)")?; } hir::Look::EndCRLF => { self.wtr.write_str("(?mR:$)")?; } hir::Look::WordAscii => { 
self.wtr.write_str(r"(?-u:\b)")?; } hir::Look::WordAsciiNegate => { self.wtr.write_str(r"(?-u:\B)")?; } hir::Look::WordUnicode => { self.wtr.write_str(r"\b")?; } hir::Look::WordUnicodeNegate => { self.wtr.write_str(r"\B")?; } hir::Look::WordStartAscii => { self.wtr.write_str(r"(?-u:\b{start})")?; } hir::Look::WordEndAscii => { self.wtr.write_str(r"(?-u:\b{end})")?; } hir::Look::WordStartUnicode => { self.wtr.write_str(r"\b{start}")?; } hir::Look::WordEndUnicode => { self.wtr.write_str(r"\b{end}")?; } hir::Look::WordStartHalfAscii => { self.wtr.write_str(r"(?-u:\b{start-half})")?; } hir::Look::WordEndHalfAscii => { self.wtr.write_str(r"(?-u:\b{end-half})")?; } hir::Look::WordStartHalfUnicode => { self.wtr.write_str(r"\b{start-half}")?; } hir::Look::WordEndHalfUnicode => { self.wtr.write_str(r"\b{end-half}")?; } }, HirKind::Capture(hir::Capture { ref name, .. }) => { self.wtr.write_str("(")?; if let Some(ref name) = *name { write!(self.wtr, "?P<{}>", name)?; } } // Why do this? Wrapping concats and alts in non-capturing groups // is not *always* necessary, but is sometimes necessary. For // example, 'concat(a, alt(b, c))' should be written as 'a(?:b|c)' // and not 'ab|c'. The former is clearly the intended meaning, but // the latter is actually 'alt(concat(a, b), c)'. // // It would be possible to only group these things in cases where // it's strictly necessary, but it requires knowing the parent // expression. And since this technique is simpler and always // correct, we take this route. More to the point, it is a non-goal // of an HIR printer to show a nice easy-to-read regex. Indeed, // its construction forbids it from doing so. Therefore, inserting // extra groups where they aren't necessary is perfectly okay. 
HirKind::Concat(_) | HirKind::Alternation(_) => { self.wtr.write_str(r"(?:")?; } } Ok(()) } fn visit_post(&mut self, hir: &Hir) -> fmt::Result { match *hir.kind() { // Handled during visit_pre HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) => {} HirKind::Repetition(ref x) => { match (x.min, x.max) { (0, Some(1)) => { self.wtr.write_str("?")?; } (0, None) => { self.wtr.write_str("*")?; } (1, None) => { self.wtr.write_str("+")?; } (1, Some(1)) => { // 'a{1}' and 'a{1}?' are exactly equivalent to 'a'. return Ok(()); } (m, None) => { write!(self.wtr, "{{{},}}", m)?; } (m, Some(n)) if m == n => { write!(self.wtr, "{{{}}}", m)?; // a{m} and a{m}? are always exactly equivalent. return Ok(()); } (m, Some(n)) => { write!(self.wtr, "{{{},{}}}", m, n)?; } } if !x.greedy { self.wtr.write_str("?")?; } } HirKind::Capture(_) | HirKind::Concat(_) | HirKind::Alternation(_) => { self.wtr.write_str(r")")?; } } Ok(()) } fn visit_alternation_in(&mut self) -> fmt::Result { self.wtr.write_str("|") } } impl Writer { fn write_literal_char(&mut self, c: char) -> fmt::Result { if is_meta_character(c) { self.wtr.write_str("\\")?; } self.wtr.write_char(c) } fn write_literal_byte(&mut self, b: u8) -> fmt::Result { if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "(?-u:\\x{:02X})", b) } } fn write_literal_class_byte(&mut self, b: u8) -> fmt::Result { if b <= 0x7F && !b.is_ascii_control() && !b.is_ascii_whitespace() { self.write_literal_char(char::try_from(b).unwrap()) } else { write!(self.wtr, "\\x{:02X}", b) } } } #[cfg(test)] mod tests { use alloc::{ boxed::Box, string::{String, ToString}, }; use crate::ParserBuilder; use super::*; fn roundtrip(given: &str, expected: &str) { roundtrip_with(|b| b, given, expected); } fn roundtrip_bytes(given: &str, expected: &str) { roundtrip_with(|b| b.utf8(false), given, expected); } fn roundtrip_with(mut f: F, given: &str, expected: 
&str) where F: FnMut(&mut ParserBuilder) -> &mut ParserBuilder, { let mut builder = ParserBuilder::new(); f(&mut builder); let hir = builder.build().parse(given).unwrap(); let mut printer = Printer::new(); let mut dst = String::new(); printer.print(&hir, &mut dst).unwrap(); // Check that the result is actually valid. builder.build().parse(&dst).unwrap(); assert_eq!(expected, dst); } #[test] fn print_literal() { roundtrip("a", "a"); roundtrip(r"\xff", "\u{FF}"); roundtrip_bytes(r"\xff", "\u{FF}"); roundtrip_bytes(r"(?-u)\xff", r"(?-u:\xFF)"); roundtrip("☃", "☃"); } #[test] fn print_class() { roundtrip(r"[a]", r"a"); roundtrip(r"[ab]", r"[ab]"); roundtrip(r"[a-z]", r"[a-z]"); roundtrip(r"[a-z--b-c--x-y]", r"[ad-wz]"); roundtrip(r"[^\x01-\u{10FFFF}]", "\u{0}"); roundtrip(r"[-]", r"\-"); roundtrip(r"[☃-⛄]", r"[☃-⛄]"); roundtrip(r"(?-u)[a]", r"a"); roundtrip(r"(?-u)[ab]", r"(?-u:[ab])"); roundtrip(r"(?-u)[a-z]", r"(?-u:[a-z])"); roundtrip_bytes(r"(?-u)[a-\xFF]", r"(?-u:[a-\xFF])"); // The following test that the printer escapes meta characters // in character classes. roundtrip(r"[\[]", r"\["); roundtrip(r"[Z-_]", r"[Z-_]"); roundtrip(r"[Z-_--Z]", r"[\[-_]"); // The following test that the printer escapes meta characters // in byte oriented character classes. roundtrip_bytes(r"(?-u)[\[]", r"\["); roundtrip_bytes(r"(?-u)[Z-_]", r"(?-u:[Z-_])"); roundtrip_bytes(r"(?-u)[Z-_--Z]", r"(?-u:[\[-_])"); // This tests that an empty character class is correctly roundtripped. 
#[cfg(feature = "unicode-gencat")] roundtrip(r"\P{any}", r"[a&&b]"); roundtrip_bytes(r"(?-u)[^\x00-\xFF]", r"[a&&b]"); } #[test] fn print_anchor() { roundtrip(r"^", r"\A"); roundtrip(r"$", r"\z"); roundtrip(r"(?m)^", r"(?m:^)"); roundtrip(r"(?m)$", r"(?m:$)"); } #[test] fn print_word_boundary() { roundtrip(r"\b", r"\b"); roundtrip(r"\B", r"\B"); roundtrip(r"(?-u)\b", r"(?-u:\b)"); roundtrip_bytes(r"(?-u)\B", r"(?-u:\B)"); } #[test] fn print_repetition() { roundtrip("a?", "a?"); roundtrip("a??", "a??"); roundtrip("(?U)a?", "a??"); roundtrip("a*", "a*"); roundtrip("a*?", "a*?"); roundtrip("(?U)a*", "a*?"); roundtrip("a+", "a+"); roundtrip("a+?", "a+?"); roundtrip("(?U)a+", "a+?"); roundtrip("a{1}", "a"); roundtrip("a{2}", "a{2}"); roundtrip("a{1,}", "a+"); roundtrip("a{1,5}", "a{1,5}"); roundtrip("a{1}?", "a"); roundtrip("a{2}?", "a{2}"); roundtrip("a{1,}?", "a+?"); roundtrip("a{1,5}?", "a{1,5}?"); roundtrip("(?U)a{1}", "a"); roundtrip("(?U)a{2}", "a{2}"); roundtrip("(?U)a{1,}", "a+?"); roundtrip("(?U)a{1,5}", "a{1,5}?"); // Test that various zero-length repetitions always translate to an // empty regex. This is more a property of HIR's smart constructors // than the printer though. 
roundtrip("a{0}", "(?:)"); roundtrip("(?:ab){0}", "(?:)"); #[cfg(feature = "unicode-gencat")] { roundtrip(r"\p{any}{0}", "(?:)"); roundtrip(r"\P{any}{0}", "(?:)"); } } #[test] fn print_group() { roundtrip("()", "((?:))"); roundtrip("(?P)", "(?P(?:))"); roundtrip("(?:)", "(?:)"); roundtrip("(a)", "(a)"); roundtrip("(?Pa)", "(?Pa)"); roundtrip("(?:a)", "a"); roundtrip("((((a))))", "((((a))))"); } #[test] fn print_alternation() { roundtrip("|", "(?:(?:)|(?:))"); roundtrip("||", "(?:(?:)|(?:)|(?:))"); roundtrip("a|b", "[ab]"); roundtrip("ab|cd", "(?:(?:ab)|(?:cd))"); roundtrip("a|b|c", "[a-c]"); roundtrip("ab|cd|ef", "(?:(?:ab)|(?:cd)|(?:ef))"); roundtrip("foo|bar|quux", "(?:(?:foo)|(?:bar)|(?:quux))"); } // This is a regression test that stresses a peculiarity of how the HIR // is both constructed and printed. Namely, it is legal for a repetition // to directly contain a concatenation. This particular construct isn't // really possible to build from the concrete syntax directly, since you'd // be forced to put the concatenation into (at least) a non-capturing // group. Concurrently, the printer doesn't consider this case and just // kind of naively prints the child expression and tacks on the repetition // operator. // // As a result, if you attached '+' to a 'concat(a, b)', the printer gives // you 'ab+', but clearly it really should be '(?:ab)+'. // // This bug isn't easy to surface because most ways of building an HIR // come directly from the concrete syntax, and as mentioned above, it just // isn't possible to build this kind of HIR from the concrete syntax. // Nevertheless, this is definitely a bug. 
// // See: https://github.com/rust-lang/regex/issues/731 #[test] fn regression_repetition_concat() { let expr = Hir::concat(alloc::vec![ Hir::literal("x".as_bytes()), Hir::repetition(hir::Repetition { min: 1, max: None, greedy: true, sub: Box::new(Hir::literal("ab".as_bytes())), }), Hir::literal("y".as_bytes()), ]); assert_eq!(r"(?:x(?:ab)+y)", expr.to_string()); let expr = Hir::concat(alloc::vec![ Hir::look(hir::Look::Start), Hir::repetition(hir::Repetition { min: 1, max: None, greedy: true, sub: Box::new(Hir::concat(alloc::vec![ Hir::look(hir::Look::Start), Hir::look(hir::Look::End), ])), }), Hir::look(hir::Look::End), ]); assert_eq!(r"(?:\A\A\z\z)", expr.to_string()); } // Just like regression_repetition_concat, but with the repetition using // an alternation as a child expression instead. // // See: https://github.com/rust-lang/regex/issues/731 #[test] fn regression_repetition_alternation() { let expr = Hir::concat(alloc::vec![ Hir::literal("ab".as_bytes()), Hir::repetition(hir::Repetition { min: 1, max: None, greedy: true, sub: Box::new(Hir::alternation(alloc::vec![ Hir::literal("cd".as_bytes()), Hir::literal("ef".as_bytes()), ])), }), Hir::literal("gh".as_bytes()), ]); assert_eq!(r"(?:(?:ab)(?:(?:cd)|(?:ef))+(?:gh))", expr.to_string()); let expr = Hir::concat(alloc::vec![ Hir::look(hir::Look::Start), Hir::repetition(hir::Repetition { min: 1, max: None, greedy: true, sub: Box::new(Hir::alternation(alloc::vec![ Hir::look(hir::Look::Start), Hir::look(hir::Look::End), ])), }), Hir::look(hir::Look::End), ]); assert_eq!(r"(?:\A(?:\A|\z)\z)", expr.to_string()); } // This regression test is very similar in flavor to // regression_repetition_concat in that the root of the issue lies in a // peculiarity of how the HIR is represented and how the printer writes it // out. Like the other regression, this one is also rooted in the fact that // you can't produce the peculiar HIR from the concrete syntax. 
Namely, you // just can't have a 'concat(a, alt(b, c))' because the 'alt' will normally // be in (at least) a non-capturing group. Why? Because the '|' has very // low precedence (lower that concatenation), and so something like 'ab|c' // is actually 'alt(ab, c)'. // // See: https://github.com/rust-lang/regex/issues/516 #[test] fn regression_alternation_concat() { let expr = Hir::concat(alloc::vec![ Hir::literal("ab".as_bytes()), Hir::alternation(alloc::vec![ Hir::literal("mn".as_bytes()), Hir::literal("xy".as_bytes()), ]), ]); assert_eq!(r"(?:(?:ab)(?:(?:mn)|(?:xy)))", expr.to_string()); let expr = Hir::concat(alloc::vec![ Hir::look(hir::Look::Start), Hir::alternation(alloc::vec![ Hir::look(hir::Look::Start), Hir::look(hir::Look::End), ]), ]); assert_eq!(r"(?:\A(?:\A|\z))", expr.to_string()); } } regex-syntax-0.8.2/src/hir/translate.rs000064400000000000000000003713151046102023000162170ustar 00000000000000/*! Defines a translator that converts an `Ast` to an `Hir`. */ use core::cell::{Cell, RefCell}; use alloc::{boxed::Box, string::ToString, vec, vec::Vec}; use crate::{ ast::{self, Ast, Span, Visitor}, either::Either, hir::{self, Error, ErrorKind, Hir, HirKind}, unicode::{self, ClassQuery}, }; type Result = core::result::Result; /// A builder for constructing an AST->HIR translator. #[derive(Clone, Debug)] pub struct TranslatorBuilder { utf8: bool, line_terminator: u8, flags: Flags, } impl Default for TranslatorBuilder { fn default() -> TranslatorBuilder { TranslatorBuilder::new() } } impl TranslatorBuilder { /// Create a new translator builder with a default c onfiguration. pub fn new() -> TranslatorBuilder { TranslatorBuilder { utf8: true, line_terminator: b'\n', flags: Flags::default(), } } /// Build a translator using the current configuration. 
pub fn build(&self) -> Translator { Translator { stack: RefCell::new(vec![]), flags: Cell::new(self.flags), utf8: self.utf8, line_terminator: self.line_terminator, } } /// When disabled, translation will permit the construction of a regular /// expression that may match invalid UTF-8. /// /// When enabled (the default), the translator is guaranteed to produce an /// expression that, for non-empty matches, will only ever produce spans /// that are entirely valid UTF-8 (otherwise, the translator will return an /// error). /// /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete /// syntax) will be allowed even though they can produce matches that split /// a UTF-8 encoded codepoint. This only applies to zero-width or "empty" /// matches, and it is expected that the regex engine itself must handle /// these cases if necessary (perhaps by suppressing any zero-width matches /// that split a codepoint). pub fn utf8(&mut self, yes: bool) -> &mut TranslatorBuilder { self.utf8 = yes; self } /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. /// /// Namely, instead of `.` (by default) matching everything except for `\n`, /// this will cause `.` to match everything except for the byte given. /// /// If `.` is used in a context where Unicode mode is enabled and this byte /// isn't ASCII, then an error will be returned. When Unicode mode is /// disabled, then any byte is permitted, but will return an error if UTF-8 /// mode is enabled and it is a non-ASCII byte. /// /// In short, any ASCII value for a line terminator is always okay. But a /// non-ASCII byte might result in an error depending on whether Unicode /// mode or UTF-8 mode are enabled. /// /// Note that if `R` mode is enabled then it always takes precedence and /// the line terminator will be treated as `\r` and `\n` simultaneously. 
/// /// Note also that this *doesn't* impact the look-around assertions /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional /// configuration in the regex engine itself. pub fn line_terminator(&mut self, byte: u8) -> &mut TranslatorBuilder { self.line_terminator = byte; self } /// Enable or disable the case insensitive flag (`i`) by default. pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.case_insensitive = if yes { Some(true) } else { None }; self } /// Enable or disable the multi-line matching flag (`m`) by default. pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.multi_line = if yes { Some(true) } else { None }; self } /// Enable or disable the "dot matches any character" flag (`s`) by /// default. pub fn dot_matches_new_line( &mut self, yes: bool, ) -> &mut TranslatorBuilder { self.flags.dot_matches_new_line = if yes { Some(true) } else { None }; self } /// Enable or disable the CRLF mode flag (`R`) by default. pub fn crlf(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.crlf = if yes { Some(true) } else { None }; self } /// Enable or disable the "swap greed" flag (`U`) by default. pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.swap_greed = if yes { Some(true) } else { None }; self } /// Enable or disable the Unicode flag (`u`) by default. pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder { self.flags.unicode = if yes { None } else { Some(false) }; self } } /// A translator maps abstract syntax to a high level intermediate /// representation. /// /// A translator may be benefit from reuse. That is, a translator can translate /// many abstract syntax trees. /// /// A `Translator` can be configured in more detail via a /// [`TranslatorBuilder`]. #[derive(Clone, Debug)] pub struct Translator { /// Our call stack, but on the heap. stack: RefCell>, /// The current flag settings. 
flags: Cell, /// Whether we're allowed to produce HIR that can match arbitrary bytes. utf8: bool, /// The line terminator to use for `.`. line_terminator: u8, } impl Translator { /// Create a new translator using the default configuration. pub fn new() -> Translator { TranslatorBuilder::new().build() } /// Translate the given abstract syntax tree (AST) into a high level /// intermediate representation (HIR). /// /// If there was a problem doing the translation, then an HIR-specific /// error is returned. /// /// The original pattern string used to produce the `Ast` *must* also be /// provided. The translator does not use the pattern string during any /// correct translation, but is used for error reporting. pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result { ast::visit(ast, TranslatorI::new(self, pattern)) } } /// An HirFrame is a single stack frame, represented explicitly, which is /// created for each item in the Ast that we traverse. /// /// Note that technically, this type doesn't represent our entire stack /// frame. In particular, the Ast visitor represents any state associated with /// traversing the Ast itself. #[derive(Clone, Debug)] enum HirFrame { /// An arbitrary HIR expression. These get pushed whenever we hit a base /// case in the Ast. They get popped after an inductive (i.e., recursive) /// step is complete. Expr(Hir), /// A literal that is being constructed, character by character, from the /// AST. We need this because the AST gives each individual character its /// own node. So as we see characters, we peek at the top-most HirFrame. /// If it's a literal, then we add to it. Otherwise, we push a new literal. /// When it comes time to pop it, we convert it to an Hir via Hir::literal. Literal(Vec), /// A Unicode character class. This frame is mutated as we descend into /// the Ast of a character class (which is itself its own mini recursive /// structure). ClassUnicode(hir::ClassUnicode), /// A byte-oriented character class. 
This frame is mutated as we descend /// into the Ast of a character class (which is itself its own mini /// recursive structure). /// /// Byte character classes are created when Unicode mode (`u`) is disabled. /// If `utf8` is enabled (the default), then a byte character is only /// permitted to match ASCII text. ClassBytes(hir::ClassBytes), /// This is pushed whenever a repetition is observed. After visiting every /// sub-expression in the repetition, the translator's stack is expected to /// have this sentinel at the top. /// /// This sentinel only exists to stop other things (like flattening /// literals) from reaching across repetition operators. Repetition, /// This is pushed on to the stack upon first seeing any kind of capture, /// indicated by parentheses (including non-capturing groups). It is popped /// upon leaving a group. Group { /// The old active flags when this group was opened. /// /// If this group sets flags, then the new active flags are set to the /// result of merging the old flags with the flags introduced by this /// group. If the group doesn't set any flags, then this is simply /// equivalent to whatever flags were set when the group was opened. /// /// When this group is popped, the active flags should be restored to /// the flags set here. /// /// The "active" flags correspond to whatever flags are set in the /// Translator. old_flags: Flags, }, /// This is pushed whenever a concatenation is observed. After visiting /// every sub-expression in the concatenation, the translator's stack is /// popped until it sees a Concat frame. Concat, /// This is pushed whenever an alternation is observed. After visiting /// every sub-expression in the alternation, the translator's stack is /// popped until it sees an Alternation frame. Alternation, /// This is pushed immediately before each sub-expression in an /// alternation. 
This separates the branches of an alternation on the /// stack and prevents literal flattening from reaching across alternation /// branches. /// /// It is popped after each expression in a branch until an 'Alternation' /// frame is observed when doing a post visit on an alternation. AlternationBranch, } impl HirFrame { /// Assert that the current stack frame is an Hir expression and return it. fn unwrap_expr(self) -> Hir { match self { HirFrame::Expr(expr) => expr, HirFrame::Literal(lit) => Hir::literal(lit), _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self), } } /// Assert that the current stack frame is a Unicode class expression and /// return it. fn unwrap_class_unicode(self) -> hir::ClassUnicode { match self { HirFrame::ClassUnicode(cls) => cls, _ => panic!( "tried to unwrap Unicode class \ from HirFrame, got: {:?}", self ), } } /// Assert that the current stack frame is a byte class expression and /// return it. fn unwrap_class_bytes(self) -> hir::ClassBytes { match self { HirFrame::ClassBytes(cls) => cls, _ => panic!( "tried to unwrap byte class \ from HirFrame, got: {:?}", self ), } } /// Assert that the current stack frame is a repetition sentinel. If it /// isn't, then panic. fn unwrap_repetition(self) { match self { HirFrame::Repetition => {} _ => { panic!( "tried to unwrap repetition from HirFrame, got: {:?}", self ) } } } /// Assert that the current stack frame is a group indicator and return /// its corresponding flags (the flags that were active at the time the /// group was entered). fn unwrap_group(self) -> Flags { match self { HirFrame::Group { old_flags } => old_flags, _ => { panic!("tried to unwrap group from HirFrame, got: {:?}", self) } } } /// Assert that the current stack frame is an alternation pipe sentinel. If /// it isn't, then panic. 
fn unwrap_alternation_pipe(self) { match self { HirFrame::AlternationBranch => {} _ => { panic!( "tried to unwrap alt pipe from HirFrame, got: {:?}", self ) } } } } impl<'t, 'p> Visitor for TranslatorI<'t, 'p> { type Output = Hir; type Err = Error; fn finish(self) -> Result { // ... otherwise, we should have exactly one HIR on the stack. assert_eq!(self.trans().stack.borrow().len(), 1); Ok(self.pop().unwrap().unwrap_expr()) } fn visit_pre(&mut self, ast: &Ast) -> Result<()> { match *ast { Ast::ClassBracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); } else { let cls = hir::ClassBytes::empty(); self.push(HirFrame::ClassBytes(cls)); } } Ast::Repetition(_) => self.push(HirFrame::Repetition), Ast::Group(ref x) => { let old_flags = x .flags() .map(|ast| self.set_flags(ast)) .unwrap_or_else(|| self.flags()); self.push(HirFrame::Group { old_flags }); } Ast::Concat(_) => { self.push(HirFrame::Concat); } Ast::Alternation(ref x) => { self.push(HirFrame::Alternation); if !x.asts.is_empty() { self.push(HirFrame::AlternationBranch); } } _ => {} } Ok(()) } fn visit_post(&mut self, ast: &Ast) -> Result<()> { match *ast { Ast::Empty(_) => { self.push(HirFrame::Expr(Hir::empty())); } Ast::Flags(ref x) => { self.set_flags(&x.flags); // Flags in the AST are generally considered directives and // not actual sub-expressions. However, they can be used in // the concrete syntax like `((?i))`, and we need some kind of // indication of an expression there, and Empty is the correct // choice. // // There can also be things like `(?i)+`, but we rule those out // in the parser. In the future, we might allow them for // consistency sake. self.push(HirFrame::Expr(Hir::empty())); } Ast::Literal(ref x) => match self.ast_literal_to_scalar(x)? { Either::Right(byte) => self.push_byte(byte), Either::Left(ch) => match self.case_fold_char(x.span, ch)? 
{ None => self.push_char(ch), Some(expr) => self.push(HirFrame::Expr(expr)), }, }, Ast::Dot(ref span) => { self.push(HirFrame::Expr(self.hir_dot(**span)?)); } Ast::Assertion(ref x) => { self.push(HirFrame::Expr(self.hir_assertion(x)?)); } Ast::ClassPerl(ref x) => { if self.flags().unicode() { let cls = self.hir_perl_unicode_class(x)?; let hcls = hir::Class::Unicode(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } else { let cls = self.hir_perl_byte_class(x)?; let hcls = hir::Class::Bytes(cls); self.push(HirFrame::Expr(Hir::class(hcls))); } } Ast::ClassUnicode(ref x) => { let cls = hir::Class::Unicode(self.hir_unicode_class(x)?); self.push(HirFrame::Expr(Hir::class(cls))); } Ast::ClassBracketed(ref ast) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( &ast.span, ast.negated, &mut cls, )?; let expr = Hir::class(hir::Class::Unicode(cls)); self.push(HirFrame::Expr(expr)); } else { let mut cls = self.pop().unwrap().unwrap_class_bytes(); self.bytes_fold_and_negate( &ast.span, ast.negated, &mut cls, )?; let expr = Hir::class(hir::Class::Bytes(cls)); self.push(HirFrame::Expr(expr)); } } Ast::Repetition(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); self.pop().unwrap().unwrap_repetition(); self.push(HirFrame::Expr(self.hir_repetition(x, expr))); } Ast::Group(ref x) => { let expr = self.pop().unwrap().unwrap_expr(); let old_flags = self.pop().unwrap().unwrap_group(); self.trans().flags.set(old_flags); self.push(HirFrame::Expr(self.hir_capture(x, expr))); } Ast::Concat(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_concat_expr() { if !matches!(*expr.kind(), HirKind::Empty) { exprs.push(expr); } } exprs.reverse(); self.push(HirFrame::Expr(Hir::concat(exprs))); } Ast::Alternation(_) => { let mut exprs = vec![]; while let Some(expr) = self.pop_alt_expr() { self.pop().unwrap().unwrap_alternation_pipe(); exprs.push(expr); } exprs.reverse(); 
self.push(HirFrame::Expr(Hir::alternation(exprs))); } } Ok(()) } fn visit_alternation_in(&mut self) -> Result<()> { self.push(HirFrame::AlternationBranch); Ok(()) } fn visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()> { match *ast { ast::ClassSetItem::Bracketed(_) => { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); } else { let cls = hir::ClassBytes::empty(); self.push(HirFrame::ClassBytes(cls)); } } // We needn't handle the Union case here since the visitor will // do it for us. _ => {} } Ok(()) } fn visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()> { match *ast { ast::ClassSetItem::Empty(_) => {} ast::ClassSetItem::Literal(ref x) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.push(hir::ClassUnicodeRange::new(x.c, x.c)); self.push(HirFrame::ClassUnicode(cls)); } else { let mut cls = self.pop().unwrap().unwrap_class_bytes(); let byte = self.class_literal_byte(x)?; cls.push(hir::ClassBytesRange::new(byte, byte)); self.push(HirFrame::ClassBytes(cls)); } } ast::ClassSetItem::Range(ref x) => { if self.flags().unicode() { let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c)); self.push(HirFrame::ClassUnicode(cls)); } else { let mut cls = self.pop().unwrap().unwrap_class_bytes(); let start = self.class_literal_byte(&x.start)?; let end = self.class_literal_byte(&x.end)?; cls.push(hir::ClassBytesRange::new(start, end)); self.push(HirFrame::ClassBytes(cls)); } } ast::ClassSetItem::Ascii(ref x) => { if self.flags().unicode() { let xcls = self.hir_ascii_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { let xcls = self.hir_ascii_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); } } 
ast::ClassSetItem::Unicode(ref x) => { let xcls = self.hir_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } ast::ClassSetItem::Perl(ref x) => { if self.flags().unicode() { let xcls = self.hir_perl_unicode_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_unicode(); cls.union(&xcls); self.push(HirFrame::ClassUnicode(cls)); } else { let xcls = self.hir_perl_byte_class(x)?; let mut cls = self.pop().unwrap().unwrap_class_bytes(); cls.union(&xcls); self.push(HirFrame::ClassBytes(cls)); } } ast::ClassSetItem::Bracketed(ref ast) => { if self.flags().unicode() { let mut cls1 = self.pop().unwrap().unwrap_class_unicode(); self.unicode_fold_and_negate( &ast.span, ast.negated, &mut cls1, )?; let mut cls2 = self.pop().unwrap().unwrap_class_unicode(); cls2.union(&cls1); self.push(HirFrame::ClassUnicode(cls2)); } else { let mut cls1 = self.pop().unwrap().unwrap_class_bytes(); self.bytes_fold_and_negate( &ast.span, ast.negated, &mut cls1, )?; let mut cls2 = self.pop().unwrap().unwrap_class_bytes(); cls2.union(&cls1); self.push(HirFrame::ClassBytes(cls2)); } } // This is handled automatically by the visitor. 
ast::ClassSetItem::Union(_) => {} } Ok(()) } fn visit_class_set_binary_op_pre( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()> { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); } else { let cls = hir::ClassBytes::empty(); self.push(HirFrame::ClassBytes(cls)); } Ok(()) } fn visit_class_set_binary_op_in( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()> { if self.flags().unicode() { let cls = hir::ClassUnicode::empty(); self.push(HirFrame::ClassUnicode(cls)); } else { let cls = hir::ClassBytes::empty(); self.push(HirFrame::ClassBytes(cls)); } Ok(()) } fn visit_class_set_binary_op_post( &mut self, op: &ast::ClassSetBinaryOp, ) -> Result<()> { use crate::ast::ClassSetBinaryOpKind::*; if self.flags().unicode() { let mut rhs = self.pop().unwrap().unwrap_class_unicode(); let mut lhs = self.pop().unwrap().unwrap_class_unicode(); let mut cls = self.pop().unwrap().unwrap_class_unicode(); if self.flags().case_insensitive() { rhs.try_case_fold_simple().map_err(|_| { self.error( op.rhs.span().clone(), ErrorKind::UnicodeCaseUnavailable, ) })?; lhs.try_case_fold_simple().map_err(|_| { self.error( op.lhs.span().clone(), ErrorKind::UnicodeCaseUnavailable, ) })?; } match op.kind { Intersection => lhs.intersect(&rhs), Difference => lhs.difference(&rhs), SymmetricDifference => lhs.symmetric_difference(&rhs), } cls.union(&lhs); self.push(HirFrame::ClassUnicode(cls)); } else { let mut rhs = self.pop().unwrap().unwrap_class_bytes(); let mut lhs = self.pop().unwrap().unwrap_class_bytes(); let mut cls = self.pop().unwrap().unwrap_class_bytes(); if self.flags().case_insensitive() { rhs.case_fold_simple(); lhs.case_fold_simple(); } match op.kind { Intersection => lhs.intersect(&rhs), Difference => lhs.difference(&rhs), SymmetricDifference => lhs.symmetric_difference(&rhs), } cls.union(&lhs); self.push(HirFrame::ClassBytes(cls)); } Ok(()) } } /// The internal implementation of a translator. 
/// /// This type is responsible for carrying around the original pattern string, /// which is not tied to the internal state of a translator. /// /// A TranslatorI exists for the time it takes to translate a single Ast. #[derive(Clone, Debug)] struct TranslatorI<'t, 'p> { trans: &'t Translator, pattern: &'p str, } impl<'t, 'p> TranslatorI<'t, 'p> { /// Build a new internal translator. fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> { TranslatorI { trans, pattern } } /// Return a reference to the underlying translator. fn trans(&self) -> &Translator { &self.trans } /// Push the given frame on to the call stack. fn push(&self, frame: HirFrame) { self.trans().stack.borrow_mut().push(frame); } /// Push the given literal char on to the call stack. /// /// If the top-most element of the stack is a literal, then the char /// is appended to the end of that literal. Otherwise, a new literal /// containing just the given char is pushed to the top of the stack. fn push_char(&self, ch: char) { let mut buf = [0; 4]; let bytes = ch.encode_utf8(&mut buf).as_bytes(); let mut stack = self.trans().stack.borrow_mut(); if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { literal.extend_from_slice(bytes); } else { stack.push(HirFrame::Literal(bytes.to_vec())); } } /// Push the given literal byte on to the call stack. /// /// If the top-most element of the stack is a literal, then the byte /// is appended to the end of that literal. Otherwise, a new literal /// containing just the given byte is pushed to the top of the stack. fn push_byte(&self, byte: u8) { let mut stack = self.trans().stack.borrow_mut(); if let Some(HirFrame::Literal(ref mut literal)) = stack.last_mut() { literal.push(byte); } else { stack.push(HirFrame::Literal(vec![byte])); } } /// Pop the top of the call stack. If the call stack is empty, return None. 
fn pop(&self) -> Option { self.trans().stack.borrow_mut().pop() } /// Pop an HIR expression from the top of the stack for a concatenation. /// /// This returns None if the stack is empty or when a concat frame is seen. /// Otherwise, it panics if it could not find an HIR expression. fn pop_concat_expr(&self) -> Option { let frame = self.pop()?; match frame { HirFrame::Concat => None, HirFrame::Expr(expr) => Some(expr), HirFrame::Literal(lit) => Some(Hir::literal(lit)), HirFrame::ClassUnicode(_) => { unreachable!("expected expr or concat, got Unicode class") } HirFrame::ClassBytes(_) => { unreachable!("expected expr or concat, got byte class") } HirFrame::Repetition => { unreachable!("expected expr or concat, got repetition") } HirFrame::Group { .. } => { unreachable!("expected expr or concat, got group") } HirFrame::Alternation => { unreachable!("expected expr or concat, got alt marker") } HirFrame::AlternationBranch => { unreachable!("expected expr or concat, got alt branch marker") } } } /// Pop an HIR expression from the top of the stack for an alternation. /// /// This returns None if the stack is empty or when an alternation frame is /// seen. Otherwise, it panics if it could not find an HIR expression. fn pop_alt_expr(&self) -> Option { let frame = self.pop()?; match frame { HirFrame::Alternation => None, HirFrame::Expr(expr) => Some(expr), HirFrame::Literal(lit) => Some(Hir::literal(lit)), HirFrame::ClassUnicode(_) => { unreachable!("expected expr or alt, got Unicode class") } HirFrame::ClassBytes(_) => { unreachable!("expected expr or alt, got byte class") } HirFrame::Repetition => { unreachable!("expected expr or alt, got repetition") } HirFrame::Group { .. } => { unreachable!("expected expr or alt, got group") } HirFrame::Concat => { unreachable!("expected expr or alt, got concat marker") } HirFrame::AlternationBranch => { unreachable!("expected expr or alt, got alt branch marker") } } } /// Create a new error with the given span and error type. 
fn error(&self, span: Span, kind: ErrorKind) -> Error { Error { kind, pattern: self.pattern.to_string(), span } } /// Return a copy of the active flags. fn flags(&self) -> Flags { self.trans().flags.get() } /// Set the flags of this translator from the flags set in the given AST. /// Then, return the old flags. fn set_flags(&self, ast_flags: &ast::Flags) -> Flags { let old_flags = self.flags(); let mut new_flags = Flags::from_ast(ast_flags); new_flags.merge(&old_flags); self.trans().flags.set(new_flags); old_flags } /// Convert an Ast literal to its scalar representation. /// /// When Unicode mode is enabled, then this always succeeds and returns a /// `char` (Unicode scalar value). /// /// When Unicode mode is disabled, then a `char` will still be returned /// whenever possible. A byte is returned only when invalid UTF-8 is /// allowed and when the byte is not ASCII. Otherwise, a non-ASCII byte /// will result in an error when invalid UTF-8 is not allowed. fn ast_literal_to_scalar( &self, lit: &ast::Literal, ) -> Result> { if self.flags().unicode() { return Ok(Either::Left(lit.c)); } let byte = match lit.byte() { None => return Ok(Either::Left(lit.c)), Some(byte) => byte, }; if byte <= 0x7F { return Ok(Either::Left(char::try_from(byte).unwrap())); } if self.trans().utf8 { return Err(self.error(lit.span, ErrorKind::InvalidUtf8)); } Ok(Either::Right(byte)) } fn case_fold_char(&self, span: Span, c: char) -> Result> { if !self.flags().case_insensitive() { return Ok(None); } if self.flags().unicode() { // If case folding won't do anything, then don't bother trying. 
let map = unicode::SimpleCaseFolder::new() .map(|f| f.overlaps(c, c)) .map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; if !map { return Ok(None); } let mut cls = hir::ClassUnicode::new(vec![hir::ClassUnicodeRange::new( c, c, )]); cls.try_case_fold_simple().map_err(|_| { self.error(span, ErrorKind::UnicodeCaseUnavailable) })?; Ok(Some(Hir::class(hir::Class::Unicode(cls)))) } else { if !c.is_ascii() { return Ok(None); } // If case folding won't do anything, then don't bother trying. match c { 'A'..='Z' | 'a'..='z' => {} _ => return Ok(None), } let mut cls = hir::ClassBytes::new(vec![hir::ClassBytesRange::new( // OK because 'c.len_utf8() == 1' which in turn implies // that 'c' is ASCII. u8::try_from(c).unwrap(), u8::try_from(c).unwrap(), )]); cls.case_fold_simple(); Ok(Some(Hir::class(hir::Class::Bytes(cls)))) } } fn hir_dot(&self, span: Span) -> Result { let (utf8, lineterm, flags) = (self.trans().utf8, self.trans().line_terminator, self.flags()); if utf8 && (!flags.unicode() || !lineterm.is_ascii()) { return Err(self.error(span, ErrorKind::InvalidUtf8)); } let dot = if flags.dot_matches_new_line() { if flags.unicode() { hir::Dot::AnyChar } else { hir::Dot::AnyByte } } else { if flags.unicode() { if flags.crlf() { hir::Dot::AnyCharExceptCRLF } else { if !lineterm.is_ascii() { return Err( self.error(span, ErrorKind::InvalidLineTerminator) ); } hir::Dot::AnyCharExcept(char::from(lineterm)) } } else { if flags.crlf() { hir::Dot::AnyByteExceptCRLF } else { hir::Dot::AnyByteExcept(lineterm) } } }; Ok(Hir::dot(dot)) } fn hir_assertion(&self, asst: &ast::Assertion) -> Result { let unicode = self.flags().unicode(); let multi_line = self.flags().multi_line(); let crlf = self.flags().crlf(); Ok(match asst.kind { ast::AssertionKind::StartLine => Hir::look(if multi_line { if crlf { hir::Look::StartCRLF } else { hir::Look::StartLF } } else { hir::Look::Start }), ast::AssertionKind::EndLine => Hir::look(if multi_line { if crlf { hir::Look::EndCRLF } else { 
hir::Look::EndLF } } else { hir::Look::End }), ast::AssertionKind::StartText => Hir::look(hir::Look::Start), ast::AssertionKind::EndText => Hir::look(hir::Look::End), ast::AssertionKind::WordBoundary => Hir::look(if unicode { hir::Look::WordUnicode } else { hir::Look::WordAscii }), ast::AssertionKind::NotWordBoundary => Hir::look(if unicode { hir::Look::WordUnicodeNegate } else { hir::Look::WordAsciiNegate }), ast::AssertionKind::WordBoundaryStart | ast::AssertionKind::WordBoundaryStartAngle => { Hir::look(if unicode { hir::Look::WordStartUnicode } else { hir::Look::WordStartAscii }) } ast::AssertionKind::WordBoundaryEnd | ast::AssertionKind::WordBoundaryEndAngle => { Hir::look(if unicode { hir::Look::WordEndUnicode } else { hir::Look::WordEndAscii }) } ast::AssertionKind::WordBoundaryStartHalf => { Hir::look(if unicode { hir::Look::WordStartHalfUnicode } else { hir::Look::WordStartHalfAscii }) } ast::AssertionKind::WordBoundaryEndHalf => Hir::look(if unicode { hir::Look::WordEndHalfUnicode } else { hir::Look::WordEndHalfAscii }), }) } fn hir_capture(&self, group: &ast::Group, expr: Hir) -> Hir { let (index, name) = match group.kind { ast::GroupKind::CaptureIndex(index) => (index, None), ast::GroupKind::CaptureName { ref name, .. } => { (name.index, Some(name.name.clone().into_boxed_str())) } // The HIR doesn't need to use non-capturing groups, since the way // in which the data type is defined handles this automatically. 
ast::GroupKind::NonCapturing(_) => return expr, }; Hir::capture(hir::Capture { index, name, sub: Box::new(expr) }) } fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir { let (min, max) = match rep.op.kind { ast::RepetitionKind::ZeroOrOne => (0, Some(1)), ast::RepetitionKind::ZeroOrMore => (0, None), ast::RepetitionKind::OneOrMore => (1, None), ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => { (m, Some(m)) } ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => { (m, None) } ast::RepetitionKind::Range(ast::RepetitionRange::Bounded( m, n, )) => (m, Some(n)), }; let greedy = if self.flags().swap_greed() { !rep.greedy } else { rep.greedy }; Hir::repetition(hir::Repetition { min, max, greedy, sub: Box::new(expr), }) } fn hir_unicode_class( &self, ast_class: &ast::ClassUnicode, ) -> Result { use crate::ast::ClassUnicodeKind::*; if !self.flags().unicode() { return Err( self.error(ast_class.span, ErrorKind::UnicodeNotAllowed) ); } let query = match ast_class.kind { OneLetter(name) => ClassQuery::OneLetter(name), Named(ref name) => ClassQuery::Binary(name), NamedValue { ref name, ref value, .. 
} => ClassQuery::ByValue { property_name: name, property_value: value, }, }; let mut result = self.convert_unicode_class_error( &ast_class.span, unicode::class(query), ); if let Ok(ref mut class) = result { self.unicode_fold_and_negate( &ast_class.span, ast_class.negated, class, )?; } result } fn hir_ascii_unicode_class( &self, ast: &ast::ClassAscii, ) -> Result { let mut cls = hir::ClassUnicode::new( ascii_class_as_chars(&ast.kind) .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), ); self.unicode_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) } fn hir_ascii_byte_class( &self, ast: &ast::ClassAscii, ) -> Result { let mut cls = hir::ClassBytes::new( ascii_class(&ast.kind) .map(|(s, e)| hir::ClassBytesRange::new(s, e)), ); self.bytes_fold_and_negate(&ast.span, ast.negated, &mut cls)?; Ok(cls) } fn hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, ) -> Result { use crate::ast::ClassPerlKind::*; assert!(self.flags().unicode()); let result = match ast_class.kind { Digit => unicode::perl_digit(), Space => unicode::perl_space(), Word => unicode::perl_word(), }; let mut class = self.convert_unicode_class_error(&ast_class.span, result)?; // We needn't apply case folding here because the Perl Unicode classes // are already closed under Unicode simple case folding. if ast_class.negated { class.negate(); } Ok(class) } fn hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, ) -> Result { use crate::ast::ClassPerlKind::*; assert!(!self.flags().unicode()); let mut class = match ast_class.kind { Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit), Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space), Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word), }; // We needn't apply case folding here because the Perl ASCII classes // are already closed (under ASCII case folding). if ast_class.negated { class.negate(); } // Negating a Perl byte class is likely to cause it to match invalid // UTF-8. 
That's only OK if the translator is configured to allow such // things. if self.trans().utf8 && !class.is_ascii() { return Err(self.error(ast_class.span, ErrorKind::InvalidUtf8)); } Ok(class) } /// Converts the given Unicode specific error to an HIR translation error. /// /// The span given should approximate the position at which an error would /// occur. fn convert_unicode_class_error( &self, span: &Span, result: core::result::Result, ) -> Result { result.map_err(|err| { let sp = span.clone(); match err { unicode::Error::PropertyNotFound => { self.error(sp, ErrorKind::UnicodePropertyNotFound) } unicode::Error::PropertyValueNotFound => { self.error(sp, ErrorKind::UnicodePropertyValueNotFound) } unicode::Error::PerlClassNotFound => { self.error(sp, ErrorKind::UnicodePerlClassNotFound) } } }) } fn unicode_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassUnicode, ) -> Result<()> { // Note that we must apply case folding before negation! // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. if self.flags().case_insensitive() { class.try_case_fold_simple().map_err(|_| { self.error(span.clone(), ErrorKind::UnicodeCaseUnavailable) })?; } if negated { class.negate(); } Ok(()) } fn bytes_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassBytes, ) -> Result<()> { // Note that we must apply case folding before negation! // Consider `(?i)[^x]`. If we applied negation first, then // the result would be the character class that matched any // Unicode scalar value. if self.flags().case_insensitive() { class.case_fold_simple(); } if negated { class.negate(); } if self.trans().utf8 && !class.is_ascii() { return Err(self.error(span.clone(), ErrorKind::InvalidUtf8)); } Ok(()) } /// Return a scalar byte value suitable for use as a literal in a byte /// character class. 
fn class_literal_byte(&self, ast: &ast::Literal) -> Result { match self.ast_literal_to_scalar(ast)? { Either::Right(byte) => Ok(byte), Either::Left(ch) => { if ch.is_ascii() { Ok(u8::try_from(ch).unwrap()) } else { // We can't feasibly support Unicode in // byte oriented classes. Byte classes don't // do Unicode case folding. Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed)) } } } } } /// A translator's representation of a regular expression's flags at any given /// moment in time. /// /// Each flag can be in one of three states: absent, present but disabled or /// present but enabled. #[derive(Clone, Copy, Debug, Default)] struct Flags { case_insensitive: Option, multi_line: Option, dot_matches_new_line: Option, swap_greed: Option, unicode: Option, crlf: Option, // Note that `ignore_whitespace` is omitted here because it is handled // entirely in the parser. } impl Flags { fn from_ast(ast: &ast::Flags) -> Flags { let mut flags = Flags::default(); let mut enable = true; for item in &ast.items { match item.kind { ast::FlagsItemKind::Negation => { enable = false; } ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => { flags.case_insensitive = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => { flags.multi_line = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => { flags.dot_matches_new_line = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => { flags.swap_greed = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::Unicode) => { flags.unicode = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::CRLF) => { flags.crlf = Some(enable); } ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {} } } flags } fn merge(&mut self, previous: &Flags) { if self.case_insensitive.is_none() { self.case_insensitive = previous.case_insensitive; } if self.multi_line.is_none() { self.multi_line = previous.multi_line; } if self.dot_matches_new_line.is_none() { self.dot_matches_new_line = previous.dot_matches_new_line; } 
if self.swap_greed.is_none() { self.swap_greed = previous.swap_greed; } if self.unicode.is_none() { self.unicode = previous.unicode; } if self.crlf.is_none() { self.crlf = previous.crlf; } } fn case_insensitive(&self) -> bool { self.case_insensitive.unwrap_or(false) } fn multi_line(&self) -> bool { self.multi_line.unwrap_or(false) } fn dot_matches_new_line(&self) -> bool { self.dot_matches_new_line.unwrap_or(false) } fn swap_greed(&self) -> bool { self.swap_greed.unwrap_or(false) } fn unicode(&self) -> bool { self.unicode.unwrap_or(true) } fn crlf(&self) -> bool { self.crlf.unwrap_or(false) } } fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes { let ranges: Vec<_> = ascii_class(kind) .map(|(s, e)| hir::ClassBytesRange::new(s, e)) .collect(); hir::ClassBytes::new(ranges) } fn ascii_class(kind: &ast::ClassAsciiKind) -> impl Iterator { use crate::ast::ClassAsciiKind::*; let slice: &'static [(u8, u8)] = match *kind { Alnum => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')], Alpha => &[(b'A', b'Z'), (b'a', b'z')], Ascii => &[(b'\x00', b'\x7F')], Blank => &[(b'\t', b'\t'), (b' ', b' ')], Cntrl => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')], Digit => &[(b'0', b'9')], Graph => &[(b'!', b'~')], Lower => &[(b'a', b'z')], Print => &[(b' ', b'~')], Punct => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')], Space => &[ (b'\t', b'\t'), (b'\n', b'\n'), (b'\x0B', b'\x0B'), (b'\x0C', b'\x0C'), (b'\r', b'\r'), (b' ', b' '), ], Upper => &[(b'A', b'Z')], Word => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')], Xdigit => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')], }; slice.iter().copied() } fn ascii_class_as_chars( kind: &ast::ClassAsciiKind, ) -> impl Iterator { ascii_class(kind).map(|(s, e)| (char::from(s), char::from(e))) } #[cfg(test)] mod tests { use crate::{ ast::{self, parse::ParserBuilder, Ast, Position, Span}, hir::{self, Hir, HirKind, Look, Properties}, unicode::{self, ClassQuery}, }; use super::*; // We create these errors to compare with 
real hir::Errors in the tests. // We define equality between TestError and hir::Error to disregard the // pattern string in hir::Error, which is annoying to provide in tests. #[derive(Clone, Debug)] struct TestError { span: Span, kind: hir::ErrorKind, } impl PartialEq for TestError { fn eq(&self, other: &hir::Error) -> bool { self.span == other.span && self.kind == other.kind } } impl PartialEq for hir::Error { fn eq(&self, other: &TestError) -> bool { self.span == other.span && self.kind == other.kind } } fn parse(pattern: &str) -> Ast { ParserBuilder::new().octal(true).build().parse(pattern).unwrap() } fn t(pattern: &str) -> Hir { TranslatorBuilder::new() .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap() } fn t_err(pattern: &str) -> hir::Error { TranslatorBuilder::new() .utf8(true) .build() .translate(pattern, &parse(pattern)) .unwrap_err() } fn t_bytes(pattern: &str) -> Hir { TranslatorBuilder::new() .utf8(false) .build() .translate(pattern, &parse(pattern)) .unwrap() } fn props(pattern: &str) -> Properties { t(pattern).properties().clone() } fn props_bytes(pattern: &str) -> Properties { t_bytes(pattern).properties().clone() } fn hir_lit(s: &str) -> Hir { hir_blit(s.as_bytes()) } fn hir_blit(s: &[u8]) -> Hir { Hir::literal(s) } fn hir_capture(index: u32, expr: Hir) -> Hir { Hir::capture(hir::Capture { index, name: None, sub: Box::new(expr) }) } fn hir_capture_name(index: u32, name: &str, expr: Hir) -> Hir { Hir::capture(hir::Capture { index, name: Some(name.into()), sub: Box::new(expr), }) } fn hir_quest(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { min: 0, max: Some(1), greedy, sub: Box::new(expr), }) } fn hir_star(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { min: 0, max: None, greedy, sub: Box::new(expr), }) } fn hir_plus(greedy: bool, expr: Hir) -> Hir { Hir::repetition(hir::Repetition { min: 1, max: None, greedy, sub: Box::new(expr), }) } fn hir_range(greedy: bool, min: u32, max: Option, expr: 
Hir) -> Hir { Hir::repetition(hir::Repetition { min, max, greedy, sub: Box::new(expr), }) } fn hir_alt(alts: Vec) -> Hir { Hir::alternation(alts) } fn hir_cat(exprs: Vec) -> Hir { Hir::concat(exprs) } #[allow(dead_code)] fn hir_uclass_query(query: ClassQuery<'_>) -> Hir { Hir::class(hir::Class::Unicode(unicode::class(query).unwrap())) } #[allow(dead_code)] fn hir_uclass_perl_word() -> Hir { Hir::class(hir::Class::Unicode(unicode::perl_word().unwrap())) } fn hir_ascii_uclass(kind: &ast::ClassAsciiKind) -> Hir { Hir::class(hir::Class::Unicode(hir::ClassUnicode::new( ascii_class_as_chars(kind) .map(|(s, e)| hir::ClassUnicodeRange::new(s, e)), ))) } fn hir_ascii_bclass(kind: &ast::ClassAsciiKind) -> Hir { Hir::class(hir::Class::Bytes(hir::ClassBytes::new( ascii_class(kind).map(|(s, e)| hir::ClassBytesRange::new(s, e)), ))) } fn hir_uclass(ranges: &[(char, char)]) -> Hir { Hir::class(uclass(ranges)) } fn hir_bclass(ranges: &[(u8, u8)]) -> Hir { Hir::class(bclass(ranges)) } fn hir_case_fold(expr: Hir) -> Hir { match expr.into_kind() { HirKind::Class(mut cls) => { cls.case_fold_simple(); Hir::class(cls) } _ => panic!("cannot case fold non-class Hir expr"), } } fn hir_negate(expr: Hir) -> Hir { match expr.into_kind() { HirKind::Class(mut cls) => { cls.negate(); Hir::class(cls) } _ => panic!("cannot negate non-class Hir expr"), } } fn uclass(ranges: &[(char, char)]) -> hir::Class { let ranges: Vec = ranges .iter() .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) .collect(); hir::Class::Unicode(hir::ClassUnicode::new(ranges)) } fn bclass(ranges: &[(u8, u8)]) -> hir::Class { let ranges: Vec = ranges .iter() .map(|&(s, e)| hir::ClassBytesRange::new(s, e)) .collect(); hir::Class::Bytes(hir::ClassBytes::new(ranges)) } #[cfg(feature = "unicode-case")] fn class_case_fold(mut cls: hir::Class) -> Hir { cls.case_fold_simple(); Hir::class(cls) } fn class_negate(mut cls: hir::Class) -> Hir { cls.negate(); Hir::class(cls) } #[allow(dead_code)] fn hir_union(expr1: Hir, expr2: Hir) -> 
Hir { use crate::hir::Class::{Bytes, Unicode}; match (expr1.into_kind(), expr2.into_kind()) { (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { c1.union(&c2); Hir::class(hir::Class::Unicode(c1)) } (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { c1.union(&c2); Hir::class(hir::Class::Bytes(c1)) } _ => panic!("cannot union non-class Hir exprs"), } } #[allow(dead_code)] fn hir_difference(expr1: Hir, expr2: Hir) -> Hir { use crate::hir::Class::{Bytes, Unicode}; match (expr1.into_kind(), expr2.into_kind()) { (HirKind::Class(Unicode(mut c1)), HirKind::Class(Unicode(c2))) => { c1.difference(&c2); Hir::class(hir::Class::Unicode(c1)) } (HirKind::Class(Bytes(mut c1)), HirKind::Class(Bytes(c2))) => { c1.difference(&c2); Hir::class(hir::Class::Bytes(c1)) } _ => panic!("cannot difference non-class Hir exprs"), } } fn hir_look(look: hir::Look) -> Hir { Hir::look(look) } #[test] fn empty() { assert_eq!(t(""), Hir::empty()); assert_eq!(t("(?i)"), Hir::empty()); assert_eq!(t("()"), hir_capture(1, Hir::empty())); assert_eq!(t("(?:)"), Hir::empty()); assert_eq!(t("(?P)"), hir_capture_name(1, "wat", Hir::empty())); assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()])); assert_eq!( t("()|()"), hir_alt(vec![ hir_capture(1, Hir::empty()), hir_capture(2, Hir::empty()), ]) ); assert_eq!( t("(|b)"), hir_capture(1, hir_alt(vec![Hir::empty(), hir_lit("b"),])) ); assert_eq!( t("(a|)"), hir_capture(1, hir_alt(vec![hir_lit("a"), Hir::empty(),])) ); assert_eq!( t("(a||c)"), hir_capture( 1, hir_alt(vec![hir_lit("a"), Hir::empty(), hir_lit("c"),]) ) ); assert_eq!( t("(||)"), hir_capture( 1, hir_alt(vec![Hir::empty(), Hir::empty(), Hir::empty(),]) ) ); } #[test] fn literal() { assert_eq!(t("a"), hir_lit("a")); assert_eq!(t("(?-u)a"), hir_lit("a")); assert_eq!(t("☃"), hir_lit("☃")); assert_eq!(t("abcd"), hir_lit("abcd")); assert_eq!(t_bytes("(?-u)a"), hir_lit("a")); assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\x61"), 
hir_lit("a")); assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF")); assert_eq!(t("(?-u)☃"), hir_lit("☃")); assert_eq!( t_err(r"(?-u)\xFF"), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(5, 1, 6), Position::new(9, 1, 10) ), } ); } #[test] fn literal_case_insensitive() { #[cfg(feature = "unicode-case")] assert_eq!(t("(?i)a"), hir_uclass(&[('A', 'A'), ('a', 'a'),])); #[cfg(feature = "unicode-case")] assert_eq!(t("(?i:a)"), hir_uclass(&[('A', 'A'), ('a', 'a')])); #[cfg(feature = "unicode-case")] assert_eq!( t("a(?i)a(?-i)a"), hir_cat(vec![ hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)ab@c"), hir_cat(vec![ hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_uclass(&[('B', 'B'), ('b', 'b')]), hir_lit("@"), hir_uclass(&[('C', 'C'), ('c', 'c')]), ]) ); #[cfg(feature = "unicode-case")] assert_eq!( t("(?i)β"), hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),]) ); assert_eq!(t("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),])); #[cfg(feature = "unicode-case")] assert_eq!( t("(?-u)a(?i)a(?-i)a"), hir_cat(vec![ hir_lit("a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_lit("a"), ]) ); assert_eq!( t("(?i-u)ab@c"), hir_cat(vec![ hir_bclass(&[(b'A', b'A'), (b'a', b'a')]), hir_bclass(&[(b'B', b'B'), (b'b', b'b')]), hir_lit("@"), hir_bclass(&[(b'C', b'C'), (b'c', b'c')]), ]) ); assert_eq!( t_bytes("(?i-u)a"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) ); assert_eq!( t_bytes("(?i-u)\x61"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) ); assert_eq!( t_bytes(r"(?i-u)\x61"), hir_bclass(&[(b'A', b'A'), (b'a', b'a'),]) ); assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF")); assert_eq!(t("(?i-u)β"), hir_lit("β"),); } #[test] fn dot() { assert_eq!( t("."), hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]) ); assert_eq!( t("(?R)."), hir_uclass(&[ ('\0', '\t'), ('\x0B', '\x0C'), ('\x0E', '\u{10FFFF}'), ]) ); assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')])); 
assert_eq!(t("(?Rs)."), hir_uclass(&[('\0', '\u{10FFFF}')])); assert_eq!( t_bytes("(?-u)."), hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]) ); assert_eq!( t_bytes("(?R-u)."), hir_bclass(&[ (b'\0', b'\t'), (b'\x0B', b'\x0C'), (b'\x0E', b'\xFF'), ]) ); assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); assert_eq!(t_bytes("(?Rs-u)."), hir_bclass(&[(b'\0', b'\xFF'),])); // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed. assert_eq!( t_err("(?-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(5, 1, 6), Position::new(6, 1, 7) ), } ); assert_eq!( t_err("(?R-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(6, 1, 7), Position::new(7, 1, 8) ), } ); assert_eq!( t_err("(?s-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(6, 1, 7), Position::new(7, 1, 8) ), } ); assert_eq!( t_err("(?Rs-u)."), TestError { kind: hir::ErrorKind::InvalidUtf8, span: Span::new( Position::new(7, 1, 8), Position::new(8, 1, 9) ), } ); } #[test] fn assertions() { assert_eq!(t("^"), hir_look(hir::Look::Start)); assert_eq!(t("$"), hir_look(hir::Look::End)); assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"\z"), hir_look(hir::Look::End)); assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); assert_eq!(t(r"\b"), hir_look(hir::Look::WordUnicode)); assert_eq!(t(r"\B"), hir_look(hir::Look::WordUnicodeNegate)); assert_eq!(t(r"(?-u)\b"), hir_look(hir::Look::WordAscii)); assert_eq!(t(r"(?-u)\B"), hir_look(hir::Look::WordAsciiNegate)); } #[test] fn group() { assert_eq!(t("(a)"), hir_capture(1, hir_lit("a"))); assert_eq!( t("(a)(b)"), hir_cat(vec![ hir_capture(1, hir_lit("a")), hir_capture(2, hir_lit("b")), ]) ); assert_eq!( t("(a)|(b)"), hir_alt(vec![ hir_capture(1, hir_lit("a")), hir_capture(2, 
hir_lit("b")), ]) ); assert_eq!(t("(?P)"), hir_capture_name(1, "foo", Hir::empty())); assert_eq!(t("(?Pa)"), hir_capture_name(1, "foo", hir_lit("a"))); assert_eq!( t("(?Pa)(?Pb)"), hir_cat(vec![ hir_capture_name(1, "foo", hir_lit("a")), hir_capture_name(2, "bar", hir_lit("b")), ]) ); assert_eq!(t("(?:)"), Hir::empty()); assert_eq!(t("(?:a)"), hir_lit("a")); assert_eq!( t("(?:a)(b)"), hir_cat(vec![hir_lit("a"), hir_capture(1, hir_lit("b")),]) ); assert_eq!( t("(a)(?:b)(c)"), hir_cat(vec![ hir_capture(1, hir_lit("a")), hir_lit("b"), hir_capture(2, hir_lit("c")), ]) ); assert_eq!( t("(a)(?Pb)(c)"), hir_cat(vec![ hir_capture(1, hir_lit("a")), hir_capture_name(2, "foo", hir_lit("b")), hir_capture(3, hir_lit("c")), ]) ); assert_eq!(t("()"), hir_capture(1, Hir::empty())); assert_eq!(t("((?i))"), hir_capture(1, Hir::empty())); assert_eq!(t("((?x))"), hir_capture(1, Hir::empty())); assert_eq!( t("(((?x)))"), hir_capture(1, hir_capture(2, Hir::empty())) ); } #[test] fn line_anchors() { assert_eq!(t("^"), hir_look(hir::Look::Start)); assert_eq!(t("$"), hir_look(hir::Look::End)); assert_eq!(t(r"\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"\z"), hir_look(hir::Look::End)); assert_eq!(t(r"(?m)\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"(?m)\z"), hir_look(hir::Look::End)); assert_eq!(t("(?m)^"), hir_look(hir::Look::StartLF)); assert_eq!(t("(?m)$"), hir_look(hir::Look::EndLF)); assert_eq!(t(r"(?R)\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"(?R)\z"), hir_look(hir::Look::End)); assert_eq!(t("(?R)^"), hir_look(hir::Look::Start)); assert_eq!(t("(?R)$"), hir_look(hir::Look::End)); assert_eq!(t(r"(?Rm)\A"), hir_look(hir::Look::Start)); assert_eq!(t(r"(?Rm)\z"), hir_look(hir::Look::End)); assert_eq!(t("(?Rm)^"), hir_look(hir::Look::StartCRLF)); assert_eq!(t("(?Rm)$"), hir_look(hir::Look::EndCRLF)); } #[test] fn flags() { #[cfg(feature = "unicode-case")] assert_eq!( t("(?i:a)a"), hir_cat( vec![hir_uclass(&[('A', 'A'), ('a', 'a')]), hir_lit("a"),] ) ); assert_eq!( 
t("(?i-u:a)β"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("β"),
        ])
    );
    assert_eq!(
        t("(?:(?i-u)a)b"),
        hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("b"),
        ])
    );
    assert_eq!(
        t("((?i-u)a)b"),
        hir_cat(vec![
            hir_capture(1, hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
            hir_lit("b"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?-i:a)a"),
        hir_cat(
            vec![hir_lit("a"), hir_uclass(&[('A', 'A'), ('a', 'a')]),]
        )
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::StartLF),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?im)a^(?i-m)a^"),
        hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::StartLF),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_look(hir::Look::Start),
        ])
    );
    assert_eq!(
        t("(?U)a*a*?(?-U)a*a*?"),
        hir_cat(vec![
            hir_star(false, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(true, hir_lit("a")),
            hir_star(false, hir_lit("a")),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?:a(?i)a)a"),
        hir_cat(vec![
            hir_cat(vec![
                hir_lit("a"),
                hir_uclass(&[('A', 'A'), ('a', 'a')]),
            ]),
            hir_lit("a"),
        ])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)(?:a(?-i)a)a"),
        hir_cat(vec![
            hir_cat(vec![
                hir_uclass(&[('A', 'A'), ('a', 'a')]),
                hir_lit("a"),
            ]),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
        ])
    );
}

// Escaped meta characters translate to literals of the bare character.
#[test]
fn escape() {
    assert_eq!(
        t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
        hir_lit(r"\.+*?()|[]{}^$#")
    );
}

// Greedy and non-greedy repetition operators, including counted ranges.
#[test]
fn repetition() {
    assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
    assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
    assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
    assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
    assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
    assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));

    assert_eq!(t("a{1}"), hir_range(true, 1, Some(1), hir_lit("a"),));
    assert_eq!(t("a{1,}"), hir_range(true, 1, None, hir_lit("a"),));
    assert_eq!(t("a{1,2}"), hir_range(true, 1, Some(2), hir_lit("a"),));
    assert_eq!(t("a{1}?"), hir_range(false, 1, Some(1), hir_lit("a"),));
    assert_eq!(t("a{1,}?"), hir_range(false, 1, None, hir_lit("a"),));
    assert_eq!(t("a{1,2}?"), hir_range(false, 1, Some(2), hir_lit("a"),));

    assert_eq!(
        t("ab?"),
        hir_cat(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
    );
    assert_eq!(t("(ab)?"), hir_quest(true, hir_capture(1, hir_lit("ab"))));
    assert_eq!(
        t("a|b?"),
        hir_alt(vec![hir_lit("a"), hir_quest(true, hir_lit("b")),])
    );
}

// Concatenations and alternations of look-around assertions.
#[test]
fn cat_alt() {
    let a = || hir_look(hir::Look::Start);
    let b = || hir_look(hir::Look::End);
    let c = || hir_look(hir::Look::WordUnicode);
    let d = || hir_look(hir::Look::WordUnicodeNegate);

    assert_eq!(t("(^$)"), hir_capture(1, hir_cat(vec![a(), b()])));
    assert_eq!(t("^|$"), hir_alt(vec![a(), b()]));
    assert_eq!(t(r"^|$|\b"), hir_alt(vec![a(), b(), c()]));
    assert_eq!(
        t(r"^$|$\b|\b\B"),
        hir_alt(vec![
            hir_cat(vec![a(), b()]),
            hir_cat(vec![b(), c()]),
            hir_cat(vec![c(), d()]),
        ])
    );
    assert_eq!(t("(^|$)"), hir_capture(1, hir_alt(vec![a(), b()])));
    assert_eq!(
        t(r"(^|$|\b)"),
        hir_capture(1, hir_alt(vec![a(), b(), c()]))
    );
    assert_eq!(
        t(r"(^$|$\b|\b\B)"),
        hir_capture(
            1,
            hir_alt(vec![
                hir_cat(vec![a(), b()]),
                hir_cat(vec![b(), c()]),
                hir_cat(vec![c(), d()]),
            ])
        )
    );
    assert_eq!(
        t(r"(^$|($\b|(\b\B)))"),
        hir_capture(
            1,
            hir_alt(vec![
                hir_cat(vec![a(), b()]),
                hir_capture(
                    2,
                    hir_alt(vec![
                        hir_cat(vec![b(), c()]),
                        hir_capture(3, hir_cat(vec![c(), d()])),
                    ])
                ),
            ])
        )
    );
}

// Tests the HIR transformation of things like '[a-z]|[A-Z]' into
// '[A-Za-z]'. In other words, an alternation of just classes is always
// equivalent to a single class corresponding to the union of the branches
// in that class. (Unless some branches match invalid UTF-8 and others
// match non-ASCII Unicode.)
#[test]
fn cat_class_flattened() {
    assert_eq!(t(r"[a-z]|[A-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
    // Combining all of the letter properties should give us the one giant
    // letter property.
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x) \p{Lowercase_Letter} |\p{Uppercase_Letter} |\p{Titlecase_Letter} |\p{Modifier_Letter} |\p{Other_Letter} "),
        hir_uclass_query(ClassQuery::Binary("letter"))
    );
    // Byte classes that can truly match invalid UTF-8 cannot be combined
    // with Unicode classes.
    assert_eq!(
        t_bytes(r"[Δδ]|(?-u:[\x90-\xFF])|[Λλ]"),
        hir_alt(vec![
            hir_uclass(&[('Δ', 'Δ'), ('δ', 'δ')]),
            hir_bclass(&[(b'\x90', b'\xFF')]),
            hir_uclass(&[('Λ', 'Λ'), ('λ', 'λ')]),
        ])
    );
    // Byte classes on their own can be combined, even if some are ASCII
    // and others are invalid UTF-8.
    assert_eq!(
        t_bytes(r"[a-z]|(?-u:[\x90-\xFF])|[A-Z]"),
        hir_bclass(&[(b'A', b'Z'), (b'a', b'z'), (b'\x90', b'\xFF')]),
    );
}

// POSIX-style ASCII character classes, in Unicode and byte modes.
#[test]
fn class_ascii() {
    assert_eq!(
        t("[[:alnum:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Alnum)
    );
    assert_eq!(
        t("[[:alpha:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Alpha)
    );
    assert_eq!(
        t("[[:ascii:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Ascii)
    );
    assert_eq!(
        t("[[:blank:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Blank)
    );
    assert_eq!(
        t("[[:cntrl:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Cntrl)
    );
    assert_eq!(
        t("[[:digit:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Digit)
    );
    assert_eq!(
        t("[[:graph:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Graph)
    );
    assert_eq!(
        t("[[:lower:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Lower)
    );
    assert_eq!(
        t("[[:print:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Print)
    );
    assert_eq!(
        t("[[:punct:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Punct)
    );
    assert_eq!(
        t("[[:space:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Space)
    );
    assert_eq!(
        t("[[:upper:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Upper)
    );
    assert_eq!(
        t("[[:word:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Word)
    );
    assert_eq!(
        t("[[:xdigit:]]"),
        hir_ascii_uclass(&ast::ClassAsciiKind::Xdigit)
    );
    assert_eq!(
        t("[[:^lower:]]"),
        hir_negate(hir_ascii_uclass(&ast::ClassAsciiKind::Lower))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[[:lower:]]"),
        hir_uclass(&[
            ('A', 'Z'),
            ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ])
    );
    assert_eq!(
        t("(?-u)[[:lower:]]"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Lower)
    );
    assert_eq!(
        t("(?i-u)[[:lower:]]"),
        hir_case_fold(hir_ascii_bclass(&ast::ClassAsciiKind::Lower))
    );
    // Negating a byte class that matches non-ASCII would permit invalid
    // UTF-8, which the Unicode-mode translator rejects.
    assert_eq!(
        t_err("(?-u)[[:^lower:]]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(16, 1, 17)
            ),
        }
    );
    assert_eq!(
        t_err("(?i-u)[[:^lower:]]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(7, 1, 8),
                Position::new(17, 1, 18)
            ),
        }
    );
}

#[test]
fn class_ascii_multiple() {
    // See: https://github.com/rust-lang/regex/issues/680
    assert_eq!(
        t("[[:alnum:][:^ascii:]]"),
        hir_union(
            hir_ascii_uclass(&ast::ClassAsciiKind::Alnum),
            hir_uclass(&[('\u{80}', '\u{10FFFF}')]),
        ),
    );
    assert_eq!(
        t_bytes("(?-u)[[:alnum:][:^ascii:]]"),
        hir_union(
            hir_ascii_bclass(&ast::ClassAsciiKind::Alnum),
            hir_bclass(&[(0x80, 0xFF)]),
        ),
    );
}

// Perl classes (\d, \s, \w) in Unicode mode, plus their negations.
#[test]
#[cfg(feature = "unicode-perl")]
fn class_perl_unicode() {
    // Unicode
    assert_eq!(t(r"\d"), hir_uclass_query(ClassQuery::Binary("digit")));
    assert_eq!(t(r"\s"), hir_uclass_query(ClassQuery::Binary("space")));
    assert_eq!(t(r"\w"), hir_uclass_perl_word());
    // Case insensitivity is a no-op: these classes are already closed
    // under case folding.
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\d"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\s"),
        hir_uclass_query(ClassQuery::Binary("space"))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\w"), hir_uclass_perl_word());

    // Unicode, negated
    assert_eq!(
        t(r"\D"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    assert_eq!(
        t(r"\S"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
    );
    assert_eq!(t(r"\W"), hir_negate(hir_uclass_perl_word()));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\D"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\S"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("space")))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)\W"), hir_negate(hir_uclass_perl_word()));
}

// Perl classes with Unicode mode disabled fall back to their ASCII
// definitions.
#[test]
fn class_perl_ascii() {
    // ASCII only
    assert_eq!(
        t(r"(?-u)\d"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
    );
    assert_eq!(
        t(r"(?-u)\s"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Space)
    );
    assert_eq!(
        t(r"(?-u)\w"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Word)
    );
    assert_eq!(
        t(r"(?i-u)\d"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
    );
    assert_eq!(
        t(r"(?i-u)\s"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Space)
    );
    assert_eq!(
        t(r"(?i-u)\w"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Word)
    );

    // ASCII only, negated
    assert_eq!(
        t_bytes(r"(?-u)\D"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t_bytes(r"(?-u)\S"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
    );
    assert_eq!(
        t_bytes(r"(?-u)\W"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
    );
    assert_eq!(
        t_bytes(r"(?i-u)\D"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t_bytes(r"(?i-u)\S"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Space))
    );
    assert_eq!(
        t_bytes(r"(?i-u)\W"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
    );

    // ASCII only, negated, with UTF-8 mode enabled.
    // In this case, negating any Perl class results in an error because
    // all such classes can match invalid UTF-8.
    assert_eq!(
        t_err(r"(?-u)\D"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(7, 1, 8),
            ),
        },
    );
    assert_eq!(
        t_err(r"(?-u)\S"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(7, 1, 8),
            ),
        },
    );
    assert_eq!(
        t_err(r"(?-u)\W"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(7, 1, 8),
            ),
        },
    );
    assert_eq!(
        t_err(r"(?i-u)\D"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(8, 1, 9),
            ),
        },
    );
    assert_eq!(
        t_err(r"(?i-u)\S"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(8, 1, 9),
            ),
        },
    );
    assert_eq!(
        t_err(r"(?i-u)\W"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(8, 1, 9),
            ),
        },
    );
}

#[test]
#[cfg(not(feature = "unicode-perl"))]
fn class_perl_word_disabled() {
    assert_eq!(
        t_err(r"\w"),
        TestError {
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(2, 1, 3)
            ),
        }
    );
}

#[test]
#[cfg(all(not(feature = "unicode-perl"), not(feature = "unicode-bool")))]
fn class_perl_space_disabled() {
    assert_eq!(
        t_err(r"\s"),
        TestError {
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(2, 1, 3)
            ),
        }
    );
}

#[test]
#[cfg(all(
    not(feature = "unicode-perl"),
    not(feature = "unicode-gencat")
))]
fn class_perl_digit_disabled() {
    assert_eq!(
        t_err(r"\d"),
        TestError {
            kind: hir::ErrorKind::UnicodePerlClassNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(2, 1, 3)
            ),
        }
    );
}

// \p / \P queries against Unicode general categories, including loose
// name matching and the gc: / gc= forms.
#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_gencat() {
    assert_eq!(t(r"\pZ"), hir_uclass_query(ClassQuery::Binary("Z")));
    assert_eq!(t(r"\pz"), hir_uclass_query(ClassQuery::Binary("Z")));
    assert_eq!(
        t(r"\p{Separator}"),
        hir_uclass_query(ClassQuery::Binary("Z"))
    );
    // Property names use loose matching: case and whitespace are ignored.
    assert_eq!(
        t(r"\p{se PaRa ToR}"),
        hir_uclass_query(ClassQuery::Binary("Z"))
    );
    assert_eq!(
        t(r"\p{gc:Separator}"),
        hir_uclass_query(ClassQuery::Binary("Z"))
    );
    assert_eq!(
        t(r"\p{gc=Separator}"),
        hir_uclass_query(ClassQuery::Binary("Z"))
    );
    assert_eq!(
        t(r"\p{Other}"),
        hir_uclass_query(ClassQuery::Binary("Other"))
    );
    assert_eq!(t(r"\pC"), hir_uclass_query(ClassQuery::Binary("Other")));

    assert_eq!(
        t(r"\PZ"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
    );
    assert_eq!(
        t(r"\P{separator}"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
    );
    assert_eq!(
        t(r"\P{gc!=separator}"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("Z")))
    );

    assert_eq!(t(r"\p{any}"), hir_uclass_query(ClassQuery::Binary("Any")));
    assert_eq!(
        t(r"\p{assigned}"),
        hir_uclass_query(ClassQuery::Binary("Assigned"))
    );
    assert_eq!(
        t(r"\p{ascii}"),
        hir_uclass_query(ClassQuery::Binary("ASCII"))
    );
    assert_eq!(
        t(r"\p{gc:any}"),
        hir_uclass_query(ClassQuery::Binary("Any"))
    );
    assert_eq!(
        t(r"\p{gc:assigned}"),
        hir_uclass_query(ClassQuery::Binary("Assigned"))
    );
    assert_eq!(
        t(r"\p{gc:ascii}"),
        hir_uclass_query(ClassQuery::Binary("ASCII"))
    );

    // Unicode properties are not allowed when Unicode mode is disabled.
    assert_eq!(
        t_err(r"(?-u)\pZ"),
        TestError {
            kind: hir::ErrorKind::UnicodeNotAllowed,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(8, 1, 9)
            ),
        }
    );
    assert_eq!(
        t_err(r"(?-u)\p{Separator}"),
        TestError {
            kind: hir::ErrorKind::UnicodeNotAllowed,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(18, 1, 19)
            ),
        }
    );
    assert_eq!(
        t_err(r"\pE"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(3, 1, 4)
            ),
        }
    );
    assert_eq!(
        t_err(r"\p{Foo}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(7, 1, 8)
            ),
        }
    );
    assert_eq!(
        t_err(r"\p{gc:Foo}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(10, 1, 11)
            ),
        }
    );
}

#[test]
#[cfg(not(feature = "unicode-gencat"))]
fn class_unicode_gencat_disabled() {
    assert_eq!(
        t_err(r"\p{Separator}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(13, 1, 14)
            ),
        }
    );
    assert_eq!(
        t_err(r"\p{Any}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(7, 1, 8)
            ),
        }
    );
}

#[test]
#[cfg(feature = "unicode-script")]
fn class_unicode_script() {
    assert_eq!(
        t(r"\p{Greek}"),
        hir_uclass_query(ClassQuery::Binary("Greek"))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\p{Greek}"),
        hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek")))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)\P{Greek}"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "Greek"
        ))))
    );

    assert_eq!(
        t_err(r"\p{sc:Foo}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(10, 1, 11)
            ),
        }
    );
    assert_eq!(
        t_err(r"\p{scx:Foo}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(11, 1, 12)
            ),
        }
    );
}

#[test]
#[cfg(not(feature = "unicode-script"))]
fn class_unicode_script_disabled() {
    assert_eq!(
        t_err(r"\p{Greek}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(9, 1, 10)
            ),
        }
    );
    assert_eq!(
        t_err(r"\p{scx:Greek}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(13, 1, 14)
            ),
        }
    );
}

#[test]
#[cfg(feature = "unicode-age")]
fn class_unicode_age() {
    assert_eq!(
        t_err(r"\p{age:Foo}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyValueNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(11, 1, 12)
            ),
        }
    );
}

#[test]
#[cfg(feature = "unicode-gencat")]
fn class_unicode_any_empty() {
    // The negation of "any codepoint" is the empty class.
    assert_eq!(t(r"\P{any}"), hir_uclass(&[]),);
}

#[test]
#[cfg(not(feature = "unicode-age"))]
fn class_unicode_age_disabled() {
    assert_eq!(
        t_err(r"\p{age:3.0}"),
        TestError {
            kind: hir::ErrorKind::UnicodePropertyNotFound,
            span: Span::new(
                Position::new(0, 1, 1),
                Position::new(11, 1, 12)
            ),
        }
    );
}

// Bracketed character classes, including range coalescing, escapes and
// negation in both Unicode and byte modes.
#[test]
fn class_bracketed() {
    assert_eq!(t("[a]"), hir_lit("a"));
    assert_eq!(t("[ab]"), hir_uclass(&[('a', 'b')]));
    assert_eq!(t("[^[a]]"), class_negate(uclass(&[('a', 'a')])));
    assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
    // Overlapping and adjacent ranges coalesce into one.
    assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
    assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
    assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
    assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
    assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(t(r"[\d]"), hir_uclass_query(ClassQuery::Binary("digit")));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\pZ]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\p{separator}]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    // Double negation cancels out.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(t(r"[^\D]"), hir_uclass_query(ClassQuery::Binary("digit")));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\PZ]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\P{separator}]"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );
    #[cfg(all(
        feature = "unicode-case",
        any(feature = "unicode-perl", feature = "unicode-gencat")
    ))]
    assert_eq!(
        t(r"(?i)[^\D]"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[^\P{greek}]"),
        hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek")))
    );

    assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
    assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
    assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));

    #[cfg(feature = "unicode-case")]
    assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[k]"),
        hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),])
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[β]"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),])
    );
    assert_eq!(t("(?i-u)[k]"), hir_bclass(&[(b'K', b'K'), (b'k', b'k'),]));

    assert_eq!(t("[^a]"), class_negate(uclass(&[('a', 'a')])));
    assert_eq!(t(r"[^\x00]"), class_negate(uclass(&[('\0', '\0')])));
    assert_eq!(
        t_bytes("(?-u)[^a]"),
        class_negate(bclass(&[(b'a', b'a')]))
    );
    #[cfg(any(feature = "unicode-perl", feature = "unicode-gencat"))]
    assert_eq!(
        t(r"[^\d]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\pZ]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
    );
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[^\p{separator}]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("separator")))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[^\p{greek}]"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "greek"
        ))))
    );
    #[cfg(all(feature = "unicode-case", feature = "unicode-script"))]
    assert_eq!(
        t(r"(?i)[\P{greek}]"),
        hir_negate(hir_case_fold(hir_uclass_query(ClassQuery::Binary(
            "greek"
        ))))
    );

    // Test some weird cases.
    assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));

    assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
    assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));

    assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
    assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
    assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));

    assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
    assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));

    assert_eq!(
        t_err("(?-u)[^a]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(
                Position::new(5, 1, 6),
                Position::new(9, 1, 10)
            ),
        }
    );
    // A class and its complement union to everything, so the negation of
    // that union is empty.
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    assert_eq!(t(r"[^\s\S]"), hir_uclass(&[]),);
    #[cfg(any(feature = "unicode-perl", feature = "unicode-bool"))]
    assert_eq!(t_bytes(r"(?-u)[^\s\S]"), hir_bclass(&[]),);
}

// Unions of classes inside a single bracketed class.
#[test]
fn class_bracketed_union() {
    assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[a\pZb]"),
        hir_union(
            hir_uclass(&[('a', 'b')]),
            hir_uclass_query(ClassQuery::Binary("separator"))
        )
    );
    #[cfg(all(feature = "unicode-gencat", feature = "unicode-script"))]
    assert_eq!(
        t(r"[\pZ\p{Greek}]"),
        hir_union(
            hir_uclass_query(ClassQuery::Binary("greek")),
            hir_uclass_query(ClassQuery::Binary("separator"))
        )
    );
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"[\p{age:3.0}\pZ\p{Greek}]"),
        hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        )
    );
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
        hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("cyrillic")),
                hir_union(
                    hir_uclass_query(ClassQuery::Binary("greek")),
                    hir_uclass_query(ClassQuery::Binary("separator"))
                )
            )
        )
    );
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-case",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
        hir_case_fold(hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        ))
    );
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
        hir_negate(hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        ))
    );
    #[cfg(all(
        feature = "unicode-age",
        feature = "unicode-case",
        feature = "unicode-gencat",
        feature = "unicode-script"
    ))]
    assert_eq!(
        t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
        hir_negate(hir_case_fold(hir_union(
            hir_uclass_query(ClassQuery::ByValue {
                property_name: "age",
                property_value: "3.0",
            }),
            hir_union(
                hir_uclass_query(ClassQuery::Binary("greek")),
                hir_uclass_query(ClassQuery::Binary("separator"))
            )
        )))
    );
}

// Nested classes: an inner negated class dominates the union.
#[test]
fn class_bracketed_nested() {
    assert_eq!(t(r"[a[^c]]"), class_negate(uclass(&[('c', 'c')])));
    assert_eq!(t(r"[a-b[^c]]"), class_negate(uclass(&[('c', 'c')])));
    assert_eq!(t(r"[a-c[^c]]"), class_negate(uclass(&[])));

    assert_eq!(t(r"[^a[^c]]"), hir_uclass(&[('c', 'c')]));
    assert_eq!(t(r"[^a-b[^c]]"), hir_uclass(&[('c', 'c')]));

    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)[a[^c]]"),
        hir_negate(class_case_fold(uclass(&[('c', 'c')])))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)[a-b[^c]]"),
        hir_negate(class_case_fold(uclass(&[('c', 'c')])))
    );

    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)[^a[^c]]"), hir_uclass(&[('C', 'C'), ('c', 'c')]));
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t(r"(?i)[^a-b[^c]]"),
        hir_uclass(&[('C', 'C'), ('c', 'c')])
    );

    assert_eq!(t(r"[^a-c[^c]]"), hir_uclass(&[]),);
    #[cfg(feature = "unicode-case")]
    assert_eq!(t(r"(?i)[^a-c[^c]]"), hir_uclass(&[]),);
}

// The `&&` set-intersection operator inside bracketed classes.
#[test]
fn class_bracketed_intersect() {
    assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
    assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
    assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
    assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
    assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
    assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
    assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
    assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));

    assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
    assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
    assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
    assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
    assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
    assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));

    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[abc&&b-c]"),
        hir_case_fold(hir_uclass(&[('b', 'c')]))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[abc&&[b-c]]"),
        hir_case_fold(hir_uclass(&[('b', 'c')]))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[[abc]&&[b-c]]"),
        hir_case_fold(hir_uclass(&[('b', 'c')]))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[a-z&&b-y&&c-x]"),
        hir_case_fold(hir_uclass(&[('c', 'x')]))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[c-da-b&&a-d]"),
        hir_case_fold(hir_uclass(&[('a', 'd')]))
    );
    #[cfg(feature = "unicode-case")]
    assert_eq!(
        t("(?i)[a-d&&c-da-b]"),
        hir_case_fold(hir_uclass(&[('a', 'd')]))
    );

    assert_eq!(
        t("(?i-u)[abc&&b-c]"),
        hir_case_fold(hir_bclass(&[(b'b', b'c')]))
    );
    assert_eq!(
        t("(?i-u)[abc&&[b-c]]"),
        hir_case_fold(hir_bclass(&[(b'b', b'c')]))
    );
    assert_eq!(
        t("(?i-u)[[abc]&&[b-c]]"),
        hir_case_fold(hir_bclass(&[(b'b', b'c')]))
    );
    assert_eq!(
        t("(?i-u)[a-z&&b-y&&c-x]"),
        hir_case_fold(hir_bclass(&[(b'c', b'x')]))
    );
    assert_eq!(
        t("(?i-u)[c-da-b&&a-d]"),
        hir_case_fold(hir_bclass(&[(b'a', b'd')]))
    );
    assert_eq!(
        t("(?i-u)[a-d&&c-da-b]"),
        hir_case_fold(hir_bclass(&[(b'a', b'd')]))
    );

    // In `[a^]`, `^` does not need to be escaped, so it makes sense that
    // `^` is also allowed to be unescaped after `&&`.
    assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
    // `]` needs to be escaped after `&&` since it's not at start of class.
    assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
    assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
    assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
    assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
    // Test precedence.
    assert_eq!(
        t(r"[a-w&&[^c-g]z]"),
        hir_uclass(&[('a', 'b'), ('h', 'w')])
    );
}

// Intersections combined with negation, including De Morgan-style cases.
#[test]
fn class_bracketed_intersect_negate() {
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^\w&&\d]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^[\w&&\d]]"),
        hir_negate(hir_uclass_query(ClassQuery::Binary("digit")))
    );
    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t(r"[^[^\w&&\d]]"),
        hir_uclass_query(ClassQuery::Binary("digit"))
    );
    #[cfg(feature = "unicode-perl")]
    assert_eq!(t(r"[[[^\w]&&[^\d]]]"), hir_negate(hir_uclass_perl_word()));

    #[cfg(feature = "unicode-perl")]
    assert_eq!(
        t_bytes(r"(?-u)[^\w&&\d]"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[a-z&&a-c]]"),
        hir_negate(hir_bclass(&[(b'a', b'c')]))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[\w&&\d]]"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Digit))
    );
    assert_eq!(
        t_bytes(r"(?-u)[^[^\w&&\d]]"),
        hir_ascii_bclass(&ast::ClassAsciiKind::Digit)
    );
    assert_eq!(
        t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
        hir_negate(hir_ascii_bclass(&ast::ClassAsciiKind::Word))
    );
}

// The `--` set-difference operator inside bracketed classes.
#[test]
fn class_bracketed_difference() {
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"[\pL--[:ascii:]]"),
        hir_difference(
            hir_uclass_query(ClassQuery::Binary("letter")),
            hir_uclass(&[('\0', '\x7F')])
        )
    );
    assert_eq!(
        t(r"(?-u)[[:alpha:]--[:lower:]]"),
        hir_bclass(&[(b'A', b'Z')])
    );
}

// The `~~` symmetric-difference operator inside bracketed classes.
#[test]
fn class_bracketed_symmetric_difference() {
    #[cfg(feature = "unicode-script")]
    assert_eq!(
        t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
        hir_uclass(&[
            ('\u{0342}', '\u{0342}'),
            ('\u{0345}', '\u{0345}'),
            ('\u{1DC0}', '\u{1DC1}'),
        ])
    );
    assert_eq!(t(r"[a-g~~c-j]"), hir_uclass(&[('a', 'b'), ('h', 'j')]));
    assert_eq!(
        t(r"(?-u)[a-g~~c-j]"),
        hir_bclass(&[(b'a', b'b'), (b'h', b'j')])
    );
}

// Extended mode `(?x)`: whitespace in the pattern is ignored and `#`
// starts a comment that runs to the end of the line, so the multi-line
// patterns below must keep their embedded newlines.
#[test]
fn ignore_whitespace() {
    assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
    assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
    assert_eq!(
        t(r"(?x)\x # comment
{ # comment
53 # comment
} #comment"),
        hir_lit("S")
    );

    assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
    assert_eq!(
        t(r"(?x)\x # comment
53 # comment"),
        hir_lit("S")
    );

    assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));
    #[cfg(feature = "unicode-gencat")]
    assert_eq!(
        t(r"(?x)\p # comment
{ # comment
Separator # comment
} # comment"),
        hir_uclass_query(ClassQuery::Binary("separator"))
    );

    assert_eq!(
        t(r"(?x)a # comment
{ # comment
5 # comment
, # comment
10 # comment
} # comment"),
        hir_range(true, 5, Some(10), hir_lit("a"))
    );

    assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
}

#[test]
fn analysis_is_utf8() {
    // Positive examples.
    assert!(props_bytes(r"a").is_utf8());
    assert!(props_bytes(r"ab").is_utf8());
    assert!(props_bytes(r"(?-u)a").is_utf8());
    assert!(props_bytes(r"(?-u)ab").is_utf8());
    assert!(props_bytes(r"\xFF").is_utf8());
    assert!(props_bytes(r"\xFF\xFF").is_utf8());
    assert!(props_bytes(r"[^a]").is_utf8());
    assert!(props_bytes(r"[^a][^a]").is_utf8());
    assert!(props_bytes(r"\b").is_utf8());
    assert!(props_bytes(r"\B").is_utf8());
    assert!(props_bytes(r"(?-u)\b").is_utf8());
    assert!(props_bytes(r"(?-u)\B").is_utf8());

    // Negative examples.
    assert!(!props_bytes(r"(?-u)\xFF").is_utf8());
    assert!(!props_bytes(r"(?-u)\xFF\xFF").is_utf8());
    assert!(!props_bytes(r"(?-u)[^a]").is_utf8());
    assert!(!props_bytes(r"(?-u)[^a][^a]").is_utf8());
}

#[test]
fn analysis_captures_len() {
    assert_eq!(0, props(r"a").explicit_captures_len());
    assert_eq!(0, props(r"(?:a)").explicit_captures_len());
    assert_eq!(0, props(r"(?i-u:a)").explicit_captures_len());
    assert_eq!(0, props(r"(?i-u)a").explicit_captures_len());
    assert_eq!(1, props(r"(a)").explicit_captures_len());
    assert_eq!(1, props(r"(?P<a>a)").explicit_captures_len());
    assert_eq!(1, props(r"()").explicit_captures_len());
    assert_eq!(1, props(r"()a").explicit_captures_len());
    assert_eq!(1, props(r"(a)+").explicit_captures_len());
    assert_eq!(2, props(r"(a)(b)").explicit_captures_len());
    assert_eq!(2, props(r"(a)|(b)").explicit_captures_len());
    assert_eq!(2, props(r"((a))").explicit_captures_len());
    assert_eq!(1, props(r"([a&&b])").explicit_captures_len());
}

#[test]
fn analysis_static_captures_len() {
    let len = |pattern| props(pattern).static_explicit_captures_len();
    assert_eq!(Some(0), len(r""));
    assert_eq!(Some(0), len(r"foo|bar"));
    assert_eq!(None, len(r"(foo)|bar"));
    assert_eq!(None, len(r"foo|(bar)"));
    assert_eq!(Some(1), len(r"(foo|bar)"));
    assert_eq!(Some(1), len(r"(a|b|c|d|e|f)"));
    assert_eq!(Some(1), len(r"(a)|(b)|(c)|(d)|(e)|(f)"));
    assert_eq!(Some(2), len(r"(a)(b)|(c)(d)|(e)(f)"));
    assert_eq!(Some(6), len(r"(a)(b)(c)(d)(e)(f)"));
    assert_eq!(Some(3), len(r"(a)(b)(extra)|(a)(b)()"));
    assert_eq!(Some(3), len(r"(a)(b)((?:extra)?)"));
    assert_eq!(None, len(r"(a)(b)(extra)?"));
    assert_eq!(Some(1), len(r"(foo)|(bar)"));
    assert_eq!(Some(2), len(r"(foo)(bar)"));
    assert_eq!(Some(2), len(r"(foo)+(bar)"));
    assert_eq!(None, len(r"(foo)*(bar)"));
    assert_eq!(Some(0), len(r"(foo)?{0}"));
    assert_eq!(None, len(r"(foo)?{1}"));
    assert_eq!(Some(1), len(r"(foo){1}"));
    assert_eq!(Some(1), len(r"(foo){1,}"));
    assert_eq!(Some(1), len(r"(foo){1,}?"));
    assert_eq!(None, len(r"(foo){1,}??"));
    assert_eq!(None, len(r"(foo){0,}"));
    assert_eq!(Some(1), len(r"(foo)(?:bar)"));
    assert_eq!(Some(2), len(r"(foo(?:bar)+)(?:baz(boo))"));
    assert_eq!(Some(2), len(r"(?P<foo>foo)(?:bar)(bal|loon)"));
    assert_eq!(
        Some(2),
        len(r#"<(a)[^>]+href="([^"]+)"|<(img)[^>]+src="([^"]+)""#)
    );
}

#[test]
fn analysis_is_all_assertions() {
    // Positive examples.
    let p = props(r"\b");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"\B");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"^");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"$");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"\A");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"\z");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"$^\z\A\b\B");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"$|^|\z|\A|\b|\B");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"^$|$^");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    let p = props(r"((\b)+())*^");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(0));

    // Negative examples.
    let p = props(r"^a");
    assert!(!p.look_set().is_empty());
    assert_eq!(p.minimum_len(), Some(1));
}

#[test]
fn analysis_look_set_prefix_any() {
    let p = props(r"(?-u)(?i:(?:\b|_)win(?:32|64|dows)?(?:\b|_))");
    assert!(p.look_set_prefix_any().contains(Look::WordAscii));
}

#[test]
fn analysis_is_anchored() {
    let is_start = |p| props(p).look_set_prefix().contains(Look::Start);
    let is_end = |p| props(p).look_set_suffix().contains(Look::End);

    // Positive examples.
    assert!(is_start(r"^"));
    assert!(is_end(r"$"));

    assert!(is_start(r"^^"));
    assert!(props(r"$$").look_set_suffix().contains(Look::End));

    assert!(is_start(r"^$"));
    assert!(is_end(r"^$"));

    assert!(is_start(r"^foo"));
    assert!(is_end(r"foo$"));

    assert!(is_start(r"^foo|^bar"));
    assert!(is_end(r"foo$|bar$"));

    assert!(is_start(r"^(foo|bar)"));
    assert!(is_end(r"(foo|bar)$"));

    assert!(is_start(r"^+"));
    assert!(is_end(r"$+"));
    assert!(is_start(r"^++"));
    assert!(is_end(r"$++"));
    assert!(is_start(r"(^)+"));
    assert!(is_end(r"($)+"));

    assert!(is_start(r"$^"));
    assert!(is_start(r"$^"));
    assert!(is_start(r"$^|^$"));
    assert!(is_end(r"$^|^$"));

    assert!(is_start(r"\b^"));
    assert!(is_end(r"$\b"));
    assert!(is_start(r"^(?m:^)"));
    assert!(is_end(r"(?m:$)$"));
    assert!(is_start(r"(?m:^)^"));
    assert!(is_end(r"$(?m:$)"));

    // Negative examples.
    assert!(!is_start(r"(?m)^"));
    assert!(!is_end(r"(?m)$"));
    assert!(!is_start(r"(?m:^$)|$^"));
    assert!(!is_end(r"(?m:^$)|$^"));
    assert!(!is_start(r"$^|(?m:^$)"));
    assert!(!is_end(r"$^|(?m:^$)"));

    assert!(!is_start(r"a^"));
    assert!(!is_start(r"$a"));

    assert!(!is_end(r"a^"));
    assert!(!is_end(r"$a"));

    assert!(!is_start(r"^foo|bar"));
    assert!(!is_end(r"foo|bar$"));

    assert!(!is_start(r"^*"));
    assert!(!is_end(r"$*"));
    assert!(!is_start(r"^*+"));
    assert!(!is_end(r"$*+"));
    assert!(!is_start(r"^+*"));
    assert!(!is_end(r"$+*"));
    assert!(!is_start(r"(^)*"));
    assert!(!is_end(r"($)*"));
}

#[test]
fn analysis_is_any_anchored() {
    let is_start = |p| props(p).look_set().contains(Look::Start);
    let is_end = |p| props(p).look_set().contains(Look::End);

    // Positive examples.
    assert!(is_start(r"^"));
    assert!(is_end(r"$"));
    assert!(is_start(r"\A"));
    assert!(is_end(r"\z"));

    // Negative examples.
    assert!(!is_start(r"(?m)^"));
    assert!(!is_end(r"(?m)$"));
    assert!(!is_start(r"$"));
    assert!(!is_end(r"^"));
}

#[test]
fn analysis_can_empty() {
    // Positive examples.
    let assert_empty =
        |p| assert_eq!(Some(0), props_bytes(p).minimum_len());
    assert_empty(r"");
    assert_empty(r"()");
    assert_empty(r"()*");
    assert_empty(r"()+");
    assert_empty(r"()?");
    assert_empty(r"a*");
    assert_empty(r"a?");
    assert_empty(r"a{0}");
    assert_empty(r"a{0,}");
    assert_empty(r"a{0,1}");
    assert_empty(r"a{0,10}");
    #[cfg(feature = "unicode-gencat")]
    assert_empty(r"\pL*");
    assert_empty(r"a*|b");
    assert_empty(r"b|a*");
    assert_empty(r"a|");
    assert_empty(r"|a");
    assert_empty(r"a||b");
    assert_empty(r"a*a?(abcd)*");
    assert_empty(r"^");
    assert_empty(r"$");
    assert_empty(r"(?m)^");
    assert_empty(r"(?m)$");
    assert_empty(r"\A");
    assert_empty(r"\z");
    assert_empty(r"\B");
    assert_empty(r"(?-u)\B");
    assert_empty(r"\b");
    assert_empty(r"(?-u)\b");

    // Negative examples.
    let assert_non_empty =
        |p| assert_ne!(Some(0), props_bytes(p).minimum_len());
    assert_non_empty(r"a+");
    assert_non_empty(r"a{1}");
    assert_non_empty(r"a{1,}");
    assert_non_empty(r"a{1,2}");
    assert_non_empty(r"a{1,10}");
    assert_non_empty(r"b|a");
    assert_non_empty(r"a*a+(abcd)*");
    #[cfg(feature = "unicode-gencat")]
    assert_non_empty(r"\P{any}");
    assert_non_empty(r"[a--a]");
    assert_non_empty(r"[a&&b]");
}

#[test]
fn analysis_is_literal() {
    // Positive examples.
    assert!(props(r"a").is_literal());
    assert!(props(r"ab").is_literal());
    assert!(props(r"abc").is_literal());
    assert!(props(r"(?m)abc").is_literal());
    assert!(props(r"(?:a)").is_literal());
    assert!(props(r"foo(?:a)").is_literal());
    assert!(props(r"(?:a)foo").is_literal());
    assert!(props(r"[a]").is_literal());

    // Negative examples.
    assert!(!props(r"").is_literal());
    assert!(!props(r"^").is_literal());
    assert!(!props(r"a|b").is_literal());
    assert!(!props(r"(a)").is_literal());
    assert!(!props(r"a+").is_literal());
    assert!(!props(r"foo(a)").is_literal());
    assert!(!props(r"(a)foo").is_literal());
    assert!(!props(r"[ab]").is_literal());
}

#[test]
fn analysis_is_alternation_literal() {
    // Positive examples.
    assert!(props(r"a").is_alternation_literal());
    assert!(props(r"ab").is_alternation_literal());
    assert!(props(r"abc").is_alternation_literal());
    assert!(props(r"(?m)abc").is_alternation_literal());
    assert!(props(r"foo|bar").is_alternation_literal());
    assert!(props(r"foo|bar|baz").is_alternation_literal());
    assert!(props(r"[a]").is_alternation_literal());
    assert!(props(r"(?:ab)|cd").is_alternation_literal());
    assert!(props(r"ab|(?:cd)").is_alternation_literal());

    // Negative examples.
    assert!(!props(r"").is_alternation_literal());
    assert!(!props(r"^").is_alternation_literal());
    assert!(!props(r"(a)").is_alternation_literal());
    assert!(!props(r"a+").is_alternation_literal());
    assert!(!props(r"foo(a)").is_alternation_literal());
    assert!(!props(r"(a)foo").is_alternation_literal());
    assert!(!props(r"[ab]").is_alternation_literal());
    assert!(!props(r"[ab]|b").is_alternation_literal());
    assert!(!props(r"a|[ab]").is_alternation_literal());
    assert!(!props(r"(a)|b").is_alternation_literal());
    assert!(!props(r"a|(b)").is_alternation_literal());
    assert!(!props(r"a|b").is_alternation_literal());
    assert!(!props(r"a|b|c").is_alternation_literal());
    assert!(!props(r"[a]|b").is_alternation_literal());
    assert!(!props(r"a|[b]").is_alternation_literal());
    assert!(!props(r"(?:a)|b").is_alternation_literal());
    assert!(!props(r"a|(?:b)").is_alternation_literal());
    assert!(!props(r"(?:z|xx)@|xx").is_alternation_literal());
}

// This tests that the smart Hir::repetition constructors does some basic
// simplifications.
#[test]
fn smart_repetition() {
    assert_eq!(t(r"a{0}"), Hir::empty());
    assert_eq!(t(r"a{1}"), hir_lit("a"));
    assert_eq!(t(r"\B{32111}"), hir_look(hir::Look::WordUnicodeNegate));
}

// This tests that the smart Hir::concat constructor simplifies the given
// exprs in a way we expect.
#[test] fn smart_concat() { assert_eq!(t(""), Hir::empty()); assert_eq!(t("(?:)"), Hir::empty()); assert_eq!(t("abc"), hir_lit("abc")); assert_eq!(t("(?:foo)(?:bar)"), hir_lit("foobar")); assert_eq!(t("quux(?:foo)(?:bar)baz"), hir_lit("quuxfoobarbaz")); assert_eq!( t("foo(?:bar^baz)quux"), hir_cat(vec![ hir_lit("foobar"), hir_look(hir::Look::Start), hir_lit("bazquux"), ]) ); assert_eq!( t("foo(?:ba(?:r^b)az)quux"), hir_cat(vec![ hir_lit("foobar"), hir_look(hir::Look::Start), hir_lit("bazquux"), ]) ); } // This tests that the smart Hir::alternation constructor simplifies the // given exprs in a way we expect. #[test] fn smart_alternation() { assert_eq!( t("(?:foo)|(?:bar)"), hir_alt(vec![hir_lit("foo"), hir_lit("bar")]) ); assert_eq!( t("quux|(?:abc|def|xyz)|baz"), hir_alt(vec![ hir_lit("quux"), hir_lit("abc"), hir_lit("def"), hir_lit("xyz"), hir_lit("baz"), ]) ); assert_eq!( t("quux|(?:abc|(?:def|mno)|xyz)|baz"), hir_alt(vec![ hir_lit("quux"), hir_lit("abc"), hir_lit("def"), hir_lit("mno"), hir_lit("xyz"), hir_lit("baz"), ]) ); assert_eq!( t("a|b|c|d|e|f|x|y|z"), hir_uclass(&[('a', 'f'), ('x', 'z')]), ); // Tests that we lift common prefixes out of an alternation. 
assert_eq!( t("[A-Z]foo|[A-Z]quux"), hir_cat(vec![ hir_uclass(&[('A', 'Z')]), hir_alt(vec![hir_lit("foo"), hir_lit("quux")]), ]), ); assert_eq!( t("[A-Z][A-Z]|[A-Z]quux"), hir_cat(vec![ hir_uclass(&[('A', 'Z')]), hir_alt(vec![hir_uclass(&[('A', 'Z')]), hir_lit("quux")]), ]), ); assert_eq!( t("[A-Z][A-Z]|[A-Z][A-Z]quux"), hir_cat(vec![ hir_uclass(&[('A', 'Z')]), hir_uclass(&[('A', 'Z')]), hir_alt(vec![Hir::empty(), hir_lit("quux")]), ]), ); assert_eq!( t("[A-Z]foo|[A-Z]foobar"), hir_cat(vec![ hir_uclass(&[('A', 'Z')]), hir_alt(vec![hir_lit("foo"), hir_lit("foobar")]), ]), ); } #[test] fn regression_alt_empty_concat() { use crate::ast::{self, Ast}; let span = Span::splat(Position::new(0, 0, 0)); let ast = Ast::alternation(ast::Alternation { span, asts: vec![Ast::concat(ast::Concat { span, asts: vec![] })], }); let mut t = Translator::new(); assert_eq!(Ok(Hir::empty()), t.translate("", &ast)); } #[test] fn regression_empty_alt() { use crate::ast::{self, Ast}; let span = Span::splat(Position::new(0, 0, 0)); let ast = Ast::concat(ast::Concat { span, asts: vec![Ast::alternation(ast::Alternation { span, asts: vec![], })], }); let mut t = Translator::new(); assert_eq!(Ok(Hir::fail()), t.translate("", &ast)); } #[test] fn regression_singleton_alt() { use crate::{ ast::{self, Ast}, hir::Dot, }; let span = Span::splat(Position::new(0, 0, 0)); let ast = Ast::concat(ast::Concat { span, asts: vec![Ast::alternation(ast::Alternation { span, asts: vec![Ast::dot(span)], })], }); let mut t = Translator::new(); assert_eq!(Ok(Hir::dot(Dot::AnyCharExceptLF)), t.translate("", &ast)); } // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63168 #[test] fn regression_fuzz_match() { let pat = "[(\u{6} \0-\u{afdf5}] \0 "; let ast = ParserBuilder::new() .octal(false) .ignore_whitespace(true) .build() .parse(pat) .unwrap(); let hir = TranslatorBuilder::new() .utf8(true) .case_insensitive(false) .multi_line(false) .dot_matches_new_line(false) .swap_greed(true) .unicode(true) .build() 
.translate(pat, &ast) .unwrap(); assert_eq!( hir, Hir::concat(vec![ hir_uclass(&[('\0', '\u{afdf5}')]), hir_lit("\0"), ]) ); } // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63155 #[cfg(feature = "unicode")] #[test] fn regression_fuzz_difference1() { let pat = r"\W\W|\W[^\v--\W\W\P{Script_Extensions:Pau_Cin_Hau}\u10A1A1-\U{3E3E3}--~~~~--~~~~~~~~------~~~~~~--~~~~~~]*"; let _ = t(pat); // shouldn't panic } // See: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=63153 #[test] fn regression_fuzz_char_decrement1() { let pat = "w[w[^w?\rw\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\r\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0*\0\0\u{1}\0]\0\0-*\0][^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w?\rw[^w\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0\0\0\0\0\0\0\0x\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\0\0*??\0\u{7f}{2}\u{10}??\0\0\0\0\0\0\0\0\0\u{3}\0\0\0}\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\u{1}\0]\0\0-*\0]\0\0\0\0\0\0\0\u{1}\0]\0\u{1}\u{1}H-i]-]\0\0\0\0\u{1}\0]\0\0\0\u{1}\0]\0\0-*\0\0\0\0\u{1}9-\u{7f}]\0'|-\u{7f}]\0'|(?i-ux)[-\u{7f}]\0'\u{3}\0\0\0}\0-*\0] Result; /// This method is called before beginning traversal of the HIR. fn start(&mut self) {} /// This method is called on an `Hir` before descending into child `Hir` /// nodes. fn visit_pre(&mut self, _hir: &Hir) -> Result<(), Self::Err> { Ok(()) } /// This method is called on an `Hir` after descending all of its child /// `Hir` nodes. fn visit_post(&mut self, _hir: &Hir) -> Result<(), Self::Err> { Ok(()) } /// This method is called between child nodes of an alternation. fn visit_alternation_in(&mut self) -> Result<(), Self::Err> { Ok(()) } /// This method is called between child nodes of a concatenation. 
fn visit_concat_in(&mut self) -> Result<(), Self::Err> { Ok(()) } } /// Executes an implementation of `Visitor` in constant stack space. /// /// This function will visit every node in the given `Hir` while calling /// appropriate methods provided by the [`Visitor`] trait. /// /// The primary use case for this method is when one wants to perform case /// analysis over an `Hir` without using a stack size proportional to the depth /// of the `Hir`. Namely, this method will instead use constant stack space, /// but will use heap space proportional to the size of the `Hir`. This may be /// desirable in cases where the size of `Hir` is proportional to end user /// input. /// /// If the visitor returns an error at any point, then visiting is stopped and /// the error is returned. pub fn visit(hir: &Hir, visitor: V) -> Result { HeapVisitor::new().visit(hir, visitor) } /// HeapVisitor visits every item in an `Hir` recursively using constant stack /// size and a heap size proportional to the size of the `Hir`. struct HeapVisitor<'a> { /// A stack of `Hir` nodes. This is roughly analogous to the call stack /// used in a typical recursive visitor. stack: Vec<(&'a Hir, Frame<'a>)>, } /// Represents a single stack frame while performing structural induction over /// an `Hir`. enum Frame<'a> { /// A stack frame allocated just before descending into a repetition /// operator's child node. Repetition(&'a hir::Repetition), /// A stack frame allocated just before descending into a capture's child /// node. Capture(&'a hir::Capture), /// The stack frame used while visiting every child node of a concatenation /// of expressions. Concat { /// The child node we are currently visiting. head: &'a Hir, /// The remaining child nodes to visit (which may be empty). tail: &'a [Hir], }, /// The stack frame used while visiting every child node of an alternation /// of expressions. Alternation { /// The child node we are currently visiting. 
head: &'a Hir, /// The remaining child nodes to visit (which may be empty). tail: &'a [Hir], }, } impl<'a> HeapVisitor<'a> { fn new() -> HeapVisitor<'a> { HeapVisitor { stack: vec![] } } fn visit( &mut self, mut hir: &'a Hir, mut visitor: V, ) -> Result { self.stack.clear(); visitor.start(); loop { visitor.visit_pre(hir)?; if let Some(x) = self.induct(hir) { let child = x.child(); self.stack.push((hir, x)); hir = child; continue; } // No induction means we have a base case, so we can post visit // it now. visitor.visit_post(hir)?; // At this point, we now try to pop our call stack until it is // either empty or we hit another inductive case. loop { let (post_hir, frame) = match self.stack.pop() { None => return visitor.finish(), Some((post_hir, frame)) => (post_hir, frame), }; // If this is a concat/alternate, then we might have additional // inductive steps to process. if let Some(x) = self.pop(frame) { match x { Frame::Alternation { .. } => { visitor.visit_alternation_in()?; } Frame::Concat { .. } => { visitor.visit_concat_in()?; } _ => {} } hir = x.child(); self.stack.push((post_hir, x)); break; } // Otherwise, we've finished visiting all the child nodes for // this HIR, so we can post visit it now. visitor.visit_post(post_hir)?; } } } /// Build a stack frame for the given HIR if one is needed (which occurs if /// and only if there are child nodes in the HIR). Otherwise, return None. fn induct(&mut self, hir: &'a Hir) -> Option> { match *hir.kind() { HirKind::Repetition(ref x) => Some(Frame::Repetition(x)), HirKind::Capture(ref x) => Some(Frame::Capture(x)), HirKind::Concat(ref x) if x.is_empty() => None, HirKind::Concat(ref x) => { Some(Frame::Concat { head: &x[0], tail: &x[1..] }) } HirKind::Alternation(ref x) if x.is_empty() => None, HirKind::Alternation(ref x) => { Some(Frame::Alternation { head: &x[0], tail: &x[1..] }) } _ => None, } } /// Pops the given frame. If the frame has an additional inductive step, /// then return it, otherwise return `None`. 
fn pop(&self, induct: Frame<'a>) -> Option> { match induct { Frame::Repetition(_) => None, Frame::Capture(_) => None, Frame::Concat { tail, .. } => { if tail.is_empty() { None } else { Some(Frame::Concat { head: &tail[0], tail: &tail[1..] }) } } Frame::Alternation { tail, .. } => { if tail.is_empty() { None } else { Some(Frame::Alternation { head: &tail[0], tail: &tail[1..], }) } } } } } impl<'a> Frame<'a> { /// Perform the next inductive step on this frame and return the next /// child HIR node to visit. fn child(&self) -> &'a Hir { match *self { Frame::Repetition(rep) => &rep.sub, Frame::Capture(capture) => &capture.sub, Frame::Concat { head, .. } => head, Frame::Alternation { head, .. } => head, } } } regex-syntax-0.8.2/src/lib.rs000064400000000000000000000407521046102023000142040ustar 00000000000000/*! This crate provides a robust regular expression parser. This crate defines two primary types: * [`Ast`](ast::Ast) is the abstract syntax of a regular expression. An abstract syntax corresponds to a *structured representation* of the concrete syntax of a regular expression, where the concrete syntax is the pattern string itself (e.g., `foo(bar)+`). Given some abstract syntax, it can be converted back to the original concrete syntax (modulo some details, like whitespace). To a first approximation, the abstract syntax is complex and difficult to analyze. * [`Hir`](hir::Hir) is the high-level intermediate representation ("HIR" or "high-level IR" for short) of regular expression. It corresponds to an intermediate state of a regular expression that sits between the abstract syntax and the low level compiled opcodes that are eventually responsible for executing a regular expression search. Given some high-level IR, it is not possible to produce the original concrete syntax (although it is possible to produce an equivalent concrete syntax, but it will likely scarcely resemble the original pattern). 
To a first approximation, the high-level IR is simple and easy to analyze. These two types come with conversion routines: * An [`ast::parse::Parser`] converts concrete syntax (a `&str`) to an [`Ast`](ast::Ast). * A [`hir::translate::Translator`] converts an [`Ast`](ast::Ast) to a [`Hir`](hir::Hir). As a convenience, the above two conversion routines are combined into one via the top-level [`Parser`] type. This `Parser` will first convert your pattern to an `Ast` and then convert the `Ast` to an `Hir`. It's also exposed as top-level [`parse`] free function. # Example This example shows how to parse a pattern string into its HIR: ``` use regex_syntax::{hir::Hir, parse}; let hir = parse("a|b")?; assert_eq!(hir, Hir::alternation(vec![ Hir::literal("a".as_bytes()), Hir::literal("b".as_bytes()), ])); # Ok::<(), Box>(()) ``` # Concrete syntax supported The concrete syntax is documented as part of the public API of the [`regex` crate](https://docs.rs/regex/%2A/regex/#syntax). # Input safety A key feature of this library is that it is safe to use with end user facing input. This plays a significant role in the internal implementation. In particular: 1. Parsers provide a `nest_limit` option that permits callers to control how deeply nested a regular expression is allowed to be. This makes it possible to do case analysis over an `Ast` or an `Hir` using recursion without worrying about stack overflow. 2. Since relying on a particular stack size is brittle, this crate goes to great lengths to ensure that all interactions with both the `Ast` and the `Hir` do not use recursion. Namely, they use constant stack space and heap space proportional to the size of the original pattern string (in bytes). This includes the type's corresponding destructors. (One exception to this is literal extraction, but this will eventually get fixed.) 
# Error reporting The `Display` implementations on all `Error` types exposed in this library provide nice human readable errors that are suitable for showing to end users in a monospace font. # Literal extraction This crate provides limited support for [literal extraction from `Hir` values](hir::literal). Be warned that literal extraction uses recursion, and therefore, stack size proportional to the size of the `Hir`. The purpose of literal extraction is to speed up searches. That is, if you know a regular expression must match a prefix or suffix literal, then it is often quicker to search for instances of that literal, and then confirm or deny the match using the full regular expression engine. These optimizations are done automatically in the `regex` crate. # Crate features An important feature provided by this crate is its Unicode support. This includes things like case folding, boolean properties, general categories, scripts and Unicode-aware support for the Perl classes `\w`, `\s` and `\d`. However, a downside of this support is that it requires bundling several Unicode data tables that are substantial in size. A fair number of use cases do not require full Unicode support. For this reason, this crate exposes a number of features to control which Unicode data is available. If a regular expression attempts to use a Unicode feature that is not available because the corresponding crate feature was disabled, then translating that regular expression to an `Hir` will return an error. (It is still possible construct an `Ast` for such a regular expression, since Unicode data is not used until translation to an `Hir`.) Stated differently, enabling or disabling any of the features below can only add or subtract from the total set of valid regular expressions. Enabling or disabling a feature will never modify the match semantics of a regular expression. The following features are available: * **std** - Enables support for the standard library. 
This feature is enabled by default. When disabled, only `core` and `alloc` are used. Otherwise, enabling `std` generally just enables `std::error::Error` trait impls for the various error types. * **unicode** - Enables all Unicode features. This feature is enabled by default, and will always cover all Unicode features, even if more are added in the future. * **unicode-age** - Provide the data for the [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). This makes it possible to use classes like `\p{Age:6.0}` to refer to all codepoints first introduced in Unicode 6.0 * **unicode-bool** - Provide the data for numerous Unicode boolean properties. The full list is not included here, but contains properties like `Alphabetic`, `Emoji`, `Lowercase`, `Math`, `Uppercase` and `White_Space`. * **unicode-case** - Provide the data for case insensitive matching using [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). * **unicode-gencat** - Provide the data for [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). This includes, but is not limited to, `Decimal_Number`, `Letter`, `Math_Symbol`, `Number` and `Punctuation`. * **unicode-perl** - Provide the data for supporting the Unicode-aware Perl character classes, corresponding to `\w`, `\s` and `\d`. This is also necessary for using Unicode-aware word boundary assertions. Note that if this feature is disabled, the `\s` and `\d` character classes are still available if the `unicode-bool` and `unicode-gencat` features are enabled, respectively. * **unicode-script** - Provide the data for [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, `Latin` and `Thai`. 
* **unicode-segment** - Provide the data necessary to provide the properties used to implement the [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and `\p{sb=ATerm}`. * **arbitrary** - Enabling this feature introduces a public dependency on the [`arbitrary`](https://crates.io/crates/arbitrary) crate. Namely, it implements the `Arbitrary` trait from that crate for the [`Ast`](crate::ast::Ast) type. This feature is disabled by default. */ #![no_std] #![forbid(unsafe_code)] #![deny(missing_docs, rustdoc::broken_intra_doc_links)] #![warn(missing_debug_implementations)] #![cfg_attr(docsrs, feature(doc_auto_cfg))] #[cfg(any(test, feature = "std"))] extern crate std; extern crate alloc; pub use crate::{ error::Error, parser::{parse, Parser, ParserBuilder}, unicode::UnicodeWordError, }; use alloc::string::String; pub mod ast; mod debug; mod either; mod error; pub mod hir; mod parser; mod rank; mod unicode; mod unicode_tables; pub mod utf8; /// Escapes all regular expression meta characters in `text`. /// /// The string returned may be safely used as a literal in a regular /// expression. pub fn escape(text: &str) -> String { let mut quoted = String::new(); escape_into(text, &mut quoted); quoted } /// Escapes all meta characters in `text` and writes the result into `buf`. /// /// This will append escape characters into the given buffer. The characters /// that are appended are safe to use as a literal in a regular expression. pub fn escape_into(text: &str, buf: &mut String) { buf.reserve(text.len()); for c in text.chars() { if is_meta_character(c) { buf.push('\\'); } buf.push(c); } } /// Returns true if the given character has significance in a regex. /// /// Generally speaking, these are the only characters which _must_ be escaped /// in order to match their literal meaning. For example, to match a literal /// `|`, one could write `\|`. 
Sometimes escaping isn't always necessary. For /// example, `-` is treated as a meta character because of its significance /// for writing ranges inside of character classes, but the regex `-` will /// match a literal `-` because `-` has no special meaning outside of character /// classes. /// /// In order to determine whether a character may be escaped at all, the /// [`is_escapeable_character`] routine should be used. The difference between /// `is_meta_character` and `is_escapeable_character` is that the latter will /// return true for some characters that are _not_ meta characters. For /// example, `%` and `\%` both match a literal `%` in all contexts. In other /// words, `is_escapeable_character` includes "superfluous" escapes. /// /// Note that the set of characters for which this function returns `true` or /// `false` is fixed and won't change in a semver compatible release. (In this /// case, "semver compatible release" actually refers to the `regex` crate /// itself, since reducing or expanding the set of meta characters would be a /// breaking change for not just `regex-syntax` but also `regex` itself.) /// /// # Example /// /// ``` /// use regex_syntax::is_meta_character; /// /// assert!(is_meta_character('?')); /// assert!(is_meta_character('-')); /// assert!(is_meta_character('&')); /// assert!(is_meta_character('#')); /// /// assert!(!is_meta_character('%')); /// assert!(!is_meta_character('/')); /// assert!(!is_meta_character('!')); /// assert!(!is_meta_character('"')); /// assert!(!is_meta_character('e')); /// ``` pub fn is_meta_character(c: char) -> bool { match c { '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, _ => false, } } /// Returns true if the given character can be escaped in a regex. /// /// This returns true in all cases that `is_meta_character` returns true, but /// also returns true in some cases where `is_meta_character` returns false. 
/// For example, `%` is not a meta character, but it is escapeable. That is, /// `%` and `\%` both match a literal `%` in all contexts. /// /// The purpose of this routine is to provide knowledge about what characters /// may be escaped. Namely, most regex engines permit "superfluous" escapes /// where characters without any special significance may be escaped even /// though there is no actual _need_ to do so. /// /// This will return false for some characters. For example, `e` is not /// escapeable. Therefore, `\e` will either result in a parse error (which is /// true today), or it could backwards compatibly evolve into a new construct /// with its own meaning. Indeed, that is the purpose of banning _some_ /// superfluous escapes: it provides a way to evolve the syntax in a compatible /// manner. /// /// # Example /// /// ``` /// use regex_syntax::is_escapeable_character; /// /// assert!(is_escapeable_character('?')); /// assert!(is_escapeable_character('-')); /// assert!(is_escapeable_character('&')); /// assert!(is_escapeable_character('#')); /// assert!(is_escapeable_character('%')); /// assert!(is_escapeable_character('/')); /// assert!(is_escapeable_character('!')); /// assert!(is_escapeable_character('"')); /// /// assert!(!is_escapeable_character('e')); /// ``` pub fn is_escapeable_character(c: char) -> bool { // Certainly escapeable if it's a meta character. if is_meta_character(c) { return true; } // Any character that isn't ASCII is definitely not escapeable. There's // no real need to allow things like \☃ right? if !c.is_ascii() { return false; } // Otherwise, we basically say that everything is escapeable unless it's a // letter or digit. Things like \3 are either octal (when enabled) or an // error, and we should keep it that way. Otherwise, letters are reserved // for adding new syntax in a backwards compatible way. 
match c { '0'..='9' | 'A'..='Z' | 'a'..='z' => false, // While not currently supported, we keep these as not escapeable to // give us some flexibility with respect to supporting the \< and // \> word boundary assertions in the future. By rejecting them as // escapeable, \< and \> will result in a parse error. Thus, we can // turn them into something else in the future without it being a // backwards incompatible change. // // OK, now we support \< and \>, and we need to retain them as *not* // escapeable here since the escape sequence is significant. '<' | '>' => false, _ => true, } } /// Returns true if and only if the given character is a Unicode word /// character. /// /// A Unicode word character is defined by /// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). /// In particular, a character /// is considered a word character if it is in either of the `Alphabetic` or /// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` /// or `Connector_Punctuation` general categories. /// /// # Panics /// /// If the `unicode-perl` feature is not enabled, then this function /// panics. For this reason, it is recommended that callers use /// [`try_is_word_character`] instead. pub fn is_word_character(c: char) -> bool { try_is_word_character(c).expect("unicode-perl feature must be enabled") } /// Returns true if and only if the given character is a Unicode word /// character. /// /// A Unicode word character is defined by /// [UTS#18 Annex C](https://unicode.org/reports/tr18/#Compatibility_Properties). /// In particular, a character /// is considered a word character if it is in either of the `Alphabetic` or /// `Join_Control` properties, or is in one of the `Decimal_Number`, `Mark` /// or `Connector_Punctuation` general categories. /// /// # Errors /// /// If the `unicode-perl` feature is not enabled, then this function always /// returns an error. 
pub fn try_is_word_character( c: char, ) -> core::result::Result { unicode::is_word_character(c) } /// Returns true if and only if the given character is an ASCII word character. /// /// An ASCII word character is defined by the following character class: /// `[_0-9a-zA-Z]`. pub fn is_word_byte(c: u8) -> bool { match c { b'_' | b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' => true, _ => false, } } #[cfg(test)] mod tests { use alloc::string::ToString; use super::*; #[test] fn escape_meta() { assert_eq!( escape(r"\.+*?()|[]{}^$#&-~"), r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~".to_string() ); } #[test] fn word_byte() { assert!(is_word_byte(b'a')); assert!(!is_word_byte(b'-')); } #[test] #[cfg(feature = "unicode-perl")] fn word_char() { assert!(is_word_character('a'), "ASCII"); assert!(is_word_character('à'), "Latin-1"); assert!(is_word_character('β'), "Greek"); assert!(is_word_character('\u{11011}'), "Brahmi (Unicode 6.0)"); assert!(is_word_character('\u{11611}'), "Modi (Unicode 7.0)"); assert!(is_word_character('\u{11711}'), "Ahom (Unicode 8.0)"); assert!(is_word_character('\u{17828}'), "Tangut (Unicode 9.0)"); assert!(is_word_character('\u{1B1B1}'), "Nushu (Unicode 10.0)"); assert!(is_word_character('\u{16E40}'), "Medefaidrin (Unicode 11.0)"); assert!(!is_word_character('-')); assert!(!is_word_character('☃')); } #[test] #[should_panic] #[cfg(not(feature = "unicode-perl"))] fn word_char_disabled_panic() { assert!(is_word_character('a')); } #[test] #[cfg(not(feature = "unicode-perl"))] fn word_char_disabled_error() { assert!(try_is_word_character('a').is_err()); } } regex-syntax-0.8.2/src/parser.rs000064400000000000000000000251601046102023000147260ustar 00000000000000use crate::{ast, hir, Error}; /// A convenience routine for parsing a regex using default options. /// /// This is equivalent to `Parser::new().parse(pattern)`. /// /// If you need to set non-default options, then use a [`ParserBuilder`]. /// /// This routine returns an [`Hir`](hir::Hir) value. 
Namely, it automatically /// parses the pattern as an [`Ast`](ast::Ast) and then invokes the translator /// to convert the `Ast` into an `Hir`. If you need access to the `Ast`, then /// you should use a [`ast::parse::Parser`]. pub fn parse(pattern: &str) -> Result { Parser::new().parse(pattern) } /// A builder for a regular expression parser. /// /// This builder permits modifying configuration options for the parser. /// /// This type combines the builder options for both the [AST /// `ParserBuilder`](ast::parse::ParserBuilder) and the [HIR /// `TranslatorBuilder`](hir::translate::TranslatorBuilder). #[derive(Clone, Debug, Default)] pub struct ParserBuilder { ast: ast::parse::ParserBuilder, hir: hir::translate::TranslatorBuilder, } impl ParserBuilder { /// Create a new parser builder with a default configuration. pub fn new() -> ParserBuilder { ParserBuilder::default() } /// Build a parser from this configuration with the given pattern. pub fn build(&self) -> Parser { Parser { ast: self.ast.build(), hir: self.hir.build() } } /// Set the nesting limit for this parser. /// /// The nesting limit controls how deep the abstract syntax tree is allowed /// to be. If the AST exceeds the given limit (e.g., with too many nested /// groups), then an error is returned by the parser. /// /// The purpose of this limit is to act as a heuristic to prevent stack /// overflow for consumers that do structural induction on an `Ast` using /// explicit recursion. While this crate never does this (instead using /// constant stack space and moving the call stack to the heap), other /// crates may. /// /// This limit is not checked until the entire Ast is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since this parser /// implementation will limit itself to heap space proportional to the /// length of the pattern string. 
/// /// Note that a nest limit of `0` will return a nest limit error for most /// patterns but not all. For example, a nest limit of `0` permits `a` but /// not `ab`, since `ab` requires a concatenation, which results in a nest /// depth of `1`. In general, a nest limit is not something that manifests /// in an obvious way in the concrete syntax, therefore, it should not be /// used in a granular way. pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder { self.ast.nest_limit(limit); self } /// Whether to support octal syntax or not. /// /// Octal syntax is a little-known way of uttering Unicode codepoints in /// a regular expression. For example, `a`, `\x61`, `\u0061` and /// `\141` are all equivalent regular expressions, where the last example /// shows octal syntax. /// /// While supporting octal syntax isn't in and of itself a problem, it does /// make good error messages harder. That is, in PCRE based regex engines, /// syntax like `\0` invokes a backreference, which is explicitly /// unsupported in Rust's regex engine. However, many users expect it to /// be supported. Therefore, when octal support is disabled, the error /// message will explicitly mention that backreferences aren't supported. /// /// Octal syntax is disabled by default. pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder { self.ast.octal(yes); self } /// When disabled, translation will permit the construction of a regular /// expression that may match invalid UTF-8. /// /// When enabled (the default), the translator is guaranteed to produce an /// expression that, for non-empty matches, will only ever produce spans /// that are entirely valid UTF-8 (otherwise, the translator will return an /// error). /// /// Perhaps surprisingly, when UTF-8 is enabled, an empty regex or even /// a negated ASCII word boundary (uttered as `(?-u:\B)` in the concrete /// syntax) will be allowed even though they can produce matches that split /// a UTF-8 encoded codepoint. 
This only applies to zero-width or "empty" /// matches, and it is expected that the regex engine itself must handle /// these cases if necessary (perhaps by suppressing any zero-width matches /// that split a codepoint). pub fn utf8(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.utf8(yes); self } /// Enable verbose mode in the regular expression. /// /// When enabled, verbose mode permits insignificant whitespace in many /// places in the regular expression, as well as comments. Comments are /// started using `#` and continue until the end of the line. /// /// By default, this is disabled. It may be selectively enabled in the /// regular expression by using the `x` flag regardless of this setting. pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder { self.ast.ignore_whitespace(yes); self } /// Enable or disable the case insensitive flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `i` flag. pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.case_insensitive(yes); self } /// Enable or disable the multi-line matching flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `m` flag. pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.multi_line(yes); self } /// Enable or disable the "dot matches any character" flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `s` flag. pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.dot_matches_new_line(yes); self } /// Enable or disable the CRLF mode flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `R` flag. 
/// /// When CRLF mode is enabled, the following happens: /// /// * Unless `dot_matches_new_line` is enabled, `.` will match any character /// except for `\r` and `\n`. /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, /// `\r` and `\n` as line terminators. And in particular, neither will /// match between a `\r` and a `\n`. pub fn crlf(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.crlf(yes); self } /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. /// /// Namely, instead of `.` (by default) matching everything except for `\n`, /// this will cause `.` to match everything except for the byte given. /// /// If `.` is used in a context where Unicode mode is enabled and this byte /// isn't ASCII, then an error will be returned. When Unicode mode is /// disabled, then any byte is permitted, but will return an error if UTF-8 /// mode is enabled and it is a non-ASCII byte. /// /// In short, any ASCII value for a line terminator is always okay. But a /// non-ASCII byte might result in an error depending on whether Unicode /// mode or UTF-8 mode are enabled. /// /// Note that if `R` mode is enabled then it always takes precedence and /// the line terminator will be treated as `\r` and `\n` simultaneously. /// /// Note also that this *doesn't* impact the look-around assertions /// `(?m:^)` and `(?m:$)`. That's usually controlled by additional /// configuration in the regex engine itself. pub fn line_terminator(&mut self, byte: u8) -> &mut ParserBuilder { self.hir.line_terminator(byte); self } /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `U` flag. pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.swap_greed(yes); self } /// Enable or disable the Unicode flag (`u`) by default. /// /// By default this is **enabled**. 
It may alternatively be selectively /// disabled in the regular expression itself via the `u` flag. /// /// Note that unless `utf8` is disabled (it's enabled by default), a /// regular expression will fail to parse if Unicode mode is disabled and a /// sub-expression could possibly match invalid UTF-8. pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder { self.hir.unicode(yes); self } } /// A convenience parser for regular expressions. /// /// This parser takes as input a regular expression pattern string (the /// "concrete syntax") and returns a high-level intermediate representation /// (the HIR) suitable for most types of analysis. In particular, this parser /// hides the intermediate state of producing an AST (the "abstract syntax"). /// The AST is itself far more complex than the HIR, so this parser serves as a /// convenience for never having to deal with it at all. /// /// If callers have more fine grained use cases that need an AST, then please /// see the [`ast::parse`] module. /// /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. #[derive(Clone, Debug)] pub struct Parser { ast: ast::parse::Parser, hir: hir::translate::Translator, } impl Parser { /// Create a new parser with a default configuration. /// /// The parser can be run with `parse` method. The parse method returns /// a high level intermediate representation of the given regular /// expression. /// /// To set configuration options on the parser, use [`ParserBuilder`]. pub fn new() -> Parser { ParserBuilder::new().build() } /// Parse the regular expression into a high level intermediate /// representation. 
pub fn parse(&mut self, pattern: &str) -> Result { let ast = self.ast.parse(pattern)?; let hir = self.hir.translate(pattern, &ast)?; Ok(hir) } } regex-syntax-0.8.2/src/rank.rs000064400000000000000000000105261046102023000143650ustar 00000000000000pub(crate) const BYTE_FREQUENCIES: [u8; 256] = [ 55, // '\x00' 52, // '\x01' 51, // '\x02' 50, // '\x03' 49, // '\x04' 48, // '\x05' 47, // '\x06' 46, // '\x07' 45, // '\x08' 103, // '\t' 242, // '\n' 66, // '\x0b' 67, // '\x0c' 229, // '\r' 44, // '\x0e' 43, // '\x0f' 42, // '\x10' 41, // '\x11' 40, // '\x12' 39, // '\x13' 38, // '\x14' 37, // '\x15' 36, // '\x16' 35, // '\x17' 34, // '\x18' 33, // '\x19' 56, // '\x1a' 32, // '\x1b' 31, // '\x1c' 30, // '\x1d' 29, // '\x1e' 28, // '\x1f' 255, // ' ' 148, // '!' 164, // '"' 149, // '#' 136, // '$' 160, // '%' 155, // '&' 173, // "'" 221, // '(' 222, // ')' 134, // '*' 122, // '+' 232, // ',' 202, // '-' 215, // '.' 224, // '/' 208, // '0' 220, // '1' 204, // '2' 187, // '3' 183, // '4' 179, // '5' 177, // '6' 168, // '7' 178, // '8' 200, // '9' 226, // ':' 195, // ';' 154, // '<' 184, // '=' 174, // '>' 126, // '?' 
120, // '@' 191, // 'A' 157, // 'B' 194, // 'C' 170, // 'D' 189, // 'E' 162, // 'F' 161, // 'G' 150, // 'H' 193, // 'I' 142, // 'J' 137, // 'K' 171, // 'L' 176, // 'M' 185, // 'N' 167, // 'O' 186, // 'P' 112, // 'Q' 175, // 'R' 192, // 'S' 188, // 'T' 156, // 'U' 140, // 'V' 143, // 'W' 123, // 'X' 133, // 'Y' 128, // 'Z' 147, // '[' 138, // '\\' 146, // ']' 114, // '^' 223, // '_' 151, // '`' 249, // 'a' 216, // 'b' 238, // 'c' 236, // 'd' 253, // 'e' 227, // 'f' 218, // 'g' 230, // 'h' 247, // 'i' 135, // 'j' 180, // 'k' 241, // 'l' 233, // 'm' 246, // 'n' 244, // 'o' 231, // 'p' 139, // 'q' 245, // 'r' 243, // 's' 251, // 't' 235, // 'u' 201, // 'v' 196, // 'w' 240, // 'x' 214, // 'y' 152, // 'z' 182, // '{' 205, // '|' 181, // '}' 127, // '~' 27, // '\x7f' 212, // '\x80' 211, // '\x81' 210, // '\x82' 213, // '\x83' 228, // '\x84' 197, // '\x85' 169, // '\x86' 159, // '\x87' 131, // '\x88' 172, // '\x89' 105, // '\x8a' 80, // '\x8b' 98, // '\x8c' 96, // '\x8d' 97, // '\x8e' 81, // '\x8f' 207, // '\x90' 145, // '\x91' 116, // '\x92' 115, // '\x93' 144, // '\x94' 130, // '\x95' 153, // '\x96' 121, // '\x97' 107, // '\x98' 132, // '\x99' 109, // '\x9a' 110, // '\x9b' 124, // '\x9c' 111, // '\x9d' 82, // '\x9e' 108, // '\x9f' 118, // '\xa0' 141, // '¡' 113, // '¢' 129, // '£' 119, // '¤' 125, // '¥' 165, // '¦' 117, // '§' 92, // '¨' 106, // '©' 83, // 'ª' 72, // '«' 99, // '¬' 93, // '\xad' 65, // '®' 79, // '¯' 166, // '°' 237, // '±' 163, // '²' 199, // '³' 190, // '´' 225, // 'µ' 209, // '¶' 203, // '·' 198, // '¸' 217, // '¹' 219, // 'º' 206, // '»' 234, // '¼' 248, // '½' 158, // '¾' 239, // '¿' 255, // 'À' 255, // 'Á' 255, // 'Â' 255, // 'Ã' 255, // 'Ä' 255, // 'Å' 255, // 'Æ' 255, // 'Ç' 255, // 'È' 255, // 'É' 255, // 'Ê' 255, // 'Ë' 255, // 'Ì' 255, // 'Í' 255, // 'Î' 255, // 'Ï' 255, // 'Ð' 255, // 'Ñ' 255, // 'Ò' 255, // 'Ó' 255, // 'Ô' 255, // 'Õ' 255, // 'Ö' 255, // '×' 255, // 'Ø' 255, // 'Ù' 255, // 'Ú' 255, // 'Û' 255, // 'Ü' 255, // 'Ý' 255, // 'Þ' 
255, // 'ß' 255, // 'à' 255, // 'á' 255, // 'â' 255, // 'ã' 255, // 'ä' 255, // 'å' 255, // 'æ' 255, // 'ç' 255, // 'è' 255, // 'é' 255, // 'ê' 255, // 'ë' 255, // 'ì' 255, // 'í' 255, // 'î' 255, // 'ï' 255, // 'ð' 255, // 'ñ' 255, // 'ò' 255, // 'ó' 255, // 'ô' 255, // 'õ' 255, // 'ö' 255, // '÷' 255, // 'ø' 255, // 'ù' 255, // 'ú' 255, // 'û' 255, // 'ü' 255, // 'ý' 255, // 'þ' 255, // 'ÿ' ]; regex-syntax-0.8.2/src/unicode.rs000064400000000000000000001075021046102023000150610ustar 00000000000000use alloc::{ string::{String, ToString}, vec::Vec, }; use crate::hir; /// An inclusive range of codepoints from a generated file (hence the static /// lifetime). type Range = &'static [(char, char)]; /// An error that occurs when dealing with Unicode. /// /// We don't impl the Error trait here because these always get converted /// into other public errors. (This error type isn't exported.) #[derive(Debug)] pub enum Error { PropertyNotFound, PropertyValueNotFound, // Not used when unicode-perl is enabled. #[allow(dead_code)] PerlClassNotFound, } /// An error that occurs when Unicode-aware simple case folding fails. /// /// This error can occur when the case mapping tables necessary for Unicode /// aware case folding are unavailable. This only occurs when the /// `unicode-case` feature is disabled. (The feature is enabled by default.) #[derive(Debug)] pub struct CaseFoldError(()); #[cfg(feature = "std")] impl std::error::Error for CaseFoldError {} impl core::fmt::Display for CaseFoldError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Unicode-aware case folding is not available \ (probably because the unicode-case feature is not enabled)" ) } } /// An error that occurs when the Unicode-aware `\w` class is unavailable. /// /// This error can occur when the data tables necessary for the Unicode aware /// Perl character class `\w` are unavailable. This only occurs when the /// `unicode-perl` feature is disabled. 
(The feature is enabled by default.) #[derive(Debug)] pub struct UnicodeWordError(()); #[cfg(feature = "std")] impl std::error::Error for UnicodeWordError {} impl core::fmt::Display for UnicodeWordError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Unicode-aware \\w class is not available \ (probably because the unicode-perl feature is not enabled)" ) } } /// A state oriented traverser of the simple case folding table. /// /// A case folder can be constructed via `SimpleCaseFolder::new()`, which will /// return an error if the underlying case folding table is unavailable. /// /// After construction, it is expected that callers will use /// `SimpleCaseFolder::mapping` by calling it with codepoints in strictly /// increasing order. For example, calling it on `b` and then on `a` is illegal /// and will result in a panic. /// /// The main idea of this type is that it tries hard to make mapping lookups /// fast by exploiting the structure of the underlying table, and the ordering /// assumption enables this. #[derive(Debug)] pub struct SimpleCaseFolder { /// The simple case fold table. It's a sorted association list, where the /// keys are Unicode scalar values and the values are the corresponding /// equivalence class (not including the key) of the "simple" case folded /// Unicode scalar values. table: &'static [(char, &'static [char])], /// The last codepoint that was used for a lookup. last: Option, /// The index to the entry in `table` corresponding to the smallest key `k` /// such that `k > k0`, where `k0` is the most recent key lookup. Note that /// in particular, `k0` may not be in the table! next: usize, } impl SimpleCaseFolder { /// Create a new simple case folder, returning an error if the underlying /// case folding table is unavailable. 
pub fn new() -> Result { #[cfg(not(feature = "unicode-case"))] { Err(CaseFoldError(())) } #[cfg(feature = "unicode-case")] { Ok(SimpleCaseFolder { table: crate::unicode_tables::case_folding_simple::CASE_FOLDING_SIMPLE, last: None, next: 0, }) } } /// Return the equivalence class of case folded codepoints for the given /// codepoint. The equivalence class returned never includes the codepoint /// given. If the given codepoint has no case folded codepoints (i.e., /// no entry in the underlying case folding table), then this returns an /// empty slice. /// /// # Panics /// /// This panics when called with a `c` that is less than or equal to the /// previous call. In other words, callers need to use this method with /// strictly increasing values of `c`. pub fn mapping(&mut self, c: char) -> &'static [char] { if let Some(last) = self.last { assert!( last < c, "got codepoint U+{:X} which occurs before \ last codepoint U+{:X}", u32::from(c), u32::from(last), ); } self.last = Some(c); if self.next >= self.table.len() { return &[]; } let (k, v) = self.table[self.next]; if k == c { self.next += 1; return v; } match self.get(c) { Err(i) => { self.next = i; &[] } Ok(i) => { // Since we require lookups to proceed // in order, anything we find should be // after whatever we thought might be // next. Otherwise, the caller is either // going out of order or we would have // found our next key at 'self.next'. assert!(i > self.next); self.next = i + 1; self.table[i].1 } } } /// Returns true if and only if the given range overlaps with any region /// of the underlying case folding table. That is, when true, there exists /// at least one codepoint in the inclusive range `[start, end]` that has /// a non-trivial equivalence class of case folded codepoints. Conversely, /// when this returns false, all codepoints in the range `[start, end]` /// correspond to the trivial equivalence class of case folded codepoints, /// i.e., itself. 
/// /// This is useful to call before iterating over the codepoints in the /// range and looking up the mapping for each. If you know none of the /// mappings will return anything, then you might be able to skip doing it /// altogether. /// /// # Panics /// /// This panics when `end < start`. pub fn overlaps(&self, start: char, end: char) -> bool { use core::cmp::Ordering; assert!(start <= end); self.table .binary_search_by(|&(c, _)| { if start <= c && c <= end { Ordering::Equal } else if c > end { Ordering::Greater } else { Ordering::Less } }) .is_ok() } /// Returns the index at which `c` occurs in the simple case fold table. If /// `c` does not occur, then this returns an `i` such that `table[i-1].0 < /// c` and `table[i].0 > c`. fn get(&self, c: char) -> Result { self.table.binary_search_by_key(&c, |&(c1, _)| c1) } } /// A query for finding a character class defined by Unicode. This supports /// either use of a property name directly, or lookup by property value. The /// former generally refers to Binary properties (see UTS#44, Table 8), but /// as a special exception (see UTS#18, Section 1.2) both general categories /// (an enumeration) and scripts (a catalog) are supported as if each of their /// possible values were a binary property. /// /// In all circumstances, property names and values are normalized and /// canonicalized. That is, `GC == gc == GeneralCategory == general_category`. /// /// The lifetime `'a` refers to the shorter of the lifetimes of property name /// and property value. #[derive(Debug)] pub enum ClassQuery<'a> { /// Return a class corresponding to a Unicode binary property, named by /// a single letter. OneLetter(char), /// Return a class corresponding to a Unicode binary property. /// /// Note that, by special exception (see UTS#18, Section 1.2), both /// general category values and script values are permitted here as if /// they were a binary property. 
Binary(&'a str), /// Return a class corresponding to all codepoints whose property /// (identified by `property_name`) corresponds to the given value /// (identified by `property_value`). ByValue { /// A property name. property_name: &'a str, /// A property value. property_value: &'a str, }, } impl<'a> ClassQuery<'a> { fn canonicalize(&self) -> Result { match *self { ClassQuery::OneLetter(c) => self.canonical_binary(&c.to_string()), ClassQuery::Binary(name) => self.canonical_binary(name), ClassQuery::ByValue { property_name, property_value } => { let property_name = symbolic_name_normalize(property_name); let property_value = symbolic_name_normalize(property_value); let canon_name = match canonical_prop(&property_name)? { None => return Err(Error::PropertyNotFound), Some(canon_name) => canon_name, }; Ok(match canon_name { "General_Category" => { let canon = match canonical_gencat(&property_value)? { None => return Err(Error::PropertyValueNotFound), Some(canon) => canon, }; CanonicalClassQuery::GeneralCategory(canon) } "Script" => { let canon = match canonical_script(&property_value)? { None => return Err(Error::PropertyValueNotFound), Some(canon) => canon, }; CanonicalClassQuery::Script(canon) } _ => { let vals = match property_values(canon_name)? { None => return Err(Error::PropertyValueNotFound), Some(vals) => vals, }; let canon_val = match canonical_value(vals, &property_value) { None => { return Err(Error::PropertyValueNotFound) } Some(canon_val) => canon_val, }; CanonicalClassQuery::ByValue { property_name: canon_name, property_value: canon_val, } } }) } } } fn canonical_binary( &self, name: &str, ) -> Result { let norm = symbolic_name_normalize(name); // This is a special case where 'cf' refers to the 'Format' general // category, but where the 'cf' abbreviation is also an abbreviation // for the 'Case_Folding' property. But we want to treat it as // a general category. (Currently, we don't even support the // 'Case_Folding' property. 
But if we do in the future, users will be // required to spell it out.) // // Also 'sc' refers to the 'Currency_Symbol' general category, but is // also the abbreviation for the 'Script' property. So we avoid calling // 'canonical_prop' for it too, which would erroneously normalize it // to 'Script'. // // Another case: 'lc' is an abbreviation for the 'Cased_Letter' // general category, but is also an abbreviation for the 'Lowercase_Mapping' // property. We don't currently support the latter, so as with 'cf' // above, we treat 'lc' as 'Cased_Letter'. if norm != "cf" && norm != "sc" && norm != "lc" { if let Some(canon) = canonical_prop(&norm)? { return Ok(CanonicalClassQuery::Binary(canon)); } } if let Some(canon) = canonical_gencat(&norm)? { return Ok(CanonicalClassQuery::GeneralCategory(canon)); } if let Some(canon) = canonical_script(&norm)? { return Ok(CanonicalClassQuery::Script(canon)); } Err(Error::PropertyNotFound) } } /// Like ClassQuery, but its parameters have been canonicalized. This also /// differentiates binary properties from flattened general categories and /// scripts. #[derive(Debug, Eq, PartialEq)] enum CanonicalClassQuery { /// The canonical binary property name. Binary(&'static str), /// The canonical general category name. GeneralCategory(&'static str), /// The canonical script name. Script(&'static str), /// An arbitrary association between property and value, both of which /// have been canonicalized. /// /// Note that by construction, the property name of ByValue will never /// be General_Category or Script. Those two cases are subsumed by the /// eponymous variants. ByValue { /// The canonical property name. property_name: &'static str, /// The canonical property value. property_value: &'static str, }, } /// Looks up a Unicode class given a query. If one doesn't exist, then /// `None` is returned. pub fn class(query: ClassQuery<'_>) -> Result { use self::CanonicalClassQuery::*; match query.canonicalize()? 
{ Binary(name) => bool_property(name), GeneralCategory(name) => gencat(name), Script(name) => script(name), ByValue { property_name: "Age", property_value } => { let mut class = hir::ClassUnicode::empty(); for set in ages(property_value)? { class.union(&hir_class(set)); } Ok(class) } ByValue { property_name: "Script_Extensions", property_value } => { script_extension(property_value) } ByValue { property_name: "Grapheme_Cluster_Break", property_value, } => gcb(property_value), ByValue { property_name: "Sentence_Break", property_value } => { sb(property_value) } ByValue { property_name: "Word_Break", property_value } => { wb(property_value) } _ => { // What else should we support? Err(Error::PropertyNotFound) } } } /// Returns a Unicode aware class for \w. /// /// This returns an error if the data is not available for \w. pub fn perl_word() -> Result { #[cfg(not(feature = "unicode-perl"))] fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(feature = "unicode-perl")] fn imp() -> Result { use crate::unicode_tables::perl_word::PERL_WORD; Ok(hir_class(PERL_WORD)) } imp() } /// Returns a Unicode aware class for \s. /// /// This returns an error if the data is not available for \s. pub fn perl_space() -> Result { #[cfg(not(any(feature = "unicode-perl", feature = "unicode-bool")))] fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] fn imp() -> Result { use crate::unicode_tables::perl_space::WHITE_SPACE; Ok(hir_class(WHITE_SPACE)) } #[cfg(feature = "unicode-bool")] fn imp() -> Result { use crate::unicode_tables::property_bool::WHITE_SPACE; Ok(hir_class(WHITE_SPACE)) } imp() } /// Returns a Unicode aware class for \d. /// /// This returns an error if the data is not available for \d. 
pub fn perl_digit() -> Result { #[cfg(not(any(feature = "unicode-perl", feature = "unicode-gencat")))] fn imp() -> Result { Err(Error::PerlClassNotFound) } #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] fn imp() -> Result { use crate::unicode_tables::perl_decimal::DECIMAL_NUMBER; Ok(hir_class(DECIMAL_NUMBER)) } #[cfg(feature = "unicode-gencat")] fn imp() -> Result { use crate::unicode_tables::general_category::DECIMAL_NUMBER; Ok(hir_class(DECIMAL_NUMBER)) } imp() } /// Build a Unicode HIR class from a sequence of Unicode scalar value ranges. pub fn hir_class(ranges: &[(char, char)]) -> hir::ClassUnicode { let hir_ranges: Vec = ranges .iter() .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e)) .collect(); hir::ClassUnicode::new(hir_ranges) } /// Returns true only if the given codepoint is in the `\w` character class. /// /// If the `unicode-perl` feature is not enabled, then this returns an error. pub fn is_word_character(c: char) -> Result { #[cfg(not(feature = "unicode-perl"))] fn imp(_: char) -> Result { Err(UnicodeWordError(())) } #[cfg(feature = "unicode-perl")] fn imp(c: char) -> Result { use crate::{is_word_byte, unicode_tables::perl_word::PERL_WORD}; if u8::try_from(c).map_or(false, is_word_byte) { return Ok(true); } Ok(PERL_WORD .binary_search_by(|&(start, end)| { use core::cmp::Ordering; if start <= c && c <= end { Ordering::Equal } else if start > c { Ordering::Greater } else { Ordering::Less } }) .is_ok()) } imp(c) } /// A mapping of property values for a specific property. /// /// The first element of each tuple is a normalized property value while the /// second element of each tuple is the corresponding canonical property /// value. 
type PropertyValues = &'static [(&'static str, &'static str)]; fn canonical_gencat( normalized_value: &str, ) -> Result, Error> { Ok(match normalized_value { "any" => Some("Any"), "assigned" => Some("Assigned"), "ascii" => Some("ASCII"), _ => { let gencats = property_values("General_Category")?.unwrap(); canonical_value(gencats, normalized_value) } }) } fn canonical_script( normalized_value: &str, ) -> Result, Error> { let scripts = property_values("Script")?.unwrap(); Ok(canonical_value(scripts, normalized_value)) } /// Find the canonical property name for the given normalized property name. /// /// If no such property exists, then `None` is returned. /// /// The normalized property name must have been normalized according to /// UAX44 LM3, which can be done using `symbolic_name_normalize`. /// /// If the property names data is not available, then an error is returned. fn canonical_prop( normalized_name: &str, ) -> Result, Error> { #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", )))] fn imp(_: &str) -> Result, Error> { Err(Error::PropertyNotFound) } #[cfg(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", ))] fn imp(name: &str) -> Result, Error> { use crate::unicode_tables::property_names::PROPERTY_NAMES; Ok(PROPERTY_NAMES .binary_search_by_key(&name, |&(n, _)| n) .ok() .map(|i| PROPERTY_NAMES[i].1)) } imp(normalized_name) } /// Find the canonical property value for the given normalized property /// value. /// /// The given property values should correspond to the values for the property /// under question, which can be found using `property_values`. /// /// If no such property value exists, then `None` is returned. 
/// /// The normalized property value must have been normalized according to /// UAX44 LM3, which can be done using `symbolic_name_normalize`. fn canonical_value( vals: PropertyValues, normalized_value: &str, ) -> Option<&'static str> { vals.binary_search_by_key(&normalized_value, |&(n, _)| n) .ok() .map(|i| vals[i].1) } /// Return the table of property values for the given property name. /// /// If the property values data is not available, then an error is returned. fn property_values( canonical_property_name: &'static str, ) -> Result, Error> { #[cfg(not(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", )))] fn imp(_: &'static str) -> Result, Error> { Err(Error::PropertyValueNotFound) } #[cfg(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", ))] fn imp(name: &'static str) -> Result, Error> { use crate::unicode_tables::property_values::PROPERTY_VALUES; Ok(PROPERTY_VALUES .binary_search_by_key(&name, |&(n, _)| n) .ok() .map(|i| PROPERTY_VALUES[i].1)) } imp(canonical_property_name) } // This is only used in some cases, but small enough to just let it be dead // instead of figuring out (and maintaining) the right set of features. #[allow(dead_code)] fn property_set( name_map: &'static [(&'static str, Range)], canonical: &'static str, ) -> Option { name_map .binary_search_by_key(&canonical, |x| x.0) .ok() .map(|i| name_map[i].1) } /// Returns an iterator over Unicode Age sets. Each item corresponds to a set /// of codepoints that were added in a particular revision of Unicode. The /// iterator yields items in chronological order. /// /// If the given age value isn't valid or if the data isn't available, then an /// error is returned instead. 
fn ages(canonical_age: &str) -> Result, Error> { #[cfg(not(feature = "unicode-age"))] fn imp(_: &str) -> Result, Error> { use core::option::IntoIter; Err::, _>(Error::PropertyNotFound) } #[cfg(feature = "unicode-age")] fn imp(canonical_age: &str) -> Result, Error> { use crate::unicode_tables::age; const AGES: &[(&str, Range)] = &[ ("V1_1", age::V1_1), ("V2_0", age::V2_0), ("V2_1", age::V2_1), ("V3_0", age::V3_0), ("V3_1", age::V3_1), ("V3_2", age::V3_2), ("V4_0", age::V4_0), ("V4_1", age::V4_1), ("V5_0", age::V5_0), ("V5_1", age::V5_1), ("V5_2", age::V5_2), ("V6_0", age::V6_0), ("V6_1", age::V6_1), ("V6_2", age::V6_2), ("V6_3", age::V6_3), ("V7_0", age::V7_0), ("V8_0", age::V8_0), ("V9_0", age::V9_0), ("V10_0", age::V10_0), ("V11_0", age::V11_0), ("V12_0", age::V12_0), ("V12_1", age::V12_1), ("V13_0", age::V13_0), ("V14_0", age::V14_0), ("V15_0", age::V15_0), ]; assert_eq!(AGES.len(), age::BY_NAME.len(), "ages are out of sync"); let pos = AGES.iter().position(|&(age, _)| canonical_age == age); match pos { None => Err(Error::PropertyValueNotFound), Some(i) => Ok(AGES[..=i].iter().map(|&(_, classes)| classes)), } } imp(canonical_age) } /// Returns the Unicode HIR class corresponding to the given general category. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given general category could not be found, or if the general /// category data is not available, then an error is returned. 
fn gencat(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-gencat"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-gencat")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::general_category::BY_NAME; match name { "ASCII" => Ok(hir_class(&[('\0', '\x7F')])), "Any" => Ok(hir_class(&[('\0', '\u{10FFFF}')])), "Assigned" => { let mut cls = gencat("Unassigned")?; cls.negate(); Ok(cls) } name => property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyValueNotFound), } } match canonical_name { "Decimal_Number" => perl_digit(), name => imp(name), } } /// Returns the Unicode HIR class corresponding to the given script. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given script could not be found, or if the script data is not /// available, then an error is returned. fn script(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-script"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-script")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::script::BY_NAME; property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyValueNotFound) } imp(canonical_name) } /// Returns the Unicode HIR class corresponding to the given script extension. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given script extension could not be found, or if the script data is /// not available, then an error is returned. 
fn script_extension( canonical_name: &'static str, ) -> Result { #[cfg(not(feature = "unicode-script"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-script")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::script_extension::BY_NAME; property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyValueNotFound) } imp(canonical_name) } /// Returns the Unicode HIR class corresponding to the given Unicode boolean /// property. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given boolean property could not be found, or if the boolean /// property data is not available, then an error is returned. fn bool_property( canonical_name: &'static str, ) -> Result { #[cfg(not(feature = "unicode-bool"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-bool")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::property_bool::BY_NAME; property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyNotFound) } match canonical_name { "Decimal_Number" => perl_digit(), "White_Space" => perl_space(), name => imp(name), } } /// Returns the Unicode HIR class corresponding to the given grapheme cluster /// break property. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. fn gcb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::grapheme_cluster_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyValueNotFound) } imp(canonical_name) } /// Returns the Unicode HIR class corresponding to the given word break /// property. 
/// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. fn wb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::word_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyValueNotFound) } imp(canonical_name) } /// Returns the Unicode HIR class corresponding to the given sentence /// break property. /// /// Name canonicalization is assumed to be performed by the caller. /// /// If the given property could not be found, or if the corresponding data is /// not available, then an error is returned. fn sb(canonical_name: &'static str) -> Result { #[cfg(not(feature = "unicode-segment"))] fn imp(_: &'static str) -> Result { Err(Error::PropertyNotFound) } #[cfg(feature = "unicode-segment")] fn imp(name: &'static str) -> Result { use crate::unicode_tables::sentence_break::BY_NAME; property_set(BY_NAME, name) .map(hir_class) .ok_or(Error::PropertyValueNotFound) } imp(canonical_name) } /// Like symbolic_name_normalize_bytes, but operates on a string. fn symbolic_name_normalize(x: &str) -> String { let mut tmp = x.as_bytes().to_vec(); let len = symbolic_name_normalize_bytes(&mut tmp).len(); tmp.truncate(len); // This should always succeed because `symbolic_name_normalize_bytes` // guarantees that `&tmp[..len]` is always valid UTF-8. // // N.B. We could avoid the additional UTF-8 check here, but it's unlikely // to be worth skipping the additional safety check. A benchmark must // justify it first. String::from_utf8(tmp).unwrap() } /// Normalize the given symbolic name in place according to UAX44-LM3. /// /// A "symbolic name" typically corresponds to property names and property /// value aliases. 
Note, though, that it should not be applied to property /// string values. /// /// The slice returned is guaranteed to be valid UTF-8 for all possible values /// of `slice`. /// /// See: https://unicode.org/reports/tr44/#UAX44-LM3 fn symbolic_name_normalize_bytes(slice: &mut [u8]) -> &mut [u8] { // I couldn't find a place in the standard that specified that property // names/aliases had a particular structure (unlike character names), but // we assume that it's ASCII only and drop anything that isn't ASCII. let mut start = 0; let mut starts_with_is = false; if slice.len() >= 2 { // Ignore any "is" prefix. starts_with_is = slice[0..2] == b"is"[..] || slice[0..2] == b"IS"[..] || slice[0..2] == b"iS"[..] || slice[0..2] == b"Is"[..]; if starts_with_is { start = 2; } } let mut next_write = 0; for i in start..slice.len() { // VALIDITY ARGUMENT: To guarantee that the resulting slice is valid // UTF-8, we ensure that the slice contains only ASCII bytes. In // particular, we drop every non-ASCII byte from the normalized string. let b = slice[i]; if b == b' ' || b == b'_' || b == b'-' { continue; } else if b'A' <= b && b <= b'Z' { slice[next_write] = b + (b'a' - b'A'); next_write += 1; } else if b <= 0x7F { slice[next_write] = b; next_write += 1; } } // Special case: ISO_Comment has a 'isc' abbreviation. Since we generally // ignore 'is' prefixes, the 'isc' abbreviation gets caught in the cross // fire and ends up creating an alias for 'c' to 'ISO_Comment', but it // is actually an alias for the 'Other' general category. 
if starts_with_is && next_write == 1 && slice[0] == b'c' { slice[0] = b'i'; slice[1] = b's'; slice[2] = b'c'; next_write = 3; } &mut slice[..next_write] } #[cfg(test)] mod tests { use super::*; #[cfg(feature = "unicode-case")] fn simple_fold_ok(c: char) -> impl Iterator { SimpleCaseFolder::new().unwrap().mapping(c).iter().copied() } #[cfg(feature = "unicode-case")] fn contains_case_map(start: char, end: char) -> bool { SimpleCaseFolder::new().unwrap().overlaps(start, end) } #[test] #[cfg(feature = "unicode-case")] fn simple_fold_k() { let xs: Vec = simple_fold_ok('k').collect(); assert_eq!(xs, alloc::vec!['K', 'K']); let xs: Vec = simple_fold_ok('K').collect(); assert_eq!(xs, alloc::vec!['k', 'K']); let xs: Vec = simple_fold_ok('K').collect(); assert_eq!(xs, alloc::vec!['K', 'k']); } #[test] #[cfg(feature = "unicode-case")] fn simple_fold_a() { let xs: Vec = simple_fold_ok('a').collect(); assert_eq!(xs, alloc::vec!['A']); let xs: Vec = simple_fold_ok('A').collect(); assert_eq!(xs, alloc::vec!['a']); } #[test] #[cfg(not(feature = "unicode-case"))] fn simple_fold_disabled() { assert!(SimpleCaseFolder::new().is_err()); } #[test] #[cfg(feature = "unicode-case")] fn range_contains() { assert!(contains_case_map('A', 'A')); assert!(contains_case_map('Z', 'Z')); assert!(contains_case_map('A', 'Z')); assert!(contains_case_map('@', 'A')); assert!(contains_case_map('Z', '[')); assert!(contains_case_map('☃', 'Ⰰ')); assert!(!contains_case_map('[', '[')); assert!(!contains_case_map('[', '`')); assert!(!contains_case_map('☃', '☃')); } #[test] #[cfg(feature = "unicode-gencat")] fn regression_466() { use super::{CanonicalClassQuery, ClassQuery}; let q = ClassQuery::OneLetter('C'); assert_eq!( q.canonicalize().unwrap(), CanonicalClassQuery::GeneralCategory("Other") ); } #[test] fn sym_normalize() { let sym_norm = symbolic_name_normalize; assert_eq!(sym_norm("Line_Break"), "linebreak"); assert_eq!(sym_norm("Line-break"), "linebreak"); assert_eq!(sym_norm("linebreak"), "linebreak"); 
assert_eq!(sym_norm("BA"), "ba"); assert_eq!(sym_norm("ba"), "ba"); assert_eq!(sym_norm("Greek"), "greek"); assert_eq!(sym_norm("isGreek"), "greek"); assert_eq!(sym_norm("IS_Greek"), "greek"); assert_eq!(sym_norm("isc"), "isc"); assert_eq!(sym_norm("is c"), "isc"); assert_eq!(sym_norm("is_c"), "isc"); } #[test] fn valid_utf8_symbolic() { let mut x = b"abc\xFFxyz".to_vec(); let y = symbolic_name_normalize_bytes(&mut x); assert_eq!(y, b"abcxyz"); } } regex-syntax-0.8.2/src/unicode_tables/LICENSE-UNICODE000064400000000000000000000054371046102023000201420ustar 00000000000000UNICODE, INC. LICENSE AGREEMENT - DATA FILES AND SOFTWARE Unicode Data Files include all data files under the directories http://www.unicode.org/Public/, http://www.unicode.org/reports/, http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and http://www.unicode.org/utility/trac/browser/. Unicode Data Files do not include PDF online code charts under the directory http://www.unicode.org/Public/. Software includes any source code published in the Unicode Standard or under the directories http://www.unicode.org/Public/, http://www.unicode.org/reports/, http://www.unicode.org/cldr/data/, http://source.icu-project.org/repos/icu/, and http://www.unicode.org/utility/trac/browser/. NOTICE TO USER: Carefully read the following legal agreement. BY DOWNLOADING, INSTALLING, COPYING OR OTHERWISE USING UNICODE INC.'S DATA FILES ("DATA FILES"), AND/OR SOFTWARE ("SOFTWARE"), YOU UNEQUIVOCALLY ACCEPT, AND AGREE TO BE BOUND BY, ALL OF THE TERMS AND CONDITIONS OF THIS AGREEMENT. IF YOU DO NOT AGREE, DO NOT DOWNLOAD, INSTALL, COPY, DISTRIBUTE OR USE THE DATA FILES OR SOFTWARE. COPYRIGHT AND PERMISSION NOTICE Copyright © 1991-2018 Unicode, Inc. All rights reserved. Distributed under the Terms of Use in http://www.unicode.org/copyright.html. 
Permission is hereby granted, free of charge, to any person obtaining a copy of the Unicode data files and any associated documentation (the "Data Files") or Unicode software and any associated documentation (the "Software") to deal in the Data Files or Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Data Files or Software, and to permit persons to whom the Data Files or Software are furnished to do so, provided that either (a) this copyright and permission notice appear with all copies of the Data Files or Software, or (b) this copyright and permission notice appear in associated Documentation. THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THE DATA FILES OR SOFTWARE. Except as contained in this notice, the name of a copyright holder shall not be used in advertising or otherwise to promote the sale, use or other dealings in these Data Files or Software without prior written authorization of the copyright holder. regex-syntax-0.8.2/src/unicode_tables/age.rs000064400000000000000000001143011046102023000171420ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate age ucd-15.0.0 --chars // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. 
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ ("V10_0", V10_0), ("V11_0", V11_0), ("V12_0", V12_0), ("V12_1", V12_1), ("V13_0", V13_0), ("V14_0", V14_0), ("V15_0", V15_0), ("V1_1", V1_1), ("V2_0", V2_0), ("V2_1", V2_1), ("V3_0", V3_0), ("V3_1", V3_1), ("V3_2", V3_2), ("V4_0", V4_0), ("V4_1", V4_1), ("V5_0", V5_0), ("V5_1", V5_1), ("V5_2", V5_2), ("V6_0", V6_0), ("V6_1", V6_1), ("V6_2", V6_2), ("V6_3", V6_3), ("V7_0", V7_0), ("V8_0", V8_0), ("V9_0", V9_0), ]; pub const V10_0: &'static [(char, char)] = &[ ('ࡠ', 'ࡪ'), ('ৼ', '৽'), ('\u{afa}', '\u{aff}'), ('\u{d00}', '\u{d00}'), ('\u{d3b}', '\u{d3c}'), ('᳷', '᳷'), ('\u{1df6}', '\u{1df9}'), ('₿', '₿'), ('⏿', '⏿'), ('⯒', '⯒'), ('⹅', '⹉'), ('ㄮ', 'ㄮ'), ('鿖', '鿪'), ('𐌭', '𐌯'), ('𑨀', '\u{11a47}'), ('𑩐', '𑪃'), ('𑪆', '𑪜'), ('𑪞', '𑪢'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d47}'), ('𑵐', '𑵙'), ('𖿡', '𖿡'), ('𛀂', '𛄞'), ('𛅰', '𛋻'), ('🉠', '🉥'), ('🛓', '🛔'), ('🛷', '🛸'), ('🤀', '🤋'), ('🤟', '🤟'), ('🤨', '🤯'), ('🤱', '🤲'), ('🥌', '🥌'), ('🥟', '🥫'), ('🦒', '🦗'), ('🧐', '🧦'), ('𬺰', '𮯠'), ]; pub const V11_0: &'static [(char, char)] = &[ ('ՠ', 'ՠ'), ('ֈ', 'ֈ'), ('ׯ', 'ׯ'), ('\u{7fd}', '߿'), ('\u{8d3}', '\u{8d3}'), ('\u{9fe}', '\u{9fe}'), ('੶', '੶'), ('\u{c04}', '\u{c04}'), ('಄', '಄'), ('ᡸ', 'ᡸ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('⮺', '⮼'), ('⯓', '⯫'), ('⯰', '⯾'), ('⹊', '⹎'), ('ㄯ', 'ㄯ'), ('鿫', '鿯'), ('ꞯ', 'ꞯ'), ('Ꞹ', 'ꞹ'), ('ꣾ', '\u{a8ff}'), ('𐨴', '𐨵'), ('𐩈', '𐩈'), ('𐴀', '\u{10d27}'), ('𐴰', '𐴹'), ('𐼀', '𐼧'), ('𐼰', '𐽙'), ('\u{110cd}', '\u{110cd}'), ('𑅄', '𑅆'), ('\u{1133b}', '\u{1133b}'), ('\u{1145e}', '\u{1145e}'), ('𑜚', '𑜚'), ('𑠀', '𑠻'), ('𑪝', '𑪝'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶘'), ('𑶠', '𑶩'), ('𑻠', '𑻸'), ('𖹀', '𖺚'), ('𘟭', '𘟱'), ('𝋠', '𝋳'), ('𝍲', '𝍸'), ('𞱱', '𞲴'), ('🄯', '🄯'), ('🛹', '🛹'), ('🟕', '🟘'), ('🥍', '🥏'), ('🥬', '🥰'), ('🥳', '🥶'), ('🥺', '🥺'), ('🥼', '🥿'), ('🦘', '🦢'), ('🦰', '🦹'), ('🧁', '🧂'), ('🧧', '🧿'), 
('🩠', '🩭'), ]; pub const V12_0: &'static [(char, char)] = &[ ('౷', '౷'), ('ຆ', 'ຆ'), ('ຉ', 'ຉ'), ('ຌ', 'ຌ'), ('ຎ', 'ຓ'), ('ຘ', 'ຘ'), ('ຠ', 'ຠ'), ('ຨ', 'ຩ'), ('ຬ', 'ຬ'), ('\u{eba}', '\u{eba}'), ('ᳺ', 'ᳺ'), ('⯉', '⯉'), ('⯿', '⯿'), ('⹏', '⹏'), ('Ꞻ', 'ꞿ'), ('Ꟃ', 'Ᶎ'), ('ꭦ', 'ꭧ'), ('𐿠', '𐿶'), ('𑑟', '𑑟'), ('𑚸', '𑚸'), ('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧤'), ('𑪄', '𑪅'), ('𑿀', '𑿱'), ('𑿿', '𑿿'), ('\u{13430}', '\u{13438}'), ('𖽅', '𖽊'), ('\u{16f4f}', '\u{16f4f}'), ('𖽿', '𖾇'), ('𖿢', '𖿣'), ('𘟲', '𘟷'), ('𛅐', '𛅒'), ('𛅤', '𛅧'), ('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏'), ('𞋀', '𞋹'), ('𞋿', '𞋿'), ('𞥋', '𞥋'), ('𞴁', '𞴽'), ('🅬', '🅬'), ('🛕', '🛕'), ('🛺', '🛺'), ('🟠', '🟫'), ('🤍', '🤏'), ('🤿', '🤿'), ('🥱', '🥱'), ('🥻', '🥻'), ('🦥', '🦪'), ('🦮', '🦯'), ('🦺', '🦿'), ('🧃', '🧊'), ('🧍', '🧏'), ('🨀', '🩓'), ('🩰', '🩳'), ('🩸', '🩺'), ('🪀', '🪂'), ('🪐', '🪕'), ]; pub const V12_1: &'static [(char, char)] = &[('㋿', '㋿')]; pub const V13_0: &'static [(char, char)] = &[ ('ࢾ', 'ࣇ'), ('\u{b55}', '\u{b55}'), ('ഄ', 'ഄ'), ('\u{d81}', '\u{d81}'), ('\u{1abf}', '\u{1ac0}'), ('⮗', '⮗'), ('⹐', '⹒'), ('ㆻ', 'ㆿ'), ('䶶', '䶿'), ('鿰', '鿼'), ('Ꟈ', 'ꟊ'), ('Ꟶ', 'ꟶ'), ('\u{a82c}', '\u{a82c}'), ('ꭨ', '꭫'), ('𐆜', '𐆜'), ('𐺀', '𐺩'), ('\u{10eab}', '𐺭'), ('𐺰', '𐺱'), ('𐾰', '𐿋'), ('𑅇', '𑅇'), ('𑇎', '\u{111cf}'), ('𑑚', '𑑚'), ('𑑠', '𑑡'), ('𑤀', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '𑥆'), ('𑥐', '𑥙'), ('𑾰', '𑾰'), ('\u{16fe4}', '\u{16fe4}'), ('𖿰', '𖿱'), ('𘫳', '𘳕'), ('𘴀', '𘴈'), ('🄍', '🄏'), ('🅭', '🅯'), ('🆭', '🆭'), ('🛖', '🛗'), ('🛻', '🛼'), ('🢰', '🢱'), ('🤌', '🤌'), ('🥲', '🥲'), ('🥷', '🥸'), ('🦣', '🦤'), ('🦫', '🦭'), ('🧋', '🧋'), ('🩴', '🩴'), ('🪃', '🪆'), ('🪖', '🪨'), ('🪰', '🪶'), ('🫀', '🫂'), ('🫐', '🫖'), ('🬀', '🮒'), ('🮔', '🯊'), ('🯰', '🯹'), ('𪛗', '𪛝'), ('𰀀', '𱍊'), ]; pub const V14_0: &'static [(char, char)] = &[ ('؝', '؝'), ('ࡰ', 'ࢎ'), ('\u{890}', '\u{891}'), ('\u{898}', '\u{89f}'), ('ࢵ', 'ࢵ'), ('ࣈ', '\u{8d2}'), ('\u{c3c}', '\u{c3c}'), ('ౝ', 'ౝ'), ('ೝ', 'ೝ'), ('ᜍ', 'ᜍ'), ('᜕', '᜕'), ('ᜟ', 'ᜟ'), 
('\u{180f}', '\u{180f}'), ('\u{1ac1}', '\u{1ace}'), ('ᭌ', 'ᭌ'), ('᭽', '᭾'), ('\u{1dfa}', '\u{1dfa}'), ('⃀', '⃀'), ('Ⱟ', 'Ⱟ'), ('ⱟ', 'ⱟ'), ('⹓', '⹝'), ('鿽', '鿿'), ('Ꟁ', 'ꟁ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꟴ'), ('﯂', '﯂'), ('﵀', '﵏'), ('﷏', '﷏'), ('﷾', '﷿'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐽰', '𐾉'), ('\u{11070}', '𑁵'), ('\u{110c2}', '\u{110c2}'), ('𑚹', '𑚹'), ('𑝀', '𑝆'), ('𑪰', '𑪿'), ('𒾐', '𒿲'), ('𖩰', '𖪾'), ('𖫀', '𖫉'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛄟', '𛄢'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('𜽐', '𜿃'), ('𝇩', '𝇪'), ('𝼀', '𝼞'), ('𞊐', '\u{1e2ae}'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('🛝', '🛟'), ('🟰', '🟰'), ('🥹', '🥹'), ('🧌', '🧌'), ('🩻', '🩼'), ('🪩', '🪬'), ('🪷', '🪺'), ('🫃', '🫅'), ('🫗', '🫙'), ('🫠', '🫧'), ('🫰', '🫶'), ('𪛞', '𪛟'), ('𫜵', '𫜸'), ]; pub const V15_0: &'static [(char, char)] = &[ ('ೳ', 'ೳ'), ('\u{ece}', '\u{ece}'), ('\u{10efd}', '\u{10eff}'), ('𑈿', '\u{11241}'), ('𑬀', '𑬉'), ('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙'), ('𓐯', '𓐯'), ('\u{13439}', '\u{13455}'), ('𛄲', '𛄲'), ('𛅕', '𛅕'), ('𝋀', '𝋓'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ('𞓐', '𞓹'), ('🛜', '🛜'), ('🝴', '🝶'), ('🝻', '🝿'), ('🟙', '🟙'), ('🩵', '🩷'), ('🪇', '🪈'), ('🪭', '🪯'), ('🪻', '🪽'), ('🪿', '🪿'), ('🫎', '🫏'), ('🫚', '🫛'), ('🫨', '🫨'), ('🫷', '🫸'), ('𫜹', '𫜹'), ('𱍐', '𲎯'), ]; pub const V1_1: &'static [(char, char)] = &[ ('\0', 'ǵ'), ('Ǻ', 'ȗ'), ('ɐ', 'ʨ'), ('ʰ', '˞'), ('ˠ', '˩'), ('\u{300}', '\u{345}'), ('\u{360}', '\u{361}'), ('ʹ', '͵'), ('ͺ', 'ͺ'), (';', ';'), ('΄', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ώ'), ('ϐ', 'ϖ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), ('Ϣ', 'ϳ'), ('Ё', 'Ќ'), ('Ў', 'я'), ('ё', 'ќ'), ('ў', '\u{486}'), ('Ґ', 'ӄ'), ('Ӈ', 'ӈ'), ('Ӌ', 'ӌ'), ('Ӑ', 'ӫ'), ('Ӯ', 'ӵ'), ('Ӹ', 'ӹ'), ('Ա', 'Ֆ'), ('ՙ', '՟'), ('ա', 'և'), ('։', '։'), ('\u{5b0}', '\u{5b9}'), ('\u{5bb}', '׃'), ('א', 'ת'), ('װ', '״'), ('،', '،'), ('؛', '؛'), ('؟', 
'؟'), ('ء', 'غ'), ('ـ', '\u{652}'), ('٠', '٭'), ('\u{670}', 'ڷ'), ('ں', 'ھ'), ('ۀ', 'ێ'), ('ې', '\u{6ed}'), ('۰', '۹'), ('\u{901}', 'ः'), ('अ', 'ह'), ('\u{93c}', '\u{94d}'), ('ॐ', '\u{954}'), ('क़', '॰'), ('\u{981}', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('\u{9bc}', '\u{9bc}'), ('\u{9be}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', '\u{9cd}'), ('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('০', '৺'), ('\u{a02}', '\u{a02}'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', 'ੴ'), ('\u{a81}', 'ઃ'), ('અ', 'ઋ'), ('ઍ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('ૐ', 'ૐ'), ('ૠ', 'ૠ'), ('૦', '૯'), ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଶ', 'ହ'), ('\u{b3c}', '\u{b43}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b56}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('୦', '୰'), ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'வ'), ('ஷ', 'ஹ'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('\u{bd7}', '\u{bd7}'), ('௧', '௲'), ('ఁ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'ళ'), ('వ', 'హ'), ('\u{c3e}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('ౠ', 'ౡ'), ('౦', '౯'), ('ಂ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಾ', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('ೞ', 'ೞ'), ('ೠ', 'ೡ'), ('೦', '೯'), ('ം', 'ഃ'), ('അ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ന'), ('പ', 'ഹ'), ('\u{d3e}', '\u{d43}'), ('െ', 'ൈ'), ('ൊ', '\u{d4d}'), ('\u{d57}', '\u{d57}'), ('ൠ', 'ൡ'), ('൦', '൯'), ('ก', '\u{e3a}'), ('฿', '๛'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ງ', 'ຈ'), ('ຊ', 'ຊ'), ('ຍ', 'ຍ'), ('ດ', 'ທ'), ('ນ', 'ຟ'), ('ມ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ວ'), ('ສ', 'ຫ'), 
('ອ', '\u{eb9}'), ('\u{ebb}', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ecd}'), ('໐', '໙'), ('ໜ', 'ໝ'), ('Ⴀ', 'Ⴥ'), ('ა', 'ჶ'), ('჻', '჻'), ('ᄀ', 'ᅙ'), ('ᅟ', 'ᆢ'), ('ᆨ', 'ᇹ'), ('Ḁ', 'ẚ'), ('Ạ', 'ỹ'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), ('ῲ', 'ῴ'), ('ῶ', '῾'), ('\u{2000}', '\u{202e}'), ('‰', '⁆'), ('\u{206a}', '⁰'), ('⁴', '₎'), ('₠', '₪'), ('\u{20d0}', '\u{20e1}'), ('℀', 'ℸ'), ('⅓', 'ↂ'), ('←', '⇪'), ('∀', '⋱'), ('⌀', '⌀'), ('⌂', '⍺'), ('␀', '␤'), ('⑀', '⑊'), ('①', '⓪'), ('─', '▕'), ('■', '◯'), ('☀', '☓'), ('☚', '♯'), ('✁', '✄'), ('✆', '✉'), ('✌', '✧'), ('✩', '❋'), ('❍', '❍'), ('❏', '❒'), ('❖', '❖'), ('❘', '❞'), ('❡', '❧'), ('❶', '➔'), ('➘', '➯'), ('➱', '➾'), ('\u{3000}', '〷'), ('〿', '〿'), ('ぁ', 'ゔ'), ('\u{3099}', 'ゞ'), ('ァ', 'ヾ'), ('ㄅ', 'ㄬ'), ('ㄱ', 'ㆎ'), ('㆐', '㆟'), ('㈀', '㈜'), ('㈠', '㉃'), ('㉠', '㉻'), ('㉿', '㊰'), ('㋀', '㋋'), ('㋐', '㋾'), ('㌀', '㍶'), ('㍻', '㏝'), ('㏠', '㏾'), ('一', '龥'), ('\u{e000}', '鶴'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('\u{fb1e}', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', '﴿'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('\u{fe20}', '\u{fe23}'), ('︰', '﹄'), ('﹉', '﹒'), ('﹔', '﹦'), ('﹨', '﹫'), ('ﹰ', 'ﹲ'), ('ﹴ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('\u{feff}', '\u{feff}'), ('!', '~'), ('。', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('¢', '₩'), ('│', '○'), ('�', '\u{ffff}'), ]; pub const V2_0: &'static [(char, char)] = &[ ('\u{591}', '\u{5a1}'), ('\u{5a3}', '\u{5af}'), ('\u{5c4}', '\u{5c4}'), ('ༀ', 'ཇ'), ('ཉ', 'ཀྵ'), ('\u{f71}', 'ྋ'), ('\u{f90}', '\u{f95}'), ('\u{f97}', '\u{f97}'), ('\u{f99}', '\u{fad}'), ('\u{fb1}', '\u{fb7}'), ('\u{fb9}', '\u{fb9}'), ('ẛ', 'ẛ'), ('₫', '₫'), ('가', '힣'), ('\u{1fffe}', '\u{1ffff}'), ('\u{2fffe}', '\u{2ffff}'), ('\u{3fffe}', '\u{3ffff}'), ('\u{4fffe}', '\u{4ffff}'), ('\u{5fffe}', '\u{5ffff}'), ('\u{6fffe}', '\u{6ffff}'), ('\u{7fffe}', '\u{7ffff}'), ('\u{8fffe}', 
'\u{8ffff}'), ('\u{9fffe}', '\u{9ffff}'), ('\u{afffe}', '\u{affff}'), ('\u{bfffe}', '\u{bffff}'), ('\u{cfffe}', '\u{cffff}'), ('\u{dfffe}', '\u{dffff}'), ('\u{efffe}', '\u{10ffff}'), ]; pub const V2_1: &'static [(char, char)] = &[('€', '€'), ('', '')]; pub const V3_0: &'static [(char, char)] = &[ ('Ƕ', 'ǹ'), ('Ș', 'ȟ'), ('Ȣ', 'ȳ'), ('ʩ', 'ʭ'), ('˟', '˟'), ('˪', 'ˮ'), ('\u{346}', '\u{34e}'), ('\u{362}', '\u{362}'), ('ϗ', 'ϗ'), ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('Ѐ', 'Ѐ'), ('Ѝ', 'Ѝ'), ('ѐ', 'ѐ'), ('ѝ', 'ѝ'), ('\u{488}', '\u{489}'), ('Ҍ', 'ҏ'), ('Ӭ', 'ӭ'), ('֊', '֊'), ('\u{653}', '\u{655}'), ('ڸ', 'ڹ'), ('ڿ', 'ڿ'), ('ۏ', 'ۏ'), ('ۺ', '۾'), ('܀', '܍'), ('\u{70f}', 'ܬ'), ('\u{730}', '\u{74a}'), ('ހ', '\u{7b0}'), ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('ෲ', '෴'), ('ཪ', 'ཪ'), ('\u{f96}', '\u{f96}'), ('\u{fae}', '\u{fb0}'), ('\u{fb8}', '\u{fb8}'), ('\u{fba}', '\u{fbc}'), ('྾', '࿌'), ('࿏', '࿏'), ('က', 'အ'), ('ဣ', 'ဧ'), ('ဩ', 'ဪ'), ('ာ', '\u{1032}'), ('\u{1036}', '\u{1039}'), ('၀', '\u{1059}'), ('ሀ', 'ሆ'), ('ለ', 'ቆ'), ('ቈ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኆ'), ('ኈ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኮ'), ('ኰ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዎ'), ('ዐ', 'ዖ'), ('ዘ', 'ዮ'), ('ደ', 'ጎ'), ('ጐ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ጞ'), ('ጠ', 'ፆ'), ('ፈ', 'ፚ'), ('፡', '፼'), ('Ꭰ', 'Ᏼ'), ('ᐁ', 'ᙶ'), ('\u{1680}', '᚜'), ('ᚠ', 'ᛰ'), ('ក', 'ៜ'), ('០', '៩'), ('᠀', '\u{180e}'), ('᠐', '᠙'), ('ᠠ', 'ᡷ'), ('ᢀ', '\u{18a9}'), ('\u{202f}', '\u{202f}'), ('⁈', '⁍'), ('₭', '₯'), ('\u{20e2}', '\u{20e3}'), ('ℹ', '℺'), ('Ↄ', 'Ↄ'), ('⇫', '⇳'), ('⌁', '⌁'), ('⍻', '⍻'), ('⍽', '⎚'), ('␥', '␦'), ('◰', '◷'), ('☙', '☙'), ('♰', '♱'), ('⠀', '⣿'), ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), ('〸', '〺'), ('〾', '〾'), ('ㆠ', 'ㆷ'), ('㐀', '䶵'), ('ꀀ', 'ꒌ'), ('꒐', '꒡'), ('꒤', '꒳'), ('꒵', '꓀'), ('꓂', '꓄'), ('꓆', '꓆'), ('יִ', 'יִ'), ('\u{fff9}', 
'\u{fffb}'), ]; pub const V3_1: &'static [(char, char)] = &[ ('ϴ', 'ϵ'), ('\u{fdd0}', '\u{fdef}'), ('𐌀', '𐌞'), ('𐌠', '𐌣'), ('𐌰', '𐍊'), ('𐐀', '𐐥'), ('𐐨', '𐑍'), ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄪', '𝇝'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓀'), ('𝓂', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚣'), ('𝚨', '𝟉'), ('𝟎', '𝟿'), ('𠀀', '𪛖'), ('丽', '𪘀'), ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), ]; pub const V3_2: &'static [(char, char)] = &[ ('Ƞ', 'Ƞ'), ('\u{34f}', '\u{34f}'), ('\u{363}', '\u{36f}'), ('Ϙ', 'ϙ'), ('϶', '϶'), ('Ҋ', 'ҋ'), ('Ӆ', 'ӆ'), ('Ӊ', 'ӊ'), ('Ӎ', 'ӎ'), ('Ԁ', 'ԏ'), ('ٮ', 'ٯ'), ('ޱ', 'ޱ'), ('ჷ', 'ჸ'), ('ᜀ', 'ᜌ'), ('ᜎ', '\u{1714}'), ('ᜠ', '᜶'), ('ᝀ', '\u{1753}'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}'), ('⁇', '⁇'), ('⁎', '⁒'), ('⁗', '⁗'), ('\u{205f}', '\u{2063}'), ('ⁱ', 'ⁱ'), ('₰', '₱'), ('\u{20e4}', '\u{20ea}'), ('ℽ', '⅋'), ('⇴', '⇿'), ('⋲', '⋿'), ('⍼', '⍼'), ('⎛', '⏎'), ('⓫', '⓾'), ('▖', '▟'), ('◸', '◿'), ('☖', '☗'), ('♲', '♽'), ('⚀', '⚉'), ('❨', '❵'), ('⟐', '⟫'), ('⟰', '⟿'), ('⤀', '⫿'), ('〻', '〽'), ('ゕ', 'ゖ'), ('ゟ', '゠'), ('ヿ', 'ヿ'), ('ㇰ', 'ㇿ'), ('㉑', '㉟'), ('㊱', '㊿'), ('꒢', '꒣'), ('꒴', '꒴'), ('꓁', '꓁'), ('꓅', '꓅'), ('侮', '頻'), ('﷼', '﷼'), ('\u{fe00}', '\u{fe0f}'), ('﹅', '﹆'), ('ﹳ', 'ﹳ'), ('⦅', '⦆'), ]; pub const V4_0: &'static [(char, char)] = &[ ('ȡ', 'ȡ'), ('ȴ', 'ȶ'), ('ʮ', 'ʯ'), ('˯', '˿'), ('\u{350}', '\u{357}'), ('\u{35d}', '\u{35f}'), ('Ϸ', 'ϻ'), ('\u{600}', '\u{603}'), ('؍', '\u{615}'), ('\u{656}', '\u{658}'), ('ۮ', 'ۯ'), ('ۿ', 'ۿ'), ('ܭ', 'ܯ'), ('ݍ', 'ݏ'), ('ऄ', 'ऄ'), ('ঽ', 'ঽ'), ('\u{a01}', '\u{a01}'), ('ਃ', 'ਃ'), ('ઌ', 'ઌ'), ('ૡ', '\u{ae3}'), ('૱', '૱'), ('ଵ', 'ଵ'), ('ୱ', 'ୱ'), ('௳', '௺'), ('\u{cbc}', 'ಽ'), ('\u{17dd}', '\u{17dd}'), ('៰', '៹'), ('ᤀ', 'ᤜ'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('᥀', '᥀'), ('᥄', 'ᥭ'), ('ᥰ', 'ᥴ'), ('᧠', '᧿'), ('ᴀ', 'ᵫ'), ('⁓', '⁔'), ('℻', '℻'), ('⏏', '⏐'), 
('⓿', '⓿'), ('☔', '☕'), ('⚊', '⚑'), ('⚠', '⚡'), ('⬀', '⬍'), ('㈝', '㈞'), ('㉐', '㉐'), ('㉼', '㉽'), ('㋌', '㋏'), ('㍷', '㍺'), ('㏞', '㏟'), ('㏿', '㏿'), ('䷀', '䷿'), ('﷽', '﷽'), ('﹇', '﹈'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', '𐄿'), ('𐎀', '𐎝'), ('𐎟', '𐎟'), ('𐐦', '𐐧'), ('𐑎', '𐒝'), ('𐒠', '𐒩'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿'), ('𝌀', '𝍖'), ('𝓁', '𝓁'), ('\u{e0100}', '\u{e01ef}'), ]; pub const V4_1: &'static [(char, char)] = &[ ('ȷ', 'Ɂ'), ('\u{358}', '\u{35c}'), ('ϼ', 'Ͽ'), ('Ӷ', 'ӷ'), ('\u{5a2}', '\u{5a2}'), ('\u{5c5}', '\u{5c7}'), ('؋', '؋'), ('؞', '؞'), ('\u{659}', '\u{65e}'), ('ݐ', 'ݭ'), ('ॽ', 'ॽ'), ('ৎ', 'ৎ'), ('ஶ', 'ஶ'), ('௦', '௦'), ('࿐', '࿑'), ('ჹ', 'ჺ'), ('ჼ', 'ჼ'), ('ሇ', 'ሇ'), ('ቇ', 'ቇ'), ('ኇ', 'ኇ'), ('ኯ', 'ኯ'), ('ዏ', 'ዏ'), ('ዯ', 'ዯ'), ('ጏ', 'ጏ'), ('ጟ', 'ጟ'), ('ፇ', 'ፇ'), ('\u{135f}', '፠'), ('ᎀ', '᎙'), ('ᦀ', 'ᦩ'), ('ᦰ', 'ᧉ'), ('᧐', '᧙'), ('᧞', '᧟'), ('ᨀ', '\u{1a1b}'), ('᨞', '᨟'), ('ᵬ', '\u{1dc3}'), ('⁕', '⁖'), ('⁘', '⁞'), ('ₐ', 'ₔ'), ('₲', '₵'), ('\u{20eb}', '\u{20eb}'), ('ℼ', 'ℼ'), ('⅌', '⅌'), ('⏑', '⏛'), ('☘', '☘'), ('♾', '♿'), ('⚒', '⚜'), ('⚢', '⚱'), ('⟀', '⟆'), ('⬎', '⬓'), ('Ⰰ', 'Ⱞ'), ('ⰰ', 'ⱞ'), ('Ⲁ', '⳪'), ('⳹', 'ⴥ'), ('ⴰ', 'ⵥ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('⸀', '⸗'), ('⸜', '⸝'), ('㇀', '㇏'), ('㉾', '㉾'), ('龦', '龻'), ('꜀', '꜖'), ('ꠀ', '꠫'), ('並', '龎'), ('︐', '︙'), ('𐅀', '𐆊'), ('𐎠', '𐏃'), ('𐏈', '𐏕'), ('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨳'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '𐩇'), ('𐩐', '𐩘'), ('𝈀', '𝉅'), ('𝚤', '𝚥'), ]; pub const V5_0: &'static [(char, char)] = &[ ('ɂ', 'ɏ'), ('ͻ', 'ͽ'), ('ӏ', 'ӏ'), ('Ӻ', 'ӿ'), ('Ԑ', 'ԓ'), ('\u{5ba}', '\u{5ba}'), ('߀', 'ߺ'), ('ॻ', 'ॼ'), ('ॾ', 'ॿ'), ('\u{ce2}', '\u{ce3}'), ('ೱ', 'ೲ'), ('\u{1b00}', 'ᭋ'), ('᭐', '᭼'), ('\u{1dc4}', '\u{1dca}'), ('\u{1dfe}', '\u{1dff}'), 
('\u{20ec}', '\u{20ef}'), ('⅍', 'ⅎ'), ('ↄ', 'ↄ'), ('⏜', '⏧'), ('⚲', '⚲'), ('⟇', '⟊'), ('⬔', '⬚'), ('⬠', '⬣'), ('Ⱡ', 'ⱬ'), ('ⱴ', 'ⱷ'), ('ꜗ', 'ꜚ'), ('꜠', '꜡'), ('ꡀ', '꡷'), ('𐤀', '𐤙'), ('𐤟', '𐤟'), ('𒀀', '𒍮'), ('𒐀', '𒑢'), ('𒑰', '𒑳'), ('𝍠', '𝍱'), ('𝟊', '𝟋'), ]; pub const V5_1: &'static [(char, char)] = &[ ('Ͱ', 'ͳ'), ('Ͷ', 'ͷ'), ('Ϗ', 'Ϗ'), ('\u{487}', '\u{487}'), ('Ԕ', 'ԣ'), ('؆', '؊'), ('\u{616}', '\u{61a}'), ('ػ', 'ؿ'), ('ݮ', 'ݿ'), ('ॱ', 'ॲ'), ('\u{a51}', '\u{a51}'), ('\u{a75}', '\u{a75}'), ('\u{b44}', '\u{b44}'), ('\u{b62}', '\u{b63}'), ('ௐ', 'ௐ'), ('ఽ', 'ఽ'), ('ౘ', 'ౙ'), ('\u{c62}', '\u{c63}'), ('౸', '౿'), ('ഽ', 'ഽ'), ('\u{d44}', '\u{d44}'), ('\u{d62}', '\u{d63}'), ('൰', '൵'), ('൹', 'ൿ'), ('ཫ', 'ཬ'), ('࿎', '࿎'), ('࿒', '࿔'), ('ဢ', 'ဢ'), ('ဨ', 'ဨ'), ('ါ', 'ါ'), ('\u{1033}', '\u{1035}'), ('\u{103a}', 'ဿ'), ('ၚ', '႙'), ('႞', '႟'), ('ᢪ', 'ᢪ'), ('\u{1b80}', '᮪'), ('ᮮ', '᮹'), ('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', '᱿'), ('\u{1dcb}', '\u{1de6}'), ('ẜ', 'ẟ'), ('Ỻ', 'ỿ'), ('\u{2064}', '\u{2064}'), ('\u{20f0}', '\u{20f0}'), ('⅏', '⅏'), ('ↅ', 'ↈ'), ('⚝', '⚝'), ('⚳', '⚼'), ('⛀', '⛃'), ('⟌', '⟌'), ('⟬', '⟯'), ('⬛', '⬟'), ('⬤', '⭌'), ('⭐', '⭔'), ('Ɑ', 'Ɐ'), ('ⱱ', 'ⱳ'), ('ⱸ', 'ⱽ'), ('\u{2de0}', '\u{2dff}'), ('⸘', '⸛'), ('⸞', '⸰'), ('ㄭ', 'ㄭ'), ('㇐', '㇣'), ('龼', '鿃'), ('ꔀ', 'ꘫ'), ('Ꙁ', 'ꙟ'), ('Ꙣ', '꙳'), ('\u{a67c}', 'ꚗ'), ('ꜛ', 'ꜟ'), ('Ꜣ', 'ꞌ'), ('ꟻ', 'ꟿ'), ('ꢀ', '\u{a8c4}'), ('꣎', '꣙'), ('꤀', '꥓'), ('꥟', '꥟'), ('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟'), ('\u{fe24}', '\u{fe26}'), ('𐆐', '𐆛'), ('𐇐', '\u{101fd}'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐤠', '𐤹'), ('𐤿', '𐤿'), ('𝄩', '𝄩'), ('🀀', '🀫'), ('🀰', '🂓'), ]; pub const V5_2: &'static [(char, char)] = &[ ('Ԥ', 'ԥ'), ('ࠀ', '\u{82d}'), ('࠰', '࠾'), ('\u{900}', '\u{900}'), ('ॎ', 'ॎ'), ('\u{955}', '\u{955}'), ('ॹ', 'ॺ'), ('৻', '৻'), ('࿕', '࿘'), ('ႚ', '\u{109d}'), ('ᅚ', 'ᅞ'), ('ᆣ', 'ᆧ'), ('ᇺ', 'ᇿ'), ('᐀', '᐀'), ('ᙷ', 'ᙿ'), ('ᢰ', 'ᣵ'), ('ᦪ', 'ᦫ'), ('᧚', '᧚'), ('ᨠ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '᪉'), ('᪐', '᪙'), ('᪠', 
'᪭'), ('\u{1cd0}', 'ᳲ'), ('\u{1dfd}', '\u{1dfd}'), ('₶', '₸'), ('⅐', '⅒'), ('↉', '↉'), ('⏨', '⏨'), ('⚞', '⚟'), ('⚽', '⚿'), ('⛄', '⛍'), ('⛏', '⛡'), ('⛣', '⛣'), ('⛨', '⛿'), ('❗', '❗'), ('⭕', '⭙'), ('Ɒ', 'Ɒ'), ('Ȿ', 'Ɀ'), ('Ⳬ', '\u{2cf1}'), ('⸱', '⸱'), ('㉄', '㉏'), ('鿄', '鿋'), ('ꓐ', '꓿'), ('ꚠ', '꛷'), ('꠰', '꠹'), ('\u{a8e0}', 'ꣻ'), ('ꥠ', 'ꥼ'), ('\u{a980}', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟'), ('ꩠ', 'ꩻ'), ('ꪀ', 'ꫂ'), ('ꫛ', '꫟'), ('ꯀ', '\u{abed}'), ('꯰', '꯹'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('恵', '舘'), ('𐡀', '𐡕'), ('𐡗', '𐡟'), ('𐤚', '𐤛'), ('𐩠', '𐩿'), ('𐬀', '𐬵'), ('𐬹', '𐭕'), ('𐭘', '𐭲'), ('𐭸', '𐭿'), ('𐰀', '𐱈'), ('𐹠', '𐹾'), ('\u{11080}', '𑃁'), ('𓀀', '𓐮'), ('🄀', '🄊'), ('🄐', '🄮'), ('🄱', '🄱'), ('🄽', '🄽'), ('🄿', '🄿'), ('🅂', '🅂'), ('🅆', '🅆'), ('🅊', '🅎'), ('🅗', '🅗'), ('🅟', '🅟'), ('🅹', '🅹'), ('🅻', '🅼'), ('🅿', '🅿'), ('🆊', '🆍'), ('🆐', '🆐'), ('🈀', '🈀'), ('🈐', '🈱'), ('🉀', '🉈'), ('𪜀', '𫜴'), ]; pub const V6_0: &'static [(char, char)] = &[ ('Ԧ', 'ԧ'), ('ؠ', 'ؠ'), ('\u{65f}', '\u{65f}'), ('ࡀ', '\u{85b}'), ('࡞', '࡞'), ('\u{93a}', 'ऻ'), ('ॏ', 'ॏ'), ('\u{956}', '\u{957}'), ('ॳ', 'ॷ'), ('୲', '୷'), ('ഩ', 'ഩ'), ('ഺ', 'ഺ'), ('ൎ', 'ൎ'), ('ྌ', '\u{f8f}'), ('࿙', '࿚'), ('\u{135d}', '\u{135e}'), ('ᯀ', '᯳'), ('᯼', '᯿'), ('\u{1dfc}', '\u{1dfc}'), ('ₕ', 'ₜ'), ('₹', '₹'), ('⏩', '⏳'), ('⛎', '⛎'), ('⛢', '⛢'), ('⛤', '⛧'), ('✅', '✅'), ('✊', '✋'), ('✨', '✨'), ('❌', '❌'), ('❎', '❎'), ('❓', '❕'), ('❟', '❠'), ('➕', '➗'), ('➰', '➰'), ('➿', '➿'), ('⟎', '⟏'), ('⵰', '⵰'), ('\u{2d7f}', '\u{2d7f}'), ('ㆸ', 'ㆺ'), ('Ꙡ', 'ꙡ'), ('Ɥ', 'ꞎ'), ('Ꞑ', 'ꞑ'), ('Ꞡ', 'ꞩ'), ('ꟺ', 'ꟺ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('﮲', '﯁'), ('𑀀', '𑁍'), ('𑁒', '𑁯'), ('𖠀', '𖨸'), ('𛀀', '𛀁'), ('🂠', '🂮'), ('🂱', '🂾'), ('🃁', '🃏'), ('🃑', '🃟'), ('🄰', '🄰'), ('🄲', '🄼'), ('🄾', '🄾'), ('🅀', '🅁'), ('🅃', '🅅'), ('🅇', '🅉'), ('🅏', '🅖'), ('🅘', '🅞'), ('🅠', '🅩'), ('🅰', '🅸'), ('🅺', '🅺'), ('🅽', '🅾'), ('🆀', '🆉'), ('🆎', '🆏'), ('🆑', '🆚'), ('🇦', '🇿'), ('🈁', '🈂'), ('🈲', '🈺'), ('🉐', '🉑'), ('🌀', '🌠'), ('🌰', '🌵'), ('🌷', '🍼'), ('🎀', '🎓'), ('🎠', '🏄'), ('🏆', '🏊'), 
('🏠', '🏰'), ('🐀', '🐾'), ('👀', '👀'), ('👂', '📷'), ('📹', '📼'), ('🔀', '🔽'), ('🕐', '🕧'), ('🗻', '🗿'), ('😁', '😐'), ('😒', '😔'), ('😖', '😖'), ('😘', '😘'), ('😚', '😚'), ('😜', '😞'), ('😠', '😥'), ('😨', '😫'), ('😭', '😭'), ('😰', '😳'), ('😵', '🙀'), ('🙅', '🙏'), ('🚀', '🛅'), ('🜀', '🝳'), ('𫝀', '𫠝'), ]; pub const V6_1: &'static [(char, char)] = &[ ('֏', '֏'), ('\u{604}', '\u{604}'), ('ࢠ', 'ࢠ'), ('ࢢ', 'ࢬ'), ('\u{8e4}', '\u{8fe}'), ('૰', '૰'), ('ໞ', 'ໟ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ჽ', 'ჿ'), ('\u{1bab}', '\u{1bad}'), ('ᮺ', 'ᮿ'), ('᳀', '᳇'), ('ᳳ', 'ᳶ'), ('⟋', '⟋'), ('⟍', '⟍'), ('Ⳳ', 'ⳳ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⵦ', 'ⵧ'), ('⸲', '⸻'), ('鿌', '鿌'), ('\u{a674}', '\u{a67b}'), ('\u{a69f}', '\u{a69f}'), ('Ꞓ', 'ꞓ'), ('Ɦ', 'Ɦ'), ('ꟸ', 'ꟹ'), ('ꫠ', '\u{aaf6}'), ('郞', '隷'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑅃'), ('\u{11180}', '𑇈'), ('𑇐', '𑇙'), ('𑚀', '\u{116b7}'), ('𑛀', '𑛉'), ('𖼀', '𖽄'), ('𖽐', '𖽾'), ('\u{16f8f}', '𖾟'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), ('🅪', '🅫'), ('🕀', '🕃'), ('😀', '😀'), ('😑', '😑'), ('😕', '😕'), ('😗', '😗'), ('😙', '😙'), ('😛', '😛'), ('😟', '😟'), ('😦', '😧'), ('😬', '😬'), ('😮', '😯'), ('😴', '😴'), ]; pub const V6_2: &'static [(char, char)] = &[('₺', '₺')]; pub const V6_3: &'static [(char, char)] = &[('\u{61c}', '\u{61c}'), ('\u{2066}', '\u{2069}')]; pub const V7_0: &'static [(char, char)] = &[ ('Ϳ', 'Ϳ'), ('Ԩ', 'ԯ'), ('֍', '֎'), ('\u{605}', '\u{605}'), ('ࢡ', 'ࢡ'), ('ࢭ', 'ࢲ'), ('\u{8ff}', '\u{8ff}'), ('ॸ', 'ॸ'), ('ঀ', 'ঀ'), ('\u{c00}', '\u{c00}'), ('ఴ', 'ఴ'), ('\u{c81}', '\u{c81}'), ('\u{d01}', '\u{d01}'), ('෦', '෯'), ('ᛱ', 'ᛸ'), ('ᤝ', 'ᤞ'), ('\u{1ab0}', '\u{1abe}'), ('\u{1cf8}', 
'\u{1cf9}'), ('\u{1de7}', '\u{1df5}'), ('₻', '₽'), ('⏴', '⏺'), ('✀', '✀'), ('⭍', '⭏'), ('⭚', '⭳'), ('⭶', '⮕'), ('⮘', '⮹'), ('⮽', '⯈'), ('⯊', '⯑'), ('⸼', '⹂'), ('Ꚙ', 'ꚝ'), ('ꞔ', 'ꞟ'), ('Ɜ', 'Ɬ'), ('Ʞ', 'Ʇ'), ('ꟷ', 'ꟷ'), ('ꧠ', 'ꧾ'), ('\u{aa7c}', 'ꩿ'), ('ꬰ', 'ꭟ'), ('ꭤ', 'ꭥ'), ('\u{fe27}', '\u{fe2d}'), ('𐆋', '𐆌'), ('𐆠', '𐆠'), ('\u{102e0}', '𐋻'), ('𐌟', '𐌟'), ('𐍐', '\u{1037a}'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕯', '𐕯'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐡠', '𐢞'), ('𐢧', '𐢯'), ('𐪀', '𐪟'), ('𐫀', '\u{10ae6}'), ('𐫫', '𐫶'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯'), ('\u{1107f}', '\u{1107f}'), ('𑅐', '𑅶'), ('𑇍', '𑇍'), ('𑇚', '𑇚'), ('𑇡', '𑇴'), ('𑈀', '𑈑'), ('𑈓', '𑈽'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹'), ('\u{11301}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('\u{1133c}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('\u{11357}', '\u{11357}'), ('𑍝', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑒀', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '\u{115b5}'), ('𑖸', '𑗉'), ('𑘀', '𑙄'), ('𑙐', '𑙙'), ('𑢠', '𑣲'), ('𑣿', '𑣿'), ('𑫀', '𑫸'), ('𒍯', '𒎘'), ('𒑣', '𒑮'), ('𒑴', '𒑴'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯'), ('𖫐', '𖫭'), ('\u{16af0}', '𖫵'), ('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}'), ('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}'), ('🂿', '🂿'), ('🃠', '🃵'), ('🄋', '🄌'), ('🌡', '🌬'), ('🌶', '🌶'), ('🍽', '🍽'), ('🎔', '🎟'), ('🏅', '🏅'), ('🏋', '🏎'), ('🏔', '🏟'), ('🏱', '🏷'), ('🐿', '🐿'), ('👁', '👁'), ('📸', '📸'), ('📽', '📾'), ('🔾', '🔿'), ('🕄', '🕊'), ('🕨', '🕹'), ('🕻', '🖣'), ('🖥', '🗺'), ('🙁', '🙂'), ('🙐', '🙿'), ('🛆', '🛏'), ('🛠', '🛬'), ('🛰', '🛳'), ('🞀', '🟔'), ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ]; pub const V8_0: &'static [(char, char)] = &[ ('ࢳ', 'ࢴ'), ('\u{8e3}', '\u{8e3}'), ('ૹ', 'ૹ'), ('ౚ', 'ౚ'), ('ൟ', 'ൟ'), ('Ᏽ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('₾', '₾'), ('↊', '↋'), ('⯬', '⯯'), ('鿍', '鿕'), ('\u{a69e}', '\u{a69e}'), ('ꞏ', 'ꞏ'), ('Ʝ', 'ꞷ'), ('꣼', 'ꣽ'), ('ꭠ', 'ꭣ'), ('ꭰ', 'ꮿ'), ('\u{fe2e}', '\u{fe2f}'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), 
('𐣻', '𐣿'), ('𐦼', '𐦽'), ('𐧀', '𐧏'), ('𐧒', '𐧿'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿'), ('\u{111c9}', '\u{111cc}'), ('𑇛', '𑇟'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩'), ('\u{11300}', '\u{11300}'), ('𑍐', '𑍐'), ('𑗊', '\u{115dd}'), ('𑜀', '𑜙'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑜿'), ('𒎙', '𒎙'), ('𒒀', '𒕃'), ('𔐀', '𔙆'), ('𝇞', '𝇨'), ('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('🌭', '🌯'), ('🍾', '🍿'), ('🏏', '🏓'), ('🏸', '🏿'), ('📿', '📿'), ('🕋', '🕏'), ('🙃', '🙄'), ('🛐', '🛐'), ('🤐', '🤘'), ('🦀', '🦄'), ('🧀', '🧀'), ('𫠠', '𬺡'), ]; pub const V9_0: &'static [(char, char)] = &[ ('ࢶ', 'ࢽ'), ('\u{8d4}', '\u{8e2}'), ('ಀ', 'ಀ'), ('൏', '൏'), ('ൔ', 'ൖ'), ('൘', '൞'), ('൶', '൸'), ('ᲀ', 'ᲈ'), ('\u{1dfb}', '\u{1dfb}'), ('⏻', '⏾'), ('⹃', '⹄'), ('Ɪ', 'Ɪ'), ('\u{a8c5}', '\u{a8c5}'), ('𐆍', '𐆎'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('\u{1123e}', '\u{1123e}'), ('𑐀', '𑑙'), ('𑑛', '𑑛'), ('𑑝', '𑑝'), ('𑙠', '𑙬'), ('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬'), ('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('𖿠', '𖿠'), ('𗀀', '𘟬'), ('𘠀', '𘫲'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('𞤀', '\u{1e94a}'), ('𞥐', '𞥙'), ('𞥞', '𞥟'), ('🆛', '🆬'), ('🈻', '🈻'), ('🕺', '🕺'), ('🖤', '🖤'), ('🛑', '🛒'), ('🛴', '🛶'), ('🤙', '🤞'), ('🤠', '🤧'), ('🤰', '🤰'), ('🤳', '🤾'), ('🥀', '🥋'), ('🥐', '🥞'), ('🦅', '🦑'), ]; regex-syntax-0.8.2/src/unicode_tables/case_folding_simple.rs000064400000000000000000002011051046102023000223730ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate case-folding-simple ucd-15.0.0 --chars --all-pairs // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. 
pub const CASE_FOLDING_SIMPLE: &'static [(char, &'static [char])] = &[ ('A', &['a']), ('B', &['b']), ('C', &['c']), ('D', &['d']), ('E', &['e']), ('F', &['f']), ('G', &['g']), ('H', &['h']), ('I', &['i']), ('J', &['j']), ('K', &['k', 'K']), ('L', &['l']), ('M', &['m']), ('N', &['n']), ('O', &['o']), ('P', &['p']), ('Q', &['q']), ('R', &['r']), ('S', &['s', 'ſ']), ('T', &['t']), ('U', &['u']), ('V', &['v']), ('W', &['w']), ('X', &['x']), ('Y', &['y']), ('Z', &['z']), ('a', &['A']), ('b', &['B']), ('c', &['C']), ('d', &['D']), ('e', &['E']), ('f', &['F']), ('g', &['G']), ('h', &['H']), ('i', &['I']), ('j', &['J']), ('k', &['K', 'K']), ('l', &['L']), ('m', &['M']), ('n', &['N']), ('o', &['O']), ('p', &['P']), ('q', &['Q']), ('r', &['R']), ('s', &['S', 'ſ']), ('t', &['T']), ('u', &['U']), ('v', &['V']), ('w', &['W']), ('x', &['X']), ('y', &['Y']), ('z', &['Z']), ('µ', &['Μ', 'μ']), ('À', &['à']), ('Á', &['á']), ('Â', &['â']), ('Ã', &['ã']), ('Ä', &['ä']), ('Å', &['å', 'Å']), ('Æ', &['æ']), ('Ç', &['ç']), ('È', &['è']), ('É', &['é']), ('Ê', &['ê']), ('Ë', &['ë']), ('Ì', &['ì']), ('Í', &['í']), ('Î', &['î']), ('Ï', &['ï']), ('Ð', &['ð']), ('Ñ', &['ñ']), ('Ò', &['ò']), ('Ó', &['ó']), ('Ô', &['ô']), ('Õ', &['õ']), ('Ö', &['ö']), ('Ø', &['ø']), ('Ù', &['ù']), ('Ú', &['ú']), ('Û', &['û']), ('Ü', &['ü']), ('Ý', &['ý']), ('Þ', &['þ']), ('ß', &['ẞ']), ('à', &['À']), ('á', &['Á']), ('â', &['Â']), ('ã', &['Ã']), ('ä', &['Ä']), ('å', &['Å', 'Å']), ('æ', &['Æ']), ('ç', &['Ç']), ('è', &['È']), ('é', &['É']), ('ê', &['Ê']), ('ë', &['Ë']), ('ì', &['Ì']), ('í', &['Í']), ('î', &['Î']), ('ï', &['Ï']), ('ð', &['Ð']), ('ñ', &['Ñ']), ('ò', &['Ò']), ('ó', &['Ó']), ('ô', &['Ô']), ('õ', &['Õ']), ('ö', &['Ö']), ('ø', &['Ø']), ('ù', &['Ù']), ('ú', &['Ú']), ('û', &['Û']), ('ü', &['Ü']), ('ý', &['Ý']), ('þ', &['Þ']), ('ÿ', &['Ÿ']), ('Ā', &['ā']), ('ā', &['Ā']), ('Ă', &['ă']), ('ă', &['Ă']), ('Ą', &['ą']), ('ą', &['Ą']), ('Ć', &['ć']), ('ć', &['Ć']), ('Ĉ', &['ĉ']), ('ĉ', &['Ĉ']), ('Ċ', &['ċ']), 
('ċ', &['Ċ']), ('Č', &['č']), ('č', &['Č']), ('Ď', &['ď']), ('ď', &['Ď']), ('Đ', &['đ']), ('đ', &['Đ']), ('Ē', &['ē']), ('ē', &['Ē']), ('Ĕ', &['ĕ']), ('ĕ', &['Ĕ']), ('Ė', &['ė']), ('ė', &['Ė']), ('Ę', &['ę']), ('ę', &['Ę']), ('Ě', &['ě']), ('ě', &['Ě']), ('Ĝ', &['ĝ']), ('ĝ', &['Ĝ']), ('Ğ', &['ğ']), ('ğ', &['Ğ']), ('Ġ', &['ġ']), ('ġ', &['Ġ']), ('Ģ', &['ģ']), ('ģ', &['Ģ']), ('Ĥ', &['ĥ']), ('ĥ', &['Ĥ']), ('Ħ', &['ħ']), ('ħ', &['Ħ']), ('Ĩ', &['ĩ']), ('ĩ', &['Ĩ']), ('Ī', &['ī']), ('ī', &['Ī']), ('Ĭ', &['ĭ']), ('ĭ', &['Ĭ']), ('Į', &['į']), ('į', &['Į']), ('IJ', &['ij']), ('ij', &['IJ']), ('Ĵ', &['ĵ']), ('ĵ', &['Ĵ']), ('Ķ', &['ķ']), ('ķ', &['Ķ']), ('Ĺ', &['ĺ']), ('ĺ', &['Ĺ']), ('Ļ', &['ļ']), ('ļ', &['Ļ']), ('Ľ', &['ľ']), ('ľ', &['Ľ']), ('Ŀ', &['ŀ']), ('ŀ', &['Ŀ']), ('Ł', &['ł']), ('ł', &['Ł']), ('Ń', &['ń']), ('ń', &['Ń']), ('Ņ', &['ņ']), ('ņ', &['Ņ']), ('Ň', &['ň']), ('ň', &['Ň']), ('Ŋ', &['ŋ']), ('ŋ', &['Ŋ']), ('Ō', &['ō']), ('ō', &['Ō']), ('Ŏ', &['ŏ']), ('ŏ', &['Ŏ']), ('Ő', &['ő']), ('ő', &['Ő']), ('Œ', &['œ']), ('œ', &['Œ']), ('Ŕ', &['ŕ']), ('ŕ', &['Ŕ']), ('Ŗ', &['ŗ']), ('ŗ', &['Ŗ']), ('Ř', &['ř']), ('ř', &['Ř']), ('Ś', &['ś']), ('ś', &['Ś']), ('Ŝ', &['ŝ']), ('ŝ', &['Ŝ']), ('Ş', &['ş']), ('ş', &['Ş']), ('Š', &['š']), ('š', &['Š']), ('Ţ', &['ţ']), ('ţ', &['Ţ']), ('Ť', &['ť']), ('ť', &['Ť']), ('Ŧ', &['ŧ']), ('ŧ', &['Ŧ']), ('Ũ', &['ũ']), ('ũ', &['Ũ']), ('Ū', &['ū']), ('ū', &['Ū']), ('Ŭ', &['ŭ']), ('ŭ', &['Ŭ']), ('Ů', &['ů']), ('ů', &['Ů']), ('Ű', &['ű']), ('ű', &['Ű']), ('Ų', &['ų']), ('ų', &['Ų']), ('Ŵ', &['ŵ']), ('ŵ', &['Ŵ']), ('Ŷ', &['ŷ']), ('ŷ', &['Ŷ']), ('Ÿ', &['ÿ']), ('Ź', &['ź']), ('ź', &['Ź']), ('Ż', &['ż']), ('ż', &['Ż']), ('Ž', &['ž']), ('ž', &['Ž']), ('ſ', &['S', 's']), ('ƀ', &['Ƀ']), ('Ɓ', &['ɓ']), ('Ƃ', &['ƃ']), ('ƃ', &['Ƃ']), ('Ƅ', &['ƅ']), ('ƅ', &['Ƅ']), ('Ɔ', &['ɔ']), ('Ƈ', &['ƈ']), ('ƈ', &['Ƈ']), ('Ɖ', &['ɖ']), ('Ɗ', &['ɗ']), ('Ƌ', &['ƌ']), ('ƌ', &['Ƌ']), ('Ǝ', &['ǝ']), ('Ə', &['ə']), ('Ɛ', &['ɛ']), ('Ƒ', &['ƒ']), ('ƒ', &['Ƒ']), ('Ɠ', &['ɠ']), ('Ɣ', 
&['ɣ']), ('ƕ', &['Ƕ']), ('Ɩ', &['ɩ']), ('Ɨ', &['ɨ']), ('Ƙ', &['ƙ']), ('ƙ', &['Ƙ']), ('ƚ', &['Ƚ']), ('Ɯ', &['ɯ']), ('Ɲ', &['ɲ']), ('ƞ', &['Ƞ']), ('Ɵ', &['ɵ']), ('Ơ', &['ơ']), ('ơ', &['Ơ']), ('Ƣ', &['ƣ']), ('ƣ', &['Ƣ']), ('Ƥ', &['ƥ']), ('ƥ', &['Ƥ']), ('Ʀ', &['ʀ']), ('Ƨ', &['ƨ']), ('ƨ', &['Ƨ']), ('Ʃ', &['ʃ']), ('Ƭ', &['ƭ']), ('ƭ', &['Ƭ']), ('Ʈ', &['ʈ']), ('Ư', &['ư']), ('ư', &['Ư']), ('Ʊ', &['ʊ']), ('Ʋ', &['ʋ']), ('Ƴ', &['ƴ']), ('ƴ', &['Ƴ']), ('Ƶ', &['ƶ']), ('ƶ', &['Ƶ']), ('Ʒ', &['ʒ']), ('Ƹ', &['ƹ']), ('ƹ', &['Ƹ']), ('Ƽ', &['ƽ']), ('ƽ', &['Ƽ']), ('ƿ', &['Ƿ']), ('DŽ', &['Dž', 'dž']), ('Dž', &['DŽ', 'dž']), ('dž', &['DŽ', 'Dž']), ('LJ', &['Lj', 'lj']), ('Lj', &['LJ', 'lj']), ('lj', &['LJ', 'Lj']), ('NJ', &['Nj', 'nj']), ('Nj', &['NJ', 'nj']), ('nj', &['NJ', 'Nj']), ('Ǎ', &['ǎ']), ('ǎ', &['Ǎ']), ('Ǐ', &['ǐ']), ('ǐ', &['Ǐ']), ('Ǒ', &['ǒ']), ('ǒ', &['Ǒ']), ('Ǔ', &['ǔ']), ('ǔ', &['Ǔ']), ('Ǖ', &['ǖ']), ('ǖ', &['Ǖ']), ('Ǘ', &['ǘ']), ('ǘ', &['Ǘ']), ('Ǚ', &['ǚ']), ('ǚ', &['Ǚ']), ('Ǜ', &['ǜ']), ('ǜ', &['Ǜ']), ('ǝ', &['Ǝ']), ('Ǟ', &['ǟ']), ('ǟ', &['Ǟ']), ('Ǡ', &['ǡ']), ('ǡ', &['Ǡ']), ('Ǣ', &['ǣ']), ('ǣ', &['Ǣ']), ('Ǥ', &['ǥ']), ('ǥ', &['Ǥ']), ('Ǧ', &['ǧ']), ('ǧ', &['Ǧ']), ('Ǩ', &['ǩ']), ('ǩ', &['Ǩ']), ('Ǫ', &['ǫ']), ('ǫ', &['Ǫ']), ('Ǭ', &['ǭ']), ('ǭ', &['Ǭ']), ('Ǯ', &['ǯ']), ('ǯ', &['Ǯ']), ('DZ', &['Dz', 'dz']), ('Dz', &['DZ', 'dz']), ('dz', &['DZ', 'Dz']), ('Ǵ', &['ǵ']), ('ǵ', &['Ǵ']), ('Ƕ', &['ƕ']), ('Ƿ', &['ƿ']), ('Ǹ', &['ǹ']), ('ǹ', &['Ǹ']), ('Ǻ', &['ǻ']), ('ǻ', &['Ǻ']), ('Ǽ', &['ǽ']), ('ǽ', &['Ǽ']), ('Ǿ', &['ǿ']), ('ǿ', &['Ǿ']), ('Ȁ', &['ȁ']), ('ȁ', &['Ȁ']), ('Ȃ', &['ȃ']), ('ȃ', &['Ȃ']), ('Ȅ', &['ȅ']), ('ȅ', &['Ȅ']), ('Ȇ', &['ȇ']), ('ȇ', &['Ȇ']), ('Ȉ', &['ȉ']), ('ȉ', &['Ȉ']), ('Ȋ', &['ȋ']), ('ȋ', &['Ȋ']), ('Ȍ', &['ȍ']), ('ȍ', &['Ȍ']), ('Ȏ', &['ȏ']), ('ȏ', &['Ȏ']), ('Ȑ', &['ȑ']), ('ȑ', &['Ȑ']), ('Ȓ', &['ȓ']), ('ȓ', &['Ȓ']), ('Ȕ', &['ȕ']), ('ȕ', &['Ȕ']), ('Ȗ', &['ȗ']), ('ȗ', &['Ȗ']), ('Ș', &['ș']), ('ș', &['Ș']), ('Ț', &['ț']), ('ț', &['Ț']), ('Ȝ', &['ȝ']), ('ȝ', &['Ȝ']), 
('Ȟ', &['ȟ']), ('ȟ', &['Ȟ']), ('Ƞ', &['ƞ']), ('Ȣ', &['ȣ']), ('ȣ', &['Ȣ']), ('Ȥ', &['ȥ']), ('ȥ', &['Ȥ']), ('Ȧ', &['ȧ']), ('ȧ', &['Ȧ']), ('Ȩ', &['ȩ']), ('ȩ', &['Ȩ']), ('Ȫ', &['ȫ']), ('ȫ', &['Ȫ']), ('Ȭ', &['ȭ']), ('ȭ', &['Ȭ']), ('Ȯ', &['ȯ']), ('ȯ', &['Ȯ']), ('Ȱ', &['ȱ']), ('ȱ', &['Ȱ']), ('Ȳ', &['ȳ']), ('ȳ', &['Ȳ']), ('Ⱥ', &['ⱥ']), ('Ȼ', &['ȼ']), ('ȼ', &['Ȼ']), ('Ƚ', &['ƚ']), ('Ⱦ', &['ⱦ']), ('ȿ', &['Ȿ']), ('ɀ', &['Ɀ']), ('Ɂ', &['ɂ']), ('ɂ', &['Ɂ']), ('Ƀ', &['ƀ']), ('Ʉ', &['ʉ']), ('Ʌ', &['ʌ']), ('Ɇ', &['ɇ']), ('ɇ', &['Ɇ']), ('Ɉ', &['ɉ']), ('ɉ', &['Ɉ']), ('Ɋ', &['ɋ']), ('ɋ', &['Ɋ']), ('Ɍ', &['ɍ']), ('ɍ', &['Ɍ']), ('Ɏ', &['ɏ']), ('ɏ', &['Ɏ']), ('ɐ', &['Ɐ']), ('ɑ', &['Ɑ']), ('ɒ', &['Ɒ']), ('ɓ', &['Ɓ']), ('ɔ', &['Ɔ']), ('ɖ', &['Ɖ']), ('ɗ', &['Ɗ']), ('ə', &['Ə']), ('ɛ', &['Ɛ']), ('ɜ', &['Ɜ']), ('ɠ', &['Ɠ']), ('ɡ', &['Ɡ']), ('ɣ', &['Ɣ']), ('ɥ', &['Ɥ']), ('ɦ', &['Ɦ']), ('ɨ', &['Ɨ']), ('ɩ', &['Ɩ']), ('ɪ', &['Ɪ']), ('ɫ', &['Ɫ']), ('ɬ', &['Ɬ']), ('ɯ', &['Ɯ']), ('ɱ', &['Ɱ']), ('ɲ', &['Ɲ']), ('ɵ', &['Ɵ']), ('ɽ', &['Ɽ']), ('ʀ', &['Ʀ']), ('ʂ', &['Ʂ']), ('ʃ', &['Ʃ']), ('ʇ', &['Ʇ']), ('ʈ', &['Ʈ']), ('ʉ', &['Ʉ']), ('ʊ', &['Ʊ']), ('ʋ', &['Ʋ']), ('ʌ', &['Ʌ']), ('ʒ', &['Ʒ']), ('ʝ', &['Ʝ']), ('ʞ', &['Ʞ']), ('\u{345}', &['Ι', 'ι', 'ι']), ('Ͱ', &['ͱ']), ('ͱ', &['Ͱ']), ('Ͳ', &['ͳ']), ('ͳ', &['Ͳ']), ('Ͷ', &['ͷ']), ('ͷ', &['Ͷ']), ('ͻ', &['Ͻ']), ('ͼ', &['Ͼ']), ('ͽ', &['Ͽ']), ('Ϳ', &['ϳ']), ('Ά', &['ά']), ('Έ', &['έ']), ('Ή', &['ή']), ('Ί', &['ί']), ('Ό', &['ό']), ('Ύ', &['ύ']), ('Ώ', &['ώ']), ('Α', &['α']), ('Β', &['β', 'ϐ']), ('Γ', &['γ']), ('Δ', &['δ']), ('Ε', &['ε', 'ϵ']), ('Ζ', &['ζ']), ('Η', &['η']), ('Θ', &['θ', 'ϑ', 'ϴ']), ('Ι', &['\u{345}', 'ι', 'ι']), ('Κ', &['κ', 'ϰ']), ('Λ', &['λ']), ('Μ', &['µ', 'μ']), ('Ν', &['ν']), ('Ξ', &['ξ']), ('Ο', &['ο']), ('Π', &['π', 'ϖ']), ('Ρ', &['ρ', 'ϱ']), ('Σ', &['ς', 'σ']), ('Τ', &['τ']), ('Υ', &['υ']), ('Φ', &['φ', 'ϕ']), ('Χ', &['χ']), ('Ψ', &['ψ']), ('Ω', &['ω', 'Ω']), ('Ϊ', &['ϊ']), ('Ϋ', &['ϋ']), ('ά', &['Ά']), ('έ', &['Έ']), ('ή', &['Ή']), ('ί', 
&['Ί']), ('α', &['Α']), ('β', &['Β', 'ϐ']), ('γ', &['Γ']), ('δ', &['Δ']), ('ε', &['Ε', 'ϵ']), ('ζ', &['Ζ']), ('η', &['Η']), ('θ', &['Θ', 'ϑ', 'ϴ']), ('ι', &['\u{345}', 'Ι', 'ι']), ('κ', &['Κ', 'ϰ']), ('λ', &['Λ']), ('μ', &['µ', 'Μ']), ('ν', &['Ν']), ('ξ', &['Ξ']), ('ο', &['Ο']), ('π', &['Π', 'ϖ']), ('ρ', &['Ρ', 'ϱ']), ('ς', &['Σ', 'σ']), ('σ', &['Σ', 'ς']), ('τ', &['Τ']), ('υ', &['Υ']), ('φ', &['Φ', 'ϕ']), ('χ', &['Χ']), ('ψ', &['Ψ']), ('ω', &['Ω', 'Ω']), ('ϊ', &['Ϊ']), ('ϋ', &['Ϋ']), ('ό', &['Ό']), ('ύ', &['Ύ']), ('ώ', &['Ώ']), ('Ϗ', &['ϗ']), ('ϐ', &['Β', 'β']), ('ϑ', &['Θ', 'θ', 'ϴ']), ('ϕ', &['Φ', 'φ']), ('ϖ', &['Π', 'π']), ('ϗ', &['Ϗ']), ('Ϙ', &['ϙ']), ('ϙ', &['Ϙ']), ('Ϛ', &['ϛ']), ('ϛ', &['Ϛ']), ('Ϝ', &['ϝ']), ('ϝ', &['Ϝ']), ('Ϟ', &['ϟ']), ('ϟ', &['Ϟ']), ('Ϡ', &['ϡ']), ('ϡ', &['Ϡ']), ('Ϣ', &['ϣ']), ('ϣ', &['Ϣ']), ('Ϥ', &['ϥ']), ('ϥ', &['Ϥ']), ('Ϧ', &['ϧ']), ('ϧ', &['Ϧ']), ('Ϩ', &['ϩ']), ('ϩ', &['Ϩ']), ('Ϫ', &['ϫ']), ('ϫ', &['Ϫ']), ('Ϭ', &['ϭ']), ('ϭ', &['Ϭ']), ('Ϯ', &['ϯ']), ('ϯ', &['Ϯ']), ('ϰ', &['Κ', 'κ']), ('ϱ', &['Ρ', 'ρ']), ('ϲ', &['Ϲ']), ('ϳ', &['Ϳ']), ('ϴ', &['Θ', 'θ', 'ϑ']), ('ϵ', &['Ε', 'ε']), ('Ϸ', &['ϸ']), ('ϸ', &['Ϸ']), ('Ϲ', &['ϲ']), ('Ϻ', &['ϻ']), ('ϻ', &['Ϻ']), ('Ͻ', &['ͻ']), ('Ͼ', &['ͼ']), ('Ͽ', &['ͽ']), ('Ѐ', &['ѐ']), ('Ё', &['ё']), ('Ђ', &['ђ']), ('Ѓ', &['ѓ']), ('Є', &['є']), ('Ѕ', &['ѕ']), ('І', &['і']), ('Ї', &['ї']), ('Ј', &['ј']), ('Љ', &['љ']), ('Њ', &['њ']), ('Ћ', &['ћ']), ('Ќ', &['ќ']), ('Ѝ', &['ѝ']), ('Ў', &['ў']), ('Џ', &['џ']), ('А', &['а']), ('Б', &['б']), ('В', &['в', 'ᲀ']), ('Г', &['г']), ('Д', &['д', 'ᲁ']), ('Е', &['е']), ('Ж', &['ж']), ('З', &['з']), ('И', &['и']), ('Й', &['й']), ('К', &['к']), ('Л', &['л']), ('М', &['м']), ('Н', &['н']), ('О', &['о', 'ᲂ']), ('П', &['п']), ('Р', &['р']), ('С', &['с', 'ᲃ']), ('Т', &['т', 'ᲄ', 'ᲅ']), ('У', &['у']), ('Ф', &['ф']), ('Х', &['х']), ('Ц', &['ц']), ('Ч', &['ч']), ('Ш', &['ш']), ('Щ', &['щ']), ('Ъ', &['ъ', 'ᲆ']), ('Ы', &['ы']), ('Ь', &['ь']), ('Э', &['э']), ('Ю', &['ю']), ('Я', &['я']), 
('а', &['А']), ('б', &['Б']), ('в', &['В', 'ᲀ']), ('г', &['Г']), ('д', &['Д', 'ᲁ']), ('е', &['Е']), ('ж', &['Ж']), ('з', &['З']), ('и', &['И']), ('й', &['Й']), ('к', &['К']), ('л', &['Л']), ('м', &['М']), ('н', &['Н']), ('о', &['О', 'ᲂ']), ('п', &['П']), ('р', &['Р']), ('с', &['С', 'ᲃ']), ('т', &['Т', 'ᲄ', 'ᲅ']), ('у', &['У']), ('ф', &['Ф']), ('х', &['Х']), ('ц', &['Ц']), ('ч', &['Ч']), ('ш', &['Ш']), ('щ', &['Щ']), ('ъ', &['Ъ', 'ᲆ']), ('ы', &['Ы']), ('ь', &['Ь']), ('э', &['Э']), ('ю', &['Ю']), ('я', &['Я']), ('ѐ', &['Ѐ']), ('ё', &['Ё']), ('ђ', &['Ђ']), ('ѓ', &['Ѓ']), ('є', &['Є']), ('ѕ', &['Ѕ']), ('і', &['І']), ('ї', &['Ї']), ('ј', &['Ј']), ('љ', &['Љ']), ('њ', &['Њ']), ('ћ', &['Ћ']), ('ќ', &['Ќ']), ('ѝ', &['Ѝ']), ('ў', &['Ў']), ('џ', &['Џ']), ('Ѡ', &['ѡ']), ('ѡ', &['Ѡ']), ('Ѣ', &['ѣ', 'ᲇ']), ('ѣ', &['Ѣ', 'ᲇ']), ('Ѥ', &['ѥ']), ('ѥ', &['Ѥ']), ('Ѧ', &['ѧ']), ('ѧ', &['Ѧ']), ('Ѩ', &['ѩ']), ('ѩ', &['Ѩ']), ('Ѫ', &['ѫ']), ('ѫ', &['Ѫ']), ('Ѭ', &['ѭ']), ('ѭ', &['Ѭ']), ('Ѯ', &['ѯ']), ('ѯ', &['Ѯ']), ('Ѱ', &['ѱ']), ('ѱ', &['Ѱ']), ('Ѳ', &['ѳ']), ('ѳ', &['Ѳ']), ('Ѵ', &['ѵ']), ('ѵ', &['Ѵ']), ('Ѷ', &['ѷ']), ('ѷ', &['Ѷ']), ('Ѹ', &['ѹ']), ('ѹ', &['Ѹ']), ('Ѻ', &['ѻ']), ('ѻ', &['Ѻ']), ('Ѽ', &['ѽ']), ('ѽ', &['Ѽ']), ('Ѿ', &['ѿ']), ('ѿ', &['Ѿ']), ('Ҁ', &['ҁ']), ('ҁ', &['Ҁ']), ('Ҋ', &['ҋ']), ('ҋ', &['Ҋ']), ('Ҍ', &['ҍ']), ('ҍ', &['Ҍ']), ('Ҏ', &['ҏ']), ('ҏ', &['Ҏ']), ('Ґ', &['ґ']), ('ґ', &['Ґ']), ('Ғ', &['ғ']), ('ғ', &['Ғ']), ('Ҕ', &['ҕ']), ('ҕ', &['Ҕ']), ('Җ', &['җ']), ('җ', &['Җ']), ('Ҙ', &['ҙ']), ('ҙ', &['Ҙ']), ('Қ', &['қ']), ('қ', &['Қ']), ('Ҝ', &['ҝ']), ('ҝ', &['Ҝ']), ('Ҟ', &['ҟ']), ('ҟ', &['Ҟ']), ('Ҡ', &['ҡ']), ('ҡ', &['Ҡ']), ('Ң', &['ң']), ('ң', &['Ң']), ('Ҥ', &['ҥ']), ('ҥ', &['Ҥ']), ('Ҧ', &['ҧ']), ('ҧ', &['Ҧ']), ('Ҩ', &['ҩ']), ('ҩ', &['Ҩ']), ('Ҫ', &['ҫ']), ('ҫ', &['Ҫ']), ('Ҭ', &['ҭ']), ('ҭ', &['Ҭ']), ('Ү', &['ү']), ('ү', &['Ү']), ('Ұ', &['ұ']), ('ұ', &['Ұ']), ('Ҳ', &['ҳ']), ('ҳ', &['Ҳ']), ('Ҵ', &['ҵ']), ('ҵ', &['Ҵ']), ('Ҷ', &['ҷ']), ('ҷ', &['Ҷ']), ('Ҹ', &['ҹ']), ('ҹ', &['Ҹ']), 
('Һ', &['һ']), ('һ', &['Һ']), ('Ҽ', &['ҽ']), ('ҽ', &['Ҽ']), ('Ҿ', &['ҿ']), ('ҿ', &['Ҿ']), ('Ӏ', &['ӏ']), ('Ӂ', &['ӂ']), ('ӂ', &['Ӂ']), ('Ӄ', &['ӄ']), ('ӄ', &['Ӄ']), ('Ӆ', &['ӆ']), ('ӆ', &['Ӆ']), ('Ӈ', &['ӈ']), ('ӈ', &['Ӈ']), ('Ӊ', &['ӊ']), ('ӊ', &['Ӊ']), ('Ӌ', &['ӌ']), ('ӌ', &['Ӌ']), ('Ӎ', &['ӎ']), ('ӎ', &['Ӎ']), ('ӏ', &['Ӏ']), ('Ӑ', &['ӑ']), ('ӑ', &['Ӑ']), ('Ӓ', &['ӓ']), ('ӓ', &['Ӓ']), ('Ӕ', &['ӕ']), ('ӕ', &['Ӕ']), ('Ӗ', &['ӗ']), ('ӗ', &['Ӗ']), ('Ә', &['ә']), ('ә', &['Ә']), ('Ӛ', &['ӛ']), ('ӛ', &['Ӛ']), ('Ӝ', &['ӝ']), ('ӝ', &['Ӝ']), ('Ӟ', &['ӟ']), ('ӟ', &['Ӟ']), ('Ӡ', &['ӡ']), ('ӡ', &['Ӡ']), ('Ӣ', &['ӣ']), ('ӣ', &['Ӣ']), ('Ӥ', &['ӥ']), ('ӥ', &['Ӥ']), ('Ӧ', &['ӧ']), ('ӧ', &['Ӧ']), ('Ө', &['ө']), ('ө', &['Ө']), ('Ӫ', &['ӫ']), ('ӫ', &['Ӫ']), ('Ӭ', &['ӭ']), ('ӭ', &['Ӭ']), ('Ӯ', &['ӯ']), ('ӯ', &['Ӯ']), ('Ӱ', &['ӱ']), ('ӱ', &['Ӱ']), ('Ӳ', &['ӳ']), ('ӳ', &['Ӳ']), ('Ӵ', &['ӵ']), ('ӵ', &['Ӵ']), ('Ӷ', &['ӷ']), ('ӷ', &['Ӷ']), ('Ӹ', &['ӹ']), ('ӹ', &['Ӹ']), ('Ӻ', &['ӻ']), ('ӻ', &['Ӻ']), ('Ӽ', &['ӽ']), ('ӽ', &['Ӽ']), ('Ӿ', &['ӿ']), ('ӿ', &['Ӿ']), ('Ԁ', &['ԁ']), ('ԁ', &['Ԁ']), ('Ԃ', &['ԃ']), ('ԃ', &['Ԃ']), ('Ԅ', &['ԅ']), ('ԅ', &['Ԅ']), ('Ԇ', &['ԇ']), ('ԇ', &['Ԇ']), ('Ԉ', &['ԉ']), ('ԉ', &['Ԉ']), ('Ԋ', &['ԋ']), ('ԋ', &['Ԋ']), ('Ԍ', &['ԍ']), ('ԍ', &['Ԍ']), ('Ԏ', &['ԏ']), ('ԏ', &['Ԏ']), ('Ԑ', &['ԑ']), ('ԑ', &['Ԑ']), ('Ԓ', &['ԓ']), ('ԓ', &['Ԓ']), ('Ԕ', &['ԕ']), ('ԕ', &['Ԕ']), ('Ԗ', &['ԗ']), ('ԗ', &['Ԗ']), ('Ԙ', &['ԙ']), ('ԙ', &['Ԙ']), ('Ԛ', &['ԛ']), ('ԛ', &['Ԛ']), ('Ԝ', &['ԝ']), ('ԝ', &['Ԝ']), ('Ԟ', &['ԟ']), ('ԟ', &['Ԟ']), ('Ԡ', &['ԡ']), ('ԡ', &['Ԡ']), ('Ԣ', &['ԣ']), ('ԣ', &['Ԣ']), ('Ԥ', &['ԥ']), ('ԥ', &['Ԥ']), ('Ԧ', &['ԧ']), ('ԧ', &['Ԧ']), ('Ԩ', &['ԩ']), ('ԩ', &['Ԩ']), ('Ԫ', &['ԫ']), ('ԫ', &['Ԫ']), ('Ԭ', &['ԭ']), ('ԭ', &['Ԭ']), ('Ԯ', &['ԯ']), ('ԯ', &['Ԯ']), ('Ա', &['ա']), ('Բ', &['բ']), ('Գ', &['գ']), ('Դ', &['դ']), ('Ե', &['ե']), ('Զ', &['զ']), ('Է', &['է']), ('Ը', &['ը']), ('Թ', &['թ']), ('Ժ', &['ժ']), ('Ի', &['ի']), ('Լ', &['լ']), ('Խ', &['խ']), ('Ծ', &['ծ']), ('Կ', &['կ']), 
('Հ', &['հ']), ('Ձ', &['ձ']), ('Ղ', &['ղ']), ('Ճ', &['ճ']), ('Մ', &['մ']), ('Յ', &['յ']), ('Ն', &['ն']), ('Շ', &['շ']), ('Ո', &['ո']), ('Չ', &['չ']), ('Պ', &['պ']), ('Ջ', &['ջ']), ('Ռ', &['ռ']), ('Ս', &['ս']), ('Վ', &['վ']), ('Տ', &['տ']), ('Ր', &['ր']), ('Ց', &['ց']), ('Ւ', &['ւ']), ('Փ', &['փ']), ('Ք', &['ք']), ('Օ', &['օ']), ('Ֆ', &['ֆ']), ('ա', &['Ա']), ('բ', &['Բ']), ('գ', &['Գ']), ('դ', &['Դ']), ('ե', &['Ե']), ('զ', &['Զ']), ('է', &['Է']), ('ը', &['Ը']), ('թ', &['Թ']), ('ժ', &['Ժ']), ('ի', &['Ի']), ('լ', &['Լ']), ('խ', &['Խ']), ('ծ', &['Ծ']), ('կ', &['Կ']), ('հ', &['Հ']), ('ձ', &['Ձ']), ('ղ', &['Ղ']), ('ճ', &['Ճ']), ('մ', &['Մ']), ('յ', &['Յ']), ('ն', &['Ն']), ('շ', &['Շ']), ('ո', &['Ո']), ('չ', &['Չ']), ('պ', &['Պ']), ('ջ', &['Ջ']), ('ռ', &['Ռ']), ('ս', &['Ս']), ('վ', &['Վ']), ('տ', &['Տ']), ('ր', &['Ր']), ('ց', &['Ց']), ('ւ', &['Ւ']), ('փ', &['Փ']), ('ք', &['Ք']), ('օ', &['Օ']), ('ֆ', &['Ֆ']), ('Ⴀ', &['ⴀ']), ('Ⴁ', &['ⴁ']), ('Ⴂ', &['ⴂ']), ('Ⴃ', &['ⴃ']), ('Ⴄ', &['ⴄ']), ('Ⴅ', &['ⴅ']), ('Ⴆ', &['ⴆ']), ('Ⴇ', &['ⴇ']), ('Ⴈ', &['ⴈ']), ('Ⴉ', &['ⴉ']), ('Ⴊ', &['ⴊ']), ('Ⴋ', &['ⴋ']), ('Ⴌ', &['ⴌ']), ('Ⴍ', &['ⴍ']), ('Ⴎ', &['ⴎ']), ('Ⴏ', &['ⴏ']), ('Ⴐ', &['ⴐ']), ('Ⴑ', &['ⴑ']), ('Ⴒ', &['ⴒ']), ('Ⴓ', &['ⴓ']), ('Ⴔ', &['ⴔ']), ('Ⴕ', &['ⴕ']), ('Ⴖ', &['ⴖ']), ('Ⴗ', &['ⴗ']), ('Ⴘ', &['ⴘ']), ('Ⴙ', &['ⴙ']), ('Ⴚ', &['ⴚ']), ('Ⴛ', &['ⴛ']), ('Ⴜ', &['ⴜ']), ('Ⴝ', &['ⴝ']), ('Ⴞ', &['ⴞ']), ('Ⴟ', &['ⴟ']), ('Ⴠ', &['ⴠ']), ('Ⴡ', &['ⴡ']), ('Ⴢ', &['ⴢ']), ('Ⴣ', &['ⴣ']), ('Ⴤ', &['ⴤ']), ('Ⴥ', &['ⴥ']), ('Ⴧ', &['ⴧ']), ('Ⴭ', &['ⴭ']), ('ა', &['Ა']), ('ბ', &['Ბ']), ('გ', &['Გ']), ('დ', &['Დ']), ('ე', &['Ე']), ('ვ', &['Ვ']), ('ზ', &['Ზ']), ('თ', &['Თ']), ('ი', &['Ი']), ('კ', &['Კ']), ('ლ', &['Ლ']), ('მ', &['Მ']), ('ნ', &['Ნ']), ('ო', &['Ო']), ('პ', &['Პ']), ('ჟ', &['Ჟ']), ('რ', &['Რ']), ('ს', &['Ს']), ('ტ', &['Ტ']), ('უ', &['Უ']), ('ფ', &['Ფ']), ('ქ', &['Ქ']), ('ღ', &['Ღ']), ('ყ', &['Ყ']), ('შ', &['Შ']), ('ჩ', &['Ჩ']), ('ც', &['Ც']), ('ძ', &['Ძ']), ('წ', &['Წ']), ('ჭ', &['Ჭ']), ('ხ', &['Ხ']), ('ჯ', &['Ჯ']), 
('ჰ', &['Ჰ']), ('ჱ', &['Ჱ']), ('ჲ', &['Ჲ']), ('ჳ', &['Ჳ']), ('ჴ', &['Ჴ']), ('ჵ', &['Ჵ']), ('ჶ', &['Ჶ']), ('ჷ', &['Ჷ']), ('ჸ', &['Ჸ']), ('ჹ', &['Ჹ']), ('ჺ', &['Ჺ']), ('ჽ', &['Ჽ']), ('ჾ', &['Ჾ']), ('ჿ', &['Ჿ']), ('Ꭰ', &['ꭰ']), ('Ꭱ', &['ꭱ']), ('Ꭲ', &['ꭲ']), ('Ꭳ', &['ꭳ']), ('Ꭴ', &['ꭴ']), ('Ꭵ', &['ꭵ']), ('Ꭶ', &['ꭶ']), ('Ꭷ', &['ꭷ']), ('Ꭸ', &['ꭸ']), ('Ꭹ', &['ꭹ']), ('Ꭺ', &['ꭺ']), ('Ꭻ', &['ꭻ']), ('Ꭼ', &['ꭼ']), ('Ꭽ', &['ꭽ']), ('Ꭾ', &['ꭾ']), ('Ꭿ', &['ꭿ']), ('Ꮀ', &['ꮀ']), ('Ꮁ', &['ꮁ']), ('Ꮂ', &['ꮂ']), ('Ꮃ', &['ꮃ']), ('Ꮄ', &['ꮄ']), ('Ꮅ', &['ꮅ']), ('Ꮆ', &['ꮆ']), ('Ꮇ', &['ꮇ']), ('Ꮈ', &['ꮈ']), ('Ꮉ', &['ꮉ']), ('Ꮊ', &['ꮊ']), ('Ꮋ', &['ꮋ']), ('Ꮌ', &['ꮌ']), ('Ꮍ', &['ꮍ']), ('Ꮎ', &['ꮎ']), ('Ꮏ', &['ꮏ']), ('Ꮐ', &['ꮐ']), ('Ꮑ', &['ꮑ']), ('Ꮒ', &['ꮒ']), ('Ꮓ', &['ꮓ']), ('Ꮔ', &['ꮔ']), ('Ꮕ', &['ꮕ']), ('Ꮖ', &['ꮖ']), ('Ꮗ', &['ꮗ']), ('Ꮘ', &['ꮘ']), ('Ꮙ', &['ꮙ']), ('Ꮚ', &['ꮚ']), ('Ꮛ', &['ꮛ']), ('Ꮜ', &['ꮜ']), ('Ꮝ', &['ꮝ']), ('Ꮞ', &['ꮞ']), ('Ꮟ', &['ꮟ']), ('Ꮠ', &['ꮠ']), ('Ꮡ', &['ꮡ']), ('Ꮢ', &['ꮢ']), ('Ꮣ', &['ꮣ']), ('Ꮤ', &['ꮤ']), ('Ꮥ', &['ꮥ']), ('Ꮦ', &['ꮦ']), ('Ꮧ', &['ꮧ']), ('Ꮨ', &['ꮨ']), ('Ꮩ', &['ꮩ']), ('Ꮪ', &['ꮪ']), ('Ꮫ', &['ꮫ']), ('Ꮬ', &['ꮬ']), ('Ꮭ', &['ꮭ']), ('Ꮮ', &['ꮮ']), ('Ꮯ', &['ꮯ']), ('Ꮰ', &['ꮰ']), ('Ꮱ', &['ꮱ']), ('Ꮲ', &['ꮲ']), ('Ꮳ', &['ꮳ']), ('Ꮴ', &['ꮴ']), ('Ꮵ', &['ꮵ']), ('Ꮶ', &['ꮶ']), ('Ꮷ', &['ꮷ']), ('Ꮸ', &['ꮸ']), ('Ꮹ', &['ꮹ']), ('Ꮺ', &['ꮺ']), ('Ꮻ', &['ꮻ']), ('Ꮼ', &['ꮼ']), ('Ꮽ', &['ꮽ']), ('Ꮾ', &['ꮾ']), ('Ꮿ', &['ꮿ']), ('Ᏸ', &['ᏸ']), ('Ᏹ', &['ᏹ']), ('Ᏺ', &['ᏺ']), ('Ᏻ', &['ᏻ']), ('Ᏼ', &['ᏼ']), ('Ᏽ', &['ᏽ']), ('ᏸ', &['Ᏸ']), ('ᏹ', &['Ᏹ']), ('ᏺ', &['Ᏺ']), ('ᏻ', &['Ᏻ']), ('ᏼ', &['Ᏼ']), ('ᏽ', &['Ᏽ']), ('ᲀ', &['В', 'в']), ('ᲁ', &['Д', 'д']), ('ᲂ', &['О', 'о']), ('ᲃ', &['С', 'с']), ('ᲄ', &['Т', 'т', 'ᲅ']), ('ᲅ', &['Т', 'т', 'ᲄ']), ('ᲆ', &['Ъ', 'ъ']), ('ᲇ', &['Ѣ', 'ѣ']), ('ᲈ', &['Ꙋ', 'ꙋ']), ('Ა', &['ა']), ('Ბ', &['ბ']), ('Გ', &['გ']), ('Დ', &['დ']), ('Ე', &['ე']), ('Ვ', &['ვ']), ('Ზ', &['ზ']), ('Თ', &['თ']), ('Ი', &['ი']), ('Კ', &['კ']), ('Ლ', &['ლ']), ('Მ', &['მ']), ('Ნ', &['ნ']), ('Ო', &['ო']), ('Პ', 
&['პ']), ('Ჟ', &['ჟ']), ('Რ', &['რ']), ('Ს', &['ს']), ('Ტ', &['ტ']), ('Უ', &['უ']), ('Ფ', &['ფ']), ('Ქ', &['ქ']), ('Ღ', &['ღ']), ('Ყ', &['ყ']), ('Შ', &['შ']), ('Ჩ', &['ჩ']), ('Ც', &['ც']), ('Ძ', &['ძ']), ('Წ', &['წ']), ('Ჭ', &['ჭ']), ('Ხ', &['ხ']), ('Ჯ', &['ჯ']), ('Ჰ', &['ჰ']), ('Ჱ', &['ჱ']), ('Ჲ', &['ჲ']), ('Ჳ', &['ჳ']), ('Ჴ', &['ჴ']), ('Ჵ', &['ჵ']), ('Ჶ', &['ჶ']), ('Ჷ', &['ჷ']), ('Ჸ', &['ჸ']), ('Ჹ', &['ჹ']), ('Ჺ', &['ჺ']), ('Ჽ', &['ჽ']), ('Ჾ', &['ჾ']), ('Ჿ', &['ჿ']), ('ᵹ', &['Ᵹ']), ('ᵽ', &['Ᵽ']), ('ᶎ', &['Ᶎ']), ('Ḁ', &['ḁ']), ('ḁ', &['Ḁ']), ('Ḃ', &['ḃ']), ('ḃ', &['Ḃ']), ('Ḅ', &['ḅ']), ('ḅ', &['Ḅ']), ('Ḇ', &['ḇ']), ('ḇ', &['Ḇ']), ('Ḉ', &['ḉ']), ('ḉ', &['Ḉ']), ('Ḋ', &['ḋ']), ('ḋ', &['Ḋ']), ('Ḍ', &['ḍ']), ('ḍ', &['Ḍ']), ('Ḏ', &['ḏ']), ('ḏ', &['Ḏ']), ('Ḑ', &['ḑ']), ('ḑ', &['Ḑ']), ('Ḓ', &['ḓ']), ('ḓ', &['Ḓ']), ('Ḕ', &['ḕ']), ('ḕ', &['Ḕ']), ('Ḗ', &['ḗ']), ('ḗ', &['Ḗ']), ('Ḙ', &['ḙ']), ('ḙ', &['Ḙ']), ('Ḛ', &['ḛ']), ('ḛ', &['Ḛ']), ('Ḝ', &['ḝ']), ('ḝ', &['Ḝ']), ('Ḟ', &['ḟ']), ('ḟ', &['Ḟ']), ('Ḡ', &['ḡ']), ('ḡ', &['Ḡ']), ('Ḣ', &['ḣ']), ('ḣ', &['Ḣ']), ('Ḥ', &['ḥ']), ('ḥ', &['Ḥ']), ('Ḧ', &['ḧ']), ('ḧ', &['Ḧ']), ('Ḩ', &['ḩ']), ('ḩ', &['Ḩ']), ('Ḫ', &['ḫ']), ('ḫ', &['Ḫ']), ('Ḭ', &['ḭ']), ('ḭ', &['Ḭ']), ('Ḯ', &['ḯ']), ('ḯ', &['Ḯ']), ('Ḱ', &['ḱ']), ('ḱ', &['Ḱ']), ('Ḳ', &['ḳ']), ('ḳ', &['Ḳ']), ('Ḵ', &['ḵ']), ('ḵ', &['Ḵ']), ('Ḷ', &['ḷ']), ('ḷ', &['Ḷ']), ('Ḹ', &['ḹ']), ('ḹ', &['Ḹ']), ('Ḻ', &['ḻ']), ('ḻ', &['Ḻ']), ('Ḽ', &['ḽ']), ('ḽ', &['Ḽ']), ('Ḿ', &['ḿ']), ('ḿ', &['Ḿ']), ('Ṁ', &['ṁ']), ('ṁ', &['Ṁ']), ('Ṃ', &['ṃ']), ('ṃ', &['Ṃ']), ('Ṅ', &['ṅ']), ('ṅ', &['Ṅ']), ('Ṇ', &['ṇ']), ('ṇ', &['Ṇ']), ('Ṉ', &['ṉ']), ('ṉ', &['Ṉ']), ('Ṋ', &['ṋ']), ('ṋ', &['Ṋ']), ('Ṍ', &['ṍ']), ('ṍ', &['Ṍ']), ('Ṏ', &['ṏ']), ('ṏ', &['Ṏ']), ('Ṑ', &['ṑ']), ('ṑ', &['Ṑ']), ('Ṓ', &['ṓ']), ('ṓ', &['Ṓ']), ('Ṕ', &['ṕ']), ('ṕ', &['Ṕ']), ('Ṗ', &['ṗ']), ('ṗ', &['Ṗ']), ('Ṙ', &['ṙ']), ('ṙ', &['Ṙ']), ('Ṛ', &['ṛ']), ('ṛ', &['Ṛ']), ('Ṝ', &['ṝ']), ('ṝ', &['Ṝ']), ('Ṟ', &['ṟ']), ('ṟ', &['Ṟ']), ('Ṡ', &['ṡ', 'ẛ']), ('ṡ', &['Ṡ', 'ẛ']), 
('Ṣ', &['ṣ']), ('ṣ', &['Ṣ']), ('Ṥ', &['ṥ']), ('ṥ', &['Ṥ']), ('Ṧ', &['ṧ']), ('ṧ', &['Ṧ']), ('Ṩ', &['ṩ']), ('ṩ', &['Ṩ']), ('Ṫ', &['ṫ']), ('ṫ', &['Ṫ']), ('Ṭ', &['ṭ']), ('ṭ', &['Ṭ']), ('Ṯ', &['ṯ']), ('ṯ', &['Ṯ']), ('Ṱ', &['ṱ']), ('ṱ', &['Ṱ']), ('Ṳ', &['ṳ']), ('ṳ', &['Ṳ']), ('Ṵ', &['ṵ']), ('ṵ', &['Ṵ']), ('Ṷ', &['ṷ']), ('ṷ', &['Ṷ']), ('Ṹ', &['ṹ']), ('ṹ', &['Ṹ']), ('Ṻ', &['ṻ']), ('ṻ', &['Ṻ']), ('Ṽ', &['ṽ']), ('ṽ', &['Ṽ']), ('Ṿ', &['ṿ']), ('ṿ', &['Ṿ']), ('Ẁ', &['ẁ']), ('ẁ', &['Ẁ']), ('Ẃ', &['ẃ']), ('ẃ', &['Ẃ']), ('Ẅ', &['ẅ']), ('ẅ', &['Ẅ']), ('Ẇ', &['ẇ']), ('ẇ', &['Ẇ']), ('Ẉ', &['ẉ']), ('ẉ', &['Ẉ']), ('Ẋ', &['ẋ']), ('ẋ', &['Ẋ']), ('Ẍ', &['ẍ']), ('ẍ', &['Ẍ']), ('Ẏ', &['ẏ']), ('ẏ', &['Ẏ']), ('Ẑ', &['ẑ']), ('ẑ', &['Ẑ']), ('Ẓ', &['ẓ']), ('ẓ', &['Ẓ']), ('Ẕ', &['ẕ']), ('ẕ', &['Ẕ']), ('ẛ', &['Ṡ', 'ṡ']), ('ẞ', &['ß']), ('Ạ', &['ạ']), ('ạ', &['Ạ']), ('Ả', &['ả']), ('ả', &['Ả']), ('Ấ', &['ấ']), ('ấ', &['Ấ']), ('Ầ', &['ầ']), ('ầ', &['Ầ']), ('Ẩ', &['ẩ']), ('ẩ', &['Ẩ']), ('Ẫ', &['ẫ']), ('ẫ', &['Ẫ']), ('Ậ', &['ậ']), ('ậ', &['Ậ']), ('Ắ', &['ắ']), ('ắ', &['Ắ']), ('Ằ', &['ằ']), ('ằ', &['Ằ']), ('Ẳ', &['ẳ']), ('ẳ', &['Ẳ']), ('Ẵ', &['ẵ']), ('ẵ', &['Ẵ']), ('Ặ', &['ặ']), ('ặ', &['Ặ']), ('Ẹ', &['ẹ']), ('ẹ', &['Ẹ']), ('Ẻ', &['ẻ']), ('ẻ', &['Ẻ']), ('Ẽ', &['ẽ']), ('ẽ', &['Ẽ']), ('Ế', &['ế']), ('ế', &['Ế']), ('Ề', &['ề']), ('ề', &['Ề']), ('Ể', &['ể']), ('ể', &['Ể']), ('Ễ', &['ễ']), ('ễ', &['Ễ']), ('Ệ', &['ệ']), ('ệ', &['Ệ']), ('Ỉ', &['ỉ']), ('ỉ', &['Ỉ']), ('Ị', &['ị']), ('ị', &['Ị']), ('Ọ', &['ọ']), ('ọ', &['Ọ']), ('Ỏ', &['ỏ']), ('ỏ', &['Ỏ']), ('Ố', &['ố']), ('ố', &['Ố']), ('Ồ', &['ồ']), ('ồ', &['Ồ']), ('Ổ', &['ổ']), ('ổ', &['Ổ']), ('Ỗ', &['ỗ']), ('ỗ', &['Ỗ']), ('Ộ', &['ộ']), ('ộ', &['Ộ']), ('Ớ', &['ớ']), ('ớ', &['Ớ']), ('Ờ', &['ờ']), ('ờ', &['Ờ']), ('Ở', &['ở']), ('ở', &['Ở']), ('Ỡ', &['ỡ']), ('ỡ', &['Ỡ']), ('Ợ', &['ợ']), ('ợ', &['Ợ']), ('Ụ', &['ụ']), ('ụ', &['Ụ']), ('Ủ', &['ủ']), ('ủ', &['Ủ']), ('Ứ', &['ứ']), ('ứ', &['Ứ']), ('Ừ', &['ừ']), ('ừ', &['Ừ']), ('Ử', &['ử']), ('ử', &['Ử']), ('Ữ', &['ữ']), 
('ữ', &['Ữ']), ('Ự', &['ự']), ('ự', &['Ự']), ('Ỳ', &['ỳ']), ('ỳ', &['Ỳ']), ('Ỵ', &['ỵ']), ('ỵ', &['Ỵ']), ('Ỷ', &['ỷ']), ('ỷ', &['Ỷ']), ('Ỹ', &['ỹ']), ('ỹ', &['Ỹ']), ('Ỻ', &['ỻ']), ('ỻ', &['Ỻ']), ('Ỽ', &['ỽ']), ('ỽ', &['Ỽ']), ('Ỿ', &['ỿ']), ('ỿ', &['Ỿ']), ('ἀ', &['Ἀ']), ('ἁ', &['Ἁ']), ('ἂ', &['Ἂ']), ('ἃ', &['Ἃ']), ('ἄ', &['Ἄ']), ('ἅ', &['Ἅ']), ('ἆ', &['Ἆ']), ('ἇ', &['Ἇ']), ('Ἀ', &['ἀ']), ('Ἁ', &['ἁ']), ('Ἂ', &['ἂ']), ('Ἃ', &['ἃ']), ('Ἄ', &['ἄ']), ('Ἅ', &['ἅ']), ('Ἆ', &['ἆ']), ('Ἇ', &['ἇ']), ('ἐ', &['Ἐ']), ('ἑ', &['Ἑ']), ('ἒ', &['Ἒ']), ('ἓ', &['Ἓ']), ('ἔ', &['Ἔ']), ('ἕ', &['Ἕ']), ('Ἐ', &['ἐ']), ('Ἑ', &['ἑ']), ('Ἒ', &['ἒ']), ('Ἓ', &['ἓ']), ('Ἔ', &['ἔ']), ('Ἕ', &['ἕ']), ('ἠ', &['Ἠ']), ('ἡ', &['Ἡ']), ('ἢ', &['Ἢ']), ('ἣ', &['Ἣ']), ('ἤ', &['Ἤ']), ('ἥ', &['Ἥ']), ('ἦ', &['Ἦ']), ('ἧ', &['Ἧ']), ('Ἠ', &['ἠ']), ('Ἡ', &['ἡ']), ('Ἢ', &['ἢ']), ('Ἣ', &['ἣ']), ('Ἤ', &['ἤ']), ('Ἥ', &['ἥ']), ('Ἦ', &['ἦ']), ('Ἧ', &['ἧ']), ('ἰ', &['Ἰ']), ('ἱ', &['Ἱ']), ('ἲ', &['Ἲ']), ('ἳ', &['Ἳ']), ('ἴ', &['Ἴ']), ('ἵ', &['Ἵ']), ('ἶ', &['Ἶ']), ('ἷ', &['Ἷ']), ('Ἰ', &['ἰ']), ('Ἱ', &['ἱ']), ('Ἲ', &['ἲ']), ('Ἳ', &['ἳ']), ('Ἴ', &['ἴ']), ('Ἵ', &['ἵ']), ('Ἶ', &['ἶ']), ('Ἷ', &['ἷ']), ('ὀ', &['Ὀ']), ('ὁ', &['Ὁ']), ('ὂ', &['Ὂ']), ('ὃ', &['Ὃ']), ('ὄ', &['Ὄ']), ('ὅ', &['Ὅ']), ('Ὀ', &['ὀ']), ('Ὁ', &['ὁ']), ('Ὂ', &['ὂ']), ('Ὃ', &['ὃ']), ('Ὄ', &['ὄ']), ('Ὅ', &['ὅ']), ('ὑ', &['Ὑ']), ('ὓ', &['Ὓ']), ('ὕ', &['Ὕ']), ('ὗ', &['Ὗ']), ('Ὑ', &['ὑ']), ('Ὓ', &['ὓ']), ('Ὕ', &['ὕ']), ('Ὗ', &['ὗ']), ('ὠ', &['Ὠ']), ('ὡ', &['Ὡ']), ('ὢ', &['Ὢ']), ('ὣ', &['Ὣ']), ('ὤ', &['Ὤ']), ('ὥ', &['Ὥ']), ('ὦ', &['Ὦ']), ('ὧ', &['Ὧ']), ('Ὠ', &['ὠ']), ('Ὡ', &['ὡ']), ('Ὢ', &['ὢ']), ('Ὣ', &['ὣ']), ('Ὤ', &['ὤ']), ('Ὥ', &['ὥ']), ('Ὦ', &['ὦ']), ('Ὧ', &['ὧ']), ('ὰ', &['Ὰ']), ('ά', &['Ά']), ('ὲ', &['Ὲ']), ('έ', &['Έ']), ('ὴ', &['Ὴ']), ('ή', &['Ή']), ('ὶ', &['Ὶ']), ('ί', &['Ί']), ('ὸ', &['Ὸ']), ('ό', &['Ό']), ('ὺ', &['Ὺ']), ('ύ', &['Ύ']), ('ὼ', &['Ὼ']), ('ώ', &['Ώ']), ('ᾀ', &['ᾈ']), ('ᾁ', &['ᾉ']), ('ᾂ', &['ᾊ']), ('ᾃ', &['ᾋ']), ('ᾄ', &['ᾌ']), ('ᾅ', &['ᾍ']), 
('ᾆ', &['ᾎ']), ('ᾇ', &['ᾏ']), ('ᾈ', &['ᾀ']), ('ᾉ', &['ᾁ']), ('ᾊ', &['ᾂ']), ('ᾋ', &['ᾃ']), ('ᾌ', &['ᾄ']), ('ᾍ', &['ᾅ']), ('ᾎ', &['ᾆ']), ('ᾏ', &['ᾇ']), ('ᾐ', &['ᾘ']), ('ᾑ', &['ᾙ']), ('ᾒ', &['ᾚ']), ('ᾓ', &['ᾛ']), ('ᾔ', &['ᾜ']), ('ᾕ', &['ᾝ']), ('ᾖ', &['ᾞ']), ('ᾗ', &['ᾟ']), ('ᾘ', &['ᾐ']), ('ᾙ', &['ᾑ']), ('ᾚ', &['ᾒ']), ('ᾛ', &['ᾓ']), ('ᾜ', &['ᾔ']), ('ᾝ', &['ᾕ']), ('ᾞ', &['ᾖ']), ('ᾟ', &['ᾗ']), ('ᾠ', &['ᾨ']), ('ᾡ', &['ᾩ']), ('ᾢ', &['ᾪ']), ('ᾣ', &['ᾫ']), ('ᾤ', &['ᾬ']), ('ᾥ', &['ᾭ']), ('ᾦ', &['ᾮ']), ('ᾧ', &['ᾯ']), ('ᾨ', &['ᾠ']), ('ᾩ', &['ᾡ']), ('ᾪ', &['ᾢ']), ('ᾫ', &['ᾣ']), ('ᾬ', &['ᾤ']), ('ᾭ', &['ᾥ']), ('ᾮ', &['ᾦ']), ('ᾯ', &['ᾧ']), ('ᾰ', &['Ᾰ']), ('ᾱ', &['Ᾱ']), ('ᾳ', &['ᾼ']), ('Ᾰ', &['ᾰ']), ('Ᾱ', &['ᾱ']), ('Ὰ', &['ὰ']), ('Ά', &['ά']), ('ᾼ', &['ᾳ']), ('ι', &['\u{345}', 'Ι', 'ι']), ('ῃ', &['ῌ']), ('Ὲ', &['ὲ']), ('Έ', &['έ']), ('Ὴ', &['ὴ']), ('Ή', &['ή']), ('ῌ', &['ῃ']), ('ῐ', &['Ῐ']), ('ῑ', &['Ῑ']), ('Ῐ', &['ῐ']), ('Ῑ', &['ῑ']), ('Ὶ', &['ὶ']), ('Ί', &['ί']), ('ῠ', &['Ῠ']), ('ῡ', &['Ῡ']), ('ῥ', &['Ῥ']), ('Ῠ', &['ῠ']), ('Ῡ', &['ῡ']), ('Ὺ', &['ὺ']), ('Ύ', &['ύ']), ('Ῥ', &['ῥ']), ('ῳ', &['ῼ']), ('Ὸ', &['ὸ']), ('Ό', &['ό']), ('Ὼ', &['ὼ']), ('Ώ', &['ώ']), ('ῼ', &['ῳ']), ('Ω', &['Ω', 'ω']), ('K', &['K', 'k']), ('Å', &['Å', 'å']), ('Ⅎ', &['ⅎ']), ('ⅎ', &['Ⅎ']), ('Ⅰ', &['ⅰ']), ('Ⅱ', &['ⅱ']), ('Ⅲ', &['ⅲ']), ('Ⅳ', &['ⅳ']), ('Ⅴ', &['ⅴ']), ('Ⅵ', &['ⅵ']), ('Ⅶ', &['ⅶ']), ('Ⅷ', &['ⅷ']), ('Ⅸ', &['ⅸ']), ('Ⅹ', &['ⅹ']), ('Ⅺ', &['ⅺ']), ('Ⅻ', &['ⅻ']), ('Ⅼ', &['ⅼ']), ('Ⅽ', &['ⅽ']), ('Ⅾ', &['ⅾ']), ('Ⅿ', &['ⅿ']), ('ⅰ', &['Ⅰ']), ('ⅱ', &['Ⅱ']), ('ⅲ', &['Ⅲ']), ('ⅳ', &['Ⅳ']), ('ⅴ', &['Ⅴ']), ('ⅵ', &['Ⅵ']), ('ⅶ', &['Ⅶ']), ('ⅷ', &['Ⅷ']), ('ⅸ', &['Ⅸ']), ('ⅹ', &['Ⅹ']), ('ⅺ', &['Ⅺ']), ('ⅻ', &['Ⅻ']), ('ⅼ', &['Ⅼ']), ('ⅽ', &['Ⅽ']), ('ⅾ', &['Ⅾ']), ('ⅿ', &['Ⅿ']), ('Ↄ', &['ↄ']), ('ↄ', &['Ↄ']), ('Ⓐ', &['ⓐ']), ('Ⓑ', &['ⓑ']), ('Ⓒ', &['ⓒ']), ('Ⓓ', &['ⓓ']), ('Ⓔ', &['ⓔ']), ('Ⓕ', &['ⓕ']), ('Ⓖ', &['ⓖ']), ('Ⓗ', &['ⓗ']), ('Ⓘ', &['ⓘ']), ('Ⓙ', &['ⓙ']), ('Ⓚ', &['ⓚ']), ('Ⓛ', &['ⓛ']), ('Ⓜ', &['ⓜ']), ('Ⓝ', &['ⓝ']), ('Ⓞ', &['ⓞ']), 
('Ⓟ', &['ⓟ']), ('Ⓠ', &['ⓠ']), ('Ⓡ', &['ⓡ']), ('Ⓢ', &['ⓢ']), ('Ⓣ', &['ⓣ']), ('Ⓤ', &['ⓤ']), ('Ⓥ', &['ⓥ']), ('Ⓦ', &['ⓦ']), ('Ⓧ', &['ⓧ']), ('Ⓨ', &['ⓨ']), ('Ⓩ', &['ⓩ']), ('ⓐ', &['Ⓐ']), ('ⓑ', &['Ⓑ']), ('ⓒ', &['Ⓒ']), ('ⓓ', &['Ⓓ']), ('ⓔ', &['Ⓔ']), ('ⓕ', &['Ⓕ']), ('ⓖ', &['Ⓖ']), ('ⓗ', &['Ⓗ']), ('ⓘ', &['Ⓘ']), ('ⓙ', &['Ⓙ']), ('ⓚ', &['Ⓚ']), ('ⓛ', &['Ⓛ']), ('ⓜ', &['Ⓜ']), ('ⓝ', &['Ⓝ']), ('ⓞ', &['Ⓞ']), ('ⓟ', &['Ⓟ']), ('ⓠ', &['Ⓠ']), ('ⓡ', &['Ⓡ']), ('ⓢ', &['Ⓢ']), ('ⓣ', &['Ⓣ']), ('ⓤ', &['Ⓤ']), ('ⓥ', &['Ⓥ']), ('ⓦ', &['Ⓦ']), ('ⓧ', &['Ⓧ']), ('ⓨ', &['Ⓨ']), ('ⓩ', &['Ⓩ']), ('Ⰰ', &['ⰰ']), ('Ⰱ', &['ⰱ']), ('Ⰲ', &['ⰲ']), ('Ⰳ', &['ⰳ']), ('Ⰴ', &['ⰴ']), ('Ⰵ', &['ⰵ']), ('Ⰶ', &['ⰶ']), ('Ⰷ', &['ⰷ']), ('Ⰸ', &['ⰸ']), ('Ⰹ', &['ⰹ']), ('Ⰺ', &['ⰺ']), ('Ⰻ', &['ⰻ']), ('Ⰼ', &['ⰼ']), ('Ⰽ', &['ⰽ']), ('Ⰾ', &['ⰾ']), ('Ⰿ', &['ⰿ']), ('Ⱀ', &['ⱀ']), ('Ⱁ', &['ⱁ']), ('Ⱂ', &['ⱂ']), ('Ⱃ', &['ⱃ']), ('Ⱄ', &['ⱄ']), ('Ⱅ', &['ⱅ']), ('Ⱆ', &['ⱆ']), ('Ⱇ', &['ⱇ']), ('Ⱈ', &['ⱈ']), ('Ⱉ', &['ⱉ']), ('Ⱊ', &['ⱊ']), ('Ⱋ', &['ⱋ']), ('Ⱌ', &['ⱌ']), ('Ⱍ', &['ⱍ']), ('Ⱎ', &['ⱎ']), ('Ⱏ', &['ⱏ']), ('Ⱐ', &['ⱐ']), ('Ⱑ', &['ⱑ']), ('Ⱒ', &['ⱒ']), ('Ⱓ', &['ⱓ']), ('Ⱔ', &['ⱔ']), ('Ⱕ', &['ⱕ']), ('Ⱖ', &['ⱖ']), ('Ⱗ', &['ⱗ']), ('Ⱘ', &['ⱘ']), ('Ⱙ', &['ⱙ']), ('Ⱚ', &['ⱚ']), ('Ⱛ', &['ⱛ']), ('Ⱜ', &['ⱜ']), ('Ⱝ', &['ⱝ']), ('Ⱞ', &['ⱞ']), ('Ⱟ', &['ⱟ']), ('ⰰ', &['Ⰰ']), ('ⰱ', &['Ⰱ']), ('ⰲ', &['Ⰲ']), ('ⰳ', &['Ⰳ']), ('ⰴ', &['Ⰴ']), ('ⰵ', &['Ⰵ']), ('ⰶ', &['Ⰶ']), ('ⰷ', &['Ⰷ']), ('ⰸ', &['Ⰸ']), ('ⰹ', &['Ⰹ']), ('ⰺ', &['Ⰺ']), ('ⰻ', &['Ⰻ']), ('ⰼ', &['Ⰼ']), ('ⰽ', &['Ⰽ']), ('ⰾ', &['Ⰾ']), ('ⰿ', &['Ⰿ']), ('ⱀ', &['Ⱀ']), ('ⱁ', &['Ⱁ']), ('ⱂ', &['Ⱂ']), ('ⱃ', &['Ⱃ']), ('ⱄ', &['Ⱄ']), ('ⱅ', &['Ⱅ']), ('ⱆ', &['Ⱆ']), ('ⱇ', &['Ⱇ']), ('ⱈ', &['Ⱈ']), ('ⱉ', &['Ⱉ']), ('ⱊ', &['Ⱊ']), ('ⱋ', &['Ⱋ']), ('ⱌ', &['Ⱌ']), ('ⱍ', &['Ⱍ']), ('ⱎ', &['Ⱎ']), ('ⱏ', &['Ⱏ']), ('ⱐ', &['Ⱐ']), ('ⱑ', &['Ⱑ']), ('ⱒ', &['Ⱒ']), ('ⱓ', &['Ⱓ']), ('ⱔ', &['Ⱔ']), ('ⱕ', &['Ⱕ']), ('ⱖ', &['Ⱖ']), ('ⱗ', &['Ⱗ']), ('ⱘ', &['Ⱘ']), ('ⱙ', &['Ⱙ']), ('ⱚ', &['Ⱚ']), ('ⱛ', &['Ⱛ']), ('ⱜ', &['Ⱜ']), ('ⱝ', &['Ⱝ']), ('ⱞ', &['Ⱞ']), ('ⱟ', &['Ⱟ']), 
('Ⱡ', &['ⱡ']), ('ⱡ', &['Ⱡ']), ('Ɫ', &['ɫ']), ('Ᵽ', &['ᵽ']), ('Ɽ', &['ɽ']), ('ⱥ', &['Ⱥ']), ('ⱦ', &['Ⱦ']), ('Ⱨ', &['ⱨ']), ('ⱨ', &['Ⱨ']), ('Ⱪ', &['ⱪ']), ('ⱪ', &['Ⱪ']), ('Ⱬ', &['ⱬ']), ('ⱬ', &['Ⱬ']), ('Ɑ', &['ɑ']), ('Ɱ', &['ɱ']), ('Ɐ', &['ɐ']), ('Ɒ', &['ɒ']), ('Ⱳ', &['ⱳ']), ('ⱳ', &['Ⱳ']), ('Ⱶ', &['ⱶ']), ('ⱶ', &['Ⱶ']), ('Ȿ', &['ȿ']), ('Ɀ', &['ɀ']), ('Ⲁ', &['ⲁ']), ('ⲁ', &['Ⲁ']), ('Ⲃ', &['ⲃ']), ('ⲃ', &['Ⲃ']), ('Ⲅ', &['ⲅ']), ('ⲅ', &['Ⲅ']), ('Ⲇ', &['ⲇ']), ('ⲇ', &['Ⲇ']), ('Ⲉ', &['ⲉ']), ('ⲉ', &['Ⲉ']), ('Ⲋ', &['ⲋ']), ('ⲋ', &['Ⲋ']), ('Ⲍ', &['ⲍ']), ('ⲍ', &['Ⲍ']), ('Ⲏ', &['ⲏ']), ('ⲏ', &['Ⲏ']), ('Ⲑ', &['ⲑ']), ('ⲑ', &['Ⲑ']), ('Ⲓ', &['ⲓ']), ('ⲓ', &['Ⲓ']), ('Ⲕ', &['ⲕ']), ('ⲕ', &['Ⲕ']), ('Ⲗ', &['ⲗ']), ('ⲗ', &['Ⲗ']), ('Ⲙ', &['ⲙ']), ('ⲙ', &['Ⲙ']), ('Ⲛ', &['ⲛ']), ('ⲛ', &['Ⲛ']), ('Ⲝ', &['ⲝ']), ('ⲝ', &['Ⲝ']), ('Ⲟ', &['ⲟ']), ('ⲟ', &['Ⲟ']), ('Ⲡ', &['ⲡ']), ('ⲡ', &['Ⲡ']), ('Ⲣ', &['ⲣ']), ('ⲣ', &['Ⲣ']), ('Ⲥ', &['ⲥ']), ('ⲥ', &['Ⲥ']), ('Ⲧ', &['ⲧ']), ('ⲧ', &['Ⲧ']), ('Ⲩ', &['ⲩ']), ('ⲩ', &['Ⲩ']), ('Ⲫ', &['ⲫ']), ('ⲫ', &['Ⲫ']), ('Ⲭ', &['ⲭ']), ('ⲭ', &['Ⲭ']), ('Ⲯ', &['ⲯ']), ('ⲯ', &['Ⲯ']), ('Ⲱ', &['ⲱ']), ('ⲱ', &['Ⲱ']), ('Ⲳ', &['ⲳ']), ('ⲳ', &['Ⲳ']), ('Ⲵ', &['ⲵ']), ('ⲵ', &['Ⲵ']), ('Ⲷ', &['ⲷ']), ('ⲷ', &['Ⲷ']), ('Ⲹ', &['ⲹ']), ('ⲹ', &['Ⲹ']), ('Ⲻ', &['ⲻ']), ('ⲻ', &['Ⲻ']), ('Ⲽ', &['ⲽ']), ('ⲽ', &['Ⲽ']), ('Ⲿ', &['ⲿ']), ('ⲿ', &['Ⲿ']), ('Ⳁ', &['ⳁ']), ('ⳁ', &['Ⳁ']), ('Ⳃ', &['ⳃ']), ('ⳃ', &['Ⳃ']), ('Ⳅ', &['ⳅ']), ('ⳅ', &['Ⳅ']), ('Ⳇ', &['ⳇ']), ('ⳇ', &['Ⳇ']), ('Ⳉ', &['ⳉ']), ('ⳉ', &['Ⳉ']), ('Ⳋ', &['ⳋ']), ('ⳋ', &['Ⳋ']), ('Ⳍ', &['ⳍ']), ('ⳍ', &['Ⳍ']), ('Ⳏ', &['ⳏ']), ('ⳏ', &['Ⳏ']), ('Ⳑ', &['ⳑ']), ('ⳑ', &['Ⳑ']), ('Ⳓ', &['ⳓ']), ('ⳓ', &['Ⳓ']), ('Ⳕ', &['ⳕ']), ('ⳕ', &['Ⳕ']), ('Ⳗ', &['ⳗ']), ('ⳗ', &['Ⳗ']), ('Ⳙ', &['ⳙ']), ('ⳙ', &['Ⳙ']), ('Ⳛ', &['ⳛ']), ('ⳛ', &['Ⳛ']), ('Ⳝ', &['ⳝ']), ('ⳝ', &['Ⳝ']), ('Ⳟ', &['ⳟ']), ('ⳟ', &['Ⳟ']), ('Ⳡ', &['ⳡ']), ('ⳡ', &['Ⳡ']), ('Ⳣ', &['ⳣ']), ('ⳣ', &['Ⳣ']), ('Ⳬ', &['ⳬ']), ('ⳬ', &['Ⳬ']), ('Ⳮ', &['ⳮ']), ('ⳮ', &['Ⳮ']), ('Ⳳ', &['ⳳ']), ('ⳳ', &['Ⳳ']), ('ⴀ', &['Ⴀ']), ('ⴁ', &['Ⴁ']), ('ⴂ', &['Ⴂ']), ('ⴃ', &['Ⴃ']), 
('ⴄ', &['Ⴄ']), ('ⴅ', &['Ⴅ']), ('ⴆ', &['Ⴆ']), ('ⴇ', &['Ⴇ']), ('ⴈ', &['Ⴈ']), ('ⴉ', &['Ⴉ']), ('ⴊ', &['Ⴊ']), ('ⴋ', &['Ⴋ']), ('ⴌ', &['Ⴌ']), ('ⴍ', &['Ⴍ']), ('ⴎ', &['Ⴎ']), ('ⴏ', &['Ⴏ']), ('ⴐ', &['Ⴐ']), ('ⴑ', &['Ⴑ']), ('ⴒ', &['Ⴒ']), ('ⴓ', &['Ⴓ']), ('ⴔ', &['Ⴔ']), ('ⴕ', &['Ⴕ']), ('ⴖ', &['Ⴖ']), ('ⴗ', &['Ⴗ']), ('ⴘ', &['Ⴘ']), ('ⴙ', &['Ⴙ']), ('ⴚ', &['Ⴚ']), ('ⴛ', &['Ⴛ']), ('ⴜ', &['Ⴜ']), ('ⴝ', &['Ⴝ']), ('ⴞ', &['Ⴞ']), ('ⴟ', &['Ⴟ']), ('ⴠ', &['Ⴠ']), ('ⴡ', &['Ⴡ']), ('ⴢ', &['Ⴢ']), ('ⴣ', &['Ⴣ']), ('ⴤ', &['Ⴤ']), ('ⴥ', &['Ⴥ']), ('ⴧ', &['Ⴧ']), ('ⴭ', &['Ⴭ']), ('Ꙁ', &['ꙁ']), ('ꙁ', &['Ꙁ']), ('Ꙃ', &['ꙃ']), ('ꙃ', &['Ꙃ']), ('Ꙅ', &['ꙅ']), ('ꙅ', &['Ꙅ']), ('Ꙇ', &['ꙇ']), ('ꙇ', &['Ꙇ']), ('Ꙉ', &['ꙉ']), ('ꙉ', &['Ꙉ']), ('Ꙋ', &['ᲈ', 'ꙋ']), ('ꙋ', &['ᲈ', 'Ꙋ']), ('Ꙍ', &['ꙍ']), ('ꙍ', &['Ꙍ']), ('Ꙏ', &['ꙏ']), ('ꙏ', &['Ꙏ']), ('Ꙑ', &['ꙑ']), ('ꙑ', &['Ꙑ']), ('Ꙓ', &['ꙓ']), ('ꙓ', &['Ꙓ']), ('Ꙕ', &['ꙕ']), ('ꙕ', &['Ꙕ']), ('Ꙗ', &['ꙗ']), ('ꙗ', &['Ꙗ']), ('Ꙙ', &['ꙙ']), ('ꙙ', &['Ꙙ']), ('Ꙛ', &['ꙛ']), ('ꙛ', &['Ꙛ']), ('Ꙝ', &['ꙝ']), ('ꙝ', &['Ꙝ']), ('Ꙟ', &['ꙟ']), ('ꙟ', &['Ꙟ']), ('Ꙡ', &['ꙡ']), ('ꙡ', &['Ꙡ']), ('Ꙣ', &['ꙣ']), ('ꙣ', &['Ꙣ']), ('Ꙥ', &['ꙥ']), ('ꙥ', &['Ꙥ']), ('Ꙧ', &['ꙧ']), ('ꙧ', &['Ꙧ']), ('Ꙩ', &['ꙩ']), ('ꙩ', &['Ꙩ']), ('Ꙫ', &['ꙫ']), ('ꙫ', &['Ꙫ']), ('Ꙭ', &['ꙭ']), ('ꙭ', &['Ꙭ']), ('Ꚁ', &['ꚁ']), ('ꚁ', &['Ꚁ']), ('Ꚃ', &['ꚃ']), ('ꚃ', &['Ꚃ']), ('Ꚅ', &['ꚅ']), ('ꚅ', &['Ꚅ']), ('Ꚇ', &['ꚇ']), ('ꚇ', &['Ꚇ']), ('Ꚉ', &['ꚉ']), ('ꚉ', &['Ꚉ']), ('Ꚋ', &['ꚋ']), ('ꚋ', &['Ꚋ']), ('Ꚍ', &['ꚍ']), ('ꚍ', &['Ꚍ']), ('Ꚏ', &['ꚏ']), ('ꚏ', &['Ꚏ']), ('Ꚑ', &['ꚑ']), ('ꚑ', &['Ꚑ']), ('Ꚓ', &['ꚓ']), ('ꚓ', &['Ꚓ']), ('Ꚕ', &['ꚕ']), ('ꚕ', &['Ꚕ']), ('Ꚗ', &['ꚗ']), ('ꚗ', &['Ꚗ']), ('Ꚙ', &['ꚙ']), ('ꚙ', &['Ꚙ']), ('Ꚛ', &['ꚛ']), ('ꚛ', &['Ꚛ']), ('Ꜣ', &['ꜣ']), ('ꜣ', &['Ꜣ']), ('Ꜥ', &['ꜥ']), ('ꜥ', &['Ꜥ']), ('Ꜧ', &['ꜧ']), ('ꜧ', &['Ꜧ']), ('Ꜩ', &['ꜩ']), ('ꜩ', &['Ꜩ']), ('Ꜫ', &['ꜫ']), ('ꜫ', &['Ꜫ']), ('Ꜭ', &['ꜭ']), ('ꜭ', &['Ꜭ']), ('Ꜯ', &['ꜯ']), ('ꜯ', &['Ꜯ']), ('Ꜳ', &['ꜳ']), ('ꜳ', &['Ꜳ']), ('Ꜵ', &['ꜵ']), ('ꜵ', &['Ꜵ']), ('Ꜷ', &['ꜷ']), ('ꜷ', &['Ꜷ']), ('Ꜹ', &['ꜹ']), ('ꜹ', &['Ꜹ']), ('Ꜻ', 
&['ꜻ']), ('ꜻ', &['Ꜻ']), ('Ꜽ', &['ꜽ']), ('ꜽ', &['Ꜽ']), ('Ꜿ', &['ꜿ']), ('ꜿ', &['Ꜿ']), ('Ꝁ', &['ꝁ']), ('ꝁ', &['Ꝁ']), ('Ꝃ', &['ꝃ']), ('ꝃ', &['Ꝃ']), ('Ꝅ', &['ꝅ']), ('ꝅ', &['Ꝅ']), ('Ꝇ', &['ꝇ']), ('ꝇ', &['Ꝇ']), ('Ꝉ', &['ꝉ']), ('ꝉ', &['Ꝉ']), ('Ꝋ', &['ꝋ']), ('ꝋ', &['Ꝋ']), ('Ꝍ', &['ꝍ']), ('ꝍ', &['Ꝍ']), ('Ꝏ', &['ꝏ']), ('ꝏ', &['Ꝏ']), ('Ꝑ', &['ꝑ']), ('ꝑ', &['Ꝑ']), ('Ꝓ', &['ꝓ']), ('ꝓ', &['Ꝓ']), ('Ꝕ', &['ꝕ']), ('ꝕ', &['Ꝕ']), ('Ꝗ', &['ꝗ']), ('ꝗ', &['Ꝗ']), ('Ꝙ', &['ꝙ']), ('ꝙ', &['Ꝙ']), ('Ꝛ', &['ꝛ']), ('ꝛ', &['Ꝛ']), ('Ꝝ', &['ꝝ']), ('ꝝ', &['Ꝝ']), ('Ꝟ', &['ꝟ']), ('ꝟ', &['Ꝟ']), ('Ꝡ', &['ꝡ']), ('ꝡ', &['Ꝡ']), ('Ꝣ', &['ꝣ']), ('ꝣ', &['Ꝣ']), ('Ꝥ', &['ꝥ']), ('ꝥ', &['Ꝥ']), ('Ꝧ', &['ꝧ']), ('ꝧ', &['Ꝧ']), ('Ꝩ', &['ꝩ']), ('ꝩ', &['Ꝩ']), ('Ꝫ', &['ꝫ']), ('ꝫ', &['Ꝫ']), ('Ꝭ', &['ꝭ']), ('ꝭ', &['Ꝭ']), ('Ꝯ', &['ꝯ']), ('ꝯ', &['Ꝯ']), ('Ꝺ', &['ꝺ']), ('ꝺ', &['Ꝺ']), ('Ꝼ', &['ꝼ']), ('ꝼ', &['Ꝼ']), ('Ᵹ', &['ᵹ']), ('Ꝿ', &['ꝿ']), ('ꝿ', &['Ꝿ']), ('Ꞁ', &['ꞁ']), ('ꞁ', &['Ꞁ']), ('Ꞃ', &['ꞃ']), ('ꞃ', &['Ꞃ']), ('Ꞅ', &['ꞅ']), ('ꞅ', &['Ꞅ']), ('Ꞇ', &['ꞇ']), ('ꞇ', &['Ꞇ']), ('Ꞌ', &['ꞌ']), ('ꞌ', &['Ꞌ']), ('Ɥ', &['ɥ']), ('Ꞑ', &['ꞑ']), ('ꞑ', &['Ꞑ']), ('Ꞓ', &['ꞓ']), ('ꞓ', &['Ꞓ']), ('ꞔ', &['Ꞔ']), ('Ꞗ', &['ꞗ']), ('ꞗ', &['Ꞗ']), ('Ꞙ', &['ꞙ']), ('ꞙ', &['Ꞙ']), ('Ꞛ', &['ꞛ']), ('ꞛ', &['Ꞛ']), ('Ꞝ', &['ꞝ']), ('ꞝ', &['Ꞝ']), ('Ꞟ', &['ꞟ']), ('ꞟ', &['Ꞟ']), ('Ꞡ', &['ꞡ']), ('ꞡ', &['Ꞡ']), ('Ꞣ', &['ꞣ']), ('ꞣ', &['Ꞣ']), ('Ꞥ', &['ꞥ']), ('ꞥ', &['Ꞥ']), ('Ꞧ', &['ꞧ']), ('ꞧ', &['Ꞧ']), ('Ꞩ', &['ꞩ']), ('ꞩ', &['Ꞩ']), ('Ɦ', &['ɦ']), ('Ɜ', &['ɜ']), ('Ɡ', &['ɡ']), ('Ɬ', &['ɬ']), ('Ɪ', &['ɪ']), ('Ʞ', &['ʞ']), ('Ʇ', &['ʇ']), ('Ʝ', &['ʝ']), ('Ꭓ', &['ꭓ']), ('Ꞵ', &['ꞵ']), ('ꞵ', &['Ꞵ']), ('Ꞷ', &['ꞷ']), ('ꞷ', &['Ꞷ']), ('Ꞹ', &['ꞹ']), ('ꞹ', &['Ꞹ']), ('Ꞻ', &['ꞻ']), ('ꞻ', &['Ꞻ']), ('Ꞽ', &['ꞽ']), ('ꞽ', &['Ꞽ']), ('Ꞿ', &['ꞿ']), ('ꞿ', &['Ꞿ']), ('Ꟁ', &['ꟁ']), ('ꟁ', &['Ꟁ']), ('Ꟃ', &['ꟃ']), ('ꟃ', &['Ꟃ']), ('Ꞔ', &['ꞔ']), ('Ʂ', &['ʂ']), ('Ᶎ', &['ᶎ']), ('Ꟈ', &['ꟈ']), ('ꟈ', &['Ꟈ']), ('Ꟊ', &['ꟊ']), ('ꟊ', &['Ꟊ']), ('Ꟑ', &['ꟑ']), ('ꟑ', &['Ꟑ']), ('Ꟗ', &['ꟗ']), ('ꟗ', &['Ꟗ']), ('Ꟙ', 
&['ꟙ']), ('ꟙ', &['Ꟙ']), ('Ꟶ', &['ꟶ']), ('ꟶ', &['Ꟶ']), ('ꭓ', &['Ꭓ']), ('ꭰ', &['Ꭰ']), ('ꭱ', &['Ꭱ']), ('ꭲ', &['Ꭲ']), ('ꭳ', &['Ꭳ']), ('ꭴ', &['Ꭴ']), ('ꭵ', &['Ꭵ']), ('ꭶ', &['Ꭶ']), ('ꭷ', &['Ꭷ']), ('ꭸ', &['Ꭸ']), ('ꭹ', &['Ꭹ']), ('ꭺ', &['Ꭺ']), ('ꭻ', &['Ꭻ']), ('ꭼ', &['Ꭼ']), ('ꭽ', &['Ꭽ']), ('ꭾ', &['Ꭾ']), ('ꭿ', &['Ꭿ']), ('ꮀ', &['Ꮀ']), ('ꮁ', &['Ꮁ']), ('ꮂ', &['Ꮂ']), ('ꮃ', &['Ꮃ']), ('ꮄ', &['Ꮄ']), ('ꮅ', &['Ꮅ']), ('ꮆ', &['Ꮆ']), ('ꮇ', &['Ꮇ']), ('ꮈ', &['Ꮈ']), ('ꮉ', &['Ꮉ']), ('ꮊ', &['Ꮊ']), ('ꮋ', &['Ꮋ']), ('ꮌ', &['Ꮌ']), ('ꮍ', &['Ꮍ']), ('ꮎ', &['Ꮎ']), ('ꮏ', &['Ꮏ']), ('ꮐ', &['Ꮐ']), ('ꮑ', &['Ꮑ']), ('ꮒ', &['Ꮒ']), ('ꮓ', &['Ꮓ']), ('ꮔ', &['Ꮔ']), ('ꮕ', &['Ꮕ']), ('ꮖ', &['Ꮖ']), ('ꮗ', &['Ꮗ']), ('ꮘ', &['Ꮘ']), ('ꮙ', &['Ꮙ']), ('ꮚ', &['Ꮚ']), ('ꮛ', &['Ꮛ']), ('ꮜ', &['Ꮜ']), ('ꮝ', &['Ꮝ']), ('ꮞ', &['Ꮞ']), ('ꮟ', &['Ꮟ']), ('ꮠ', &['Ꮠ']), ('ꮡ', &['Ꮡ']), ('ꮢ', &['Ꮢ']), ('ꮣ', &['Ꮣ']), ('ꮤ', &['Ꮤ']), ('ꮥ', &['Ꮥ']), ('ꮦ', &['Ꮦ']), ('ꮧ', &['Ꮧ']), ('ꮨ', &['Ꮨ']), ('ꮩ', &['Ꮩ']), ('ꮪ', &['Ꮪ']), ('ꮫ', &['Ꮫ']), ('ꮬ', &['Ꮬ']), ('ꮭ', &['Ꮭ']), ('ꮮ', &['Ꮮ']), ('ꮯ', &['Ꮯ']), ('ꮰ', &['Ꮰ']), ('ꮱ', &['Ꮱ']), ('ꮲ', &['Ꮲ']), ('ꮳ', &['Ꮳ']), ('ꮴ', &['Ꮴ']), ('ꮵ', &['Ꮵ']), ('ꮶ', &['Ꮶ']), ('ꮷ', &['Ꮷ']), ('ꮸ', &['Ꮸ']), ('ꮹ', &['Ꮹ']), ('ꮺ', &['Ꮺ']), ('ꮻ', &['Ꮻ']), ('ꮼ', &['Ꮼ']), ('ꮽ', &['Ꮽ']), ('ꮾ', &['Ꮾ']), ('ꮿ', &['Ꮿ']), ('A', &['a']), ('B', &['b']), ('C', &['c']), ('D', &['d']), ('E', &['e']), ('F', &['f']), ('G', &['g']), ('H', &['h']), ('I', &['i']), ('J', &['j']), ('K', &['k']), ('L', &['l']), ('M', &['m']), ('N', &['n']), ('O', &['o']), ('P', &['p']), ('Q', &['q']), ('R', &['r']), ('S', &['s']), ('T', &['t']), ('U', &['u']), ('V', &['v']), ('W', &['w']), ('X', &['x']), ('Y', &['y']), ('Z', &['z']), ('a', &['A']), ('b', &['B']), ('c', &['C']), ('d', &['D']), ('e', &['E']), ('f', &['F']), ('g', &['G']), ('h', &['H']), ('i', &['I']), ('j', &['J']), ('k', &['K']), ('l', &['L']), ('m', &['M']), ('n', &['N']), ('o', &['O']), ('p', &['P']), ('q', &['Q']), ('r', &['R']), ('s', &['S']), ('t', &['T']), ('u', &['U']), ('v', &['V']), ('w', 
&['W']), ('x', &['X']), ('y', &['Y']), ('z', &['Z']), ('𐐀', &['𐐨']), ('𐐁', &['𐐩']), ('𐐂', &['𐐪']), ('𐐃', &['𐐫']), ('𐐄', &['𐐬']), ('𐐅', &['𐐭']), ('𐐆', &['𐐮']), ('𐐇', &['𐐯']), ('𐐈', &['𐐰']), ('𐐉', &['𐐱']), ('𐐊', &['𐐲']), ('𐐋', &['𐐳']), ('𐐌', &['𐐴']), ('𐐍', &['𐐵']), ('𐐎', &['𐐶']), ('𐐏', &['𐐷']), ('𐐐', &['𐐸']), ('𐐑', &['𐐹']), ('𐐒', &['𐐺']), ('𐐓', &['𐐻']), ('𐐔', &['𐐼']), ('𐐕', &['𐐽']), ('𐐖', &['𐐾']), ('𐐗', &['𐐿']), ('𐐘', &['𐑀']), ('𐐙', &['𐑁']), ('𐐚', &['𐑂']), ('𐐛', &['𐑃']), ('𐐜', &['𐑄']), ('𐐝', &['𐑅']), ('𐐞', &['𐑆']), ('𐐟', &['𐑇']), ('𐐠', &['𐑈']), ('𐐡', &['𐑉']), ('𐐢', &['𐑊']), ('𐐣', &['𐑋']), ('𐐤', &['𐑌']), ('𐐥', &['𐑍']), ('𐐦', &['𐑎']), ('𐐧', &['𐑏']), ('𐐨', &['𐐀']), ('𐐩', &['𐐁']), ('𐐪', &['𐐂']), ('𐐫', &['𐐃']), ('𐐬', &['𐐄']), ('𐐭', &['𐐅']), ('𐐮', &['𐐆']), ('𐐯', &['𐐇']), ('𐐰', &['𐐈']), ('𐐱', &['𐐉']), ('𐐲', &['𐐊']), ('𐐳', &['𐐋']), ('𐐴', &['𐐌']), ('𐐵', &['𐐍']), ('𐐶', &['𐐎']), ('𐐷', &['𐐏']), ('𐐸', &['𐐐']), ('𐐹', &['𐐑']), ('𐐺', &['𐐒']), ('𐐻', &['𐐓']), ('𐐼', &['𐐔']), ('𐐽', &['𐐕']), ('𐐾', &['𐐖']), ('𐐿', &['𐐗']), ('𐑀', &['𐐘']), ('𐑁', &['𐐙']), ('𐑂', &['𐐚']), ('𐑃', &['𐐛']), ('𐑄', &['𐐜']), ('𐑅', &['𐐝']), ('𐑆', &['𐐞']), ('𐑇', &['𐐟']), ('𐑈', &['𐐠']), ('𐑉', &['𐐡']), ('𐑊', &['𐐢']), ('𐑋', &['𐐣']), ('𐑌', &['𐐤']), ('𐑍', &['𐐥']), ('𐑎', &['𐐦']), ('𐑏', &['𐐧']), ('𐒰', &['𐓘']), ('𐒱', &['𐓙']), ('𐒲', &['𐓚']), ('𐒳', &['𐓛']), ('𐒴', &['𐓜']), ('𐒵', &['𐓝']), ('𐒶', &['𐓞']), ('𐒷', &['𐓟']), ('𐒸', &['𐓠']), ('𐒹', &['𐓡']), ('𐒺', &['𐓢']), ('𐒻', &['𐓣']), ('𐒼', &['𐓤']), ('𐒽', &['𐓥']), ('𐒾', &['𐓦']), ('𐒿', &['𐓧']), ('𐓀', &['𐓨']), ('𐓁', &['𐓩']), ('𐓂', &['𐓪']), ('𐓃', &['𐓫']), ('𐓄', &['𐓬']), ('𐓅', &['𐓭']), ('𐓆', &['𐓮']), ('𐓇', &['𐓯']), ('𐓈', &['𐓰']), ('𐓉', &['𐓱']), ('𐓊', &['𐓲']), ('𐓋', &['𐓳']), ('𐓌', &['𐓴']), ('𐓍', &['𐓵']), ('𐓎', &['𐓶']), ('𐓏', &['𐓷']), ('𐓐', &['𐓸']), ('𐓑', &['𐓹']), ('𐓒', &['𐓺']), ('𐓓', &['𐓻']), ('𐓘', &['𐒰']), ('𐓙', &['𐒱']), ('𐓚', &['𐒲']), ('𐓛', &['𐒳']), ('𐓜', &['𐒴']), ('𐓝', &['𐒵']), ('𐓞', &['𐒶']), ('𐓟', &['𐒷']), ('𐓠', &['𐒸']), ('𐓡', &['𐒹']), ('𐓢', &['𐒺']), ('𐓣', &['𐒻']), ('𐓤', &['𐒼']), ('𐓥', 
&['𐒽']), ('𐓦', &['𐒾']), ('𐓧', &['𐒿']), ('𐓨', &['𐓀']), ('𐓩', &['𐓁']), ('𐓪', &['𐓂']), ('𐓫', &['𐓃']), ('𐓬', &['𐓄']), ('𐓭', &['𐓅']), ('𐓮', &['𐓆']), ('𐓯', &['𐓇']), ('𐓰', &['𐓈']), ('𐓱', &['𐓉']), ('𐓲', &['𐓊']), ('𐓳', &['𐓋']), ('𐓴', &['𐓌']), ('𐓵', &['𐓍']), ('𐓶', &['𐓎']), ('𐓷', &['𐓏']), ('𐓸', &['𐓐']), ('𐓹', &['𐓑']), ('𐓺', &['𐓒']), ('𐓻', &['𐓓']), ('𐕰', &['𐖗']), ('𐕱', &['𐖘']), ('𐕲', &['𐖙']), ('𐕳', &['𐖚']), ('𐕴', &['𐖛']), ('𐕵', &['𐖜']), ('𐕶', &['𐖝']), ('𐕷', &['𐖞']), ('𐕸', &['𐖟']), ('𐕹', &['𐖠']), ('𐕺', &['𐖡']), ('𐕼', &['𐖣']), ('𐕽', &['𐖤']), ('𐕾', &['𐖥']), ('𐕿', &['𐖦']), ('𐖀', &['𐖧']), ('𐖁', &['𐖨']), ('𐖂', &['𐖩']), ('𐖃', &['𐖪']), ('𐖄', &['𐖫']), ('𐖅', &['𐖬']), ('𐖆', &['𐖭']), ('𐖇', &['𐖮']), ('𐖈', &['𐖯']), ('𐖉', &['𐖰']), ('𐖊', &['𐖱']), ('𐖌', &['𐖳']), ('𐖍', &['𐖴']), ('𐖎', &['𐖵']), ('𐖏', &['𐖶']), ('𐖐', &['𐖷']), ('𐖑', &['𐖸']), ('𐖒', &['𐖹']), ('𐖔', &['𐖻']), ('𐖕', &['𐖼']), ('𐖗', &['𐕰']), ('𐖘', &['𐕱']), ('𐖙', &['𐕲']), ('𐖚', &['𐕳']), ('𐖛', &['𐕴']), ('𐖜', &['𐕵']), ('𐖝', &['𐕶']), ('𐖞', &['𐕷']), ('𐖟', &['𐕸']), ('𐖠', &['𐕹']), ('𐖡', &['𐕺']), ('𐖣', &['𐕼']), ('𐖤', &['𐕽']), ('𐖥', &['𐕾']), ('𐖦', &['𐕿']), ('𐖧', &['𐖀']), ('𐖨', &['𐖁']), ('𐖩', &['𐖂']), ('𐖪', &['𐖃']), ('𐖫', &['𐖄']), ('𐖬', &['𐖅']), ('𐖭', &['𐖆']), ('𐖮', &['𐖇']), ('𐖯', &['𐖈']), ('𐖰', &['𐖉']), ('𐖱', &['𐖊']), ('𐖳', &['𐖌']), ('𐖴', &['𐖍']), ('𐖵', &['𐖎']), ('𐖶', &['𐖏']), ('𐖷', &['𐖐']), ('𐖸', &['𐖑']), ('𐖹', &['𐖒']), ('𐖻', &['𐖔']), ('𐖼', &['𐖕']), ('𐲀', &['𐳀']), ('𐲁', &['𐳁']), ('𐲂', &['𐳂']), ('𐲃', &['𐳃']), ('𐲄', &['𐳄']), ('𐲅', &['𐳅']), ('𐲆', &['𐳆']), ('𐲇', &['𐳇']), ('𐲈', &['𐳈']), ('𐲉', &['𐳉']), ('𐲊', &['𐳊']), ('𐲋', &['𐳋']), ('𐲌', &['𐳌']), ('𐲍', &['𐳍']), ('𐲎', &['𐳎']), ('𐲏', &['𐳏']), ('𐲐', &['𐳐']), ('𐲑', &['𐳑']), ('𐲒', &['𐳒']), ('𐲓', &['𐳓']), ('𐲔', &['𐳔']), ('𐲕', &['𐳕']), ('𐲖', &['𐳖']), ('𐲗', &['𐳗']), ('𐲘', &['𐳘']), ('𐲙', &['𐳙']), ('𐲚', &['𐳚']), ('𐲛', &['𐳛']), ('𐲜', &['𐳜']), ('𐲝', &['𐳝']), ('𐲞', &['𐳞']), ('𐲟', &['𐳟']), ('𐲠', &['𐳠']), ('𐲡', &['𐳡']), ('𐲢', &['𐳢']), ('𐲣', &['𐳣']), ('𐲤', &['𐳤']), ('𐲥', &['𐳥']), ('𐲦', &['𐳦']), ('𐲧', &['𐳧']), ('𐲨', 
&['𐳨']), ('𐲩', &['𐳩']), ('𐲪', &['𐳪']), ('𐲫', &['𐳫']), ('𐲬', &['𐳬']), ('𐲭', &['𐳭']), ('𐲮', &['𐳮']), ('𐲯', &['𐳯']), ('𐲰', &['𐳰']), ('𐲱', &['𐳱']), ('𐲲', &['𐳲']), ('𐳀', &['𐲀']), ('𐳁', &['𐲁']), ('𐳂', &['𐲂']), ('𐳃', &['𐲃']), ('𐳄', &['𐲄']), ('𐳅', &['𐲅']), ('𐳆', &['𐲆']), ('𐳇', &['𐲇']), ('𐳈', &['𐲈']), ('𐳉', &['𐲉']), ('𐳊', &['𐲊']), ('𐳋', &['𐲋']), ('𐳌', &['𐲌']), ('𐳍', &['𐲍']), ('𐳎', &['𐲎']), ('𐳏', &['𐲏']), ('𐳐', &['𐲐']), ('𐳑', &['𐲑']), ('𐳒', &['𐲒']), ('𐳓', &['𐲓']), ('𐳔', &['𐲔']), ('𐳕', &['𐲕']), ('𐳖', &['𐲖']), ('𐳗', &['𐲗']), ('𐳘', &['𐲘']), ('𐳙', &['𐲙']), ('𐳚', &['𐲚']), ('𐳛', &['𐲛']), ('𐳜', &['𐲜']), ('𐳝', &['𐲝']), ('𐳞', &['𐲞']), ('𐳟', &['𐲟']), ('𐳠', &['𐲠']), ('𐳡', &['𐲡']), ('𐳢', &['𐲢']), ('𐳣', &['𐲣']), ('𐳤', &['𐲤']), ('𐳥', &['𐲥']), ('𐳦', &['𐲦']), ('𐳧', &['𐲧']), ('𐳨', &['𐲨']), ('𐳩', &['𐲩']), ('𐳪', &['𐲪']), ('𐳫', &['𐲫']), ('𐳬', &['𐲬']), ('𐳭', &['𐲭']), ('𐳮', &['𐲮']), ('𐳯', &['𐲯']), ('𐳰', &['𐲰']), ('𐳱', &['𐲱']), ('𐳲', &['𐲲']), ('𑢠', &['𑣀']), ('𑢡', &['𑣁']), ('𑢢', &['𑣂']), ('𑢣', &['𑣃']), ('𑢤', &['𑣄']), ('𑢥', &['𑣅']), ('𑢦', &['𑣆']), ('𑢧', &['𑣇']), ('𑢨', &['𑣈']), ('𑢩', &['𑣉']), ('𑢪', &['𑣊']), ('𑢫', &['𑣋']), ('𑢬', &['𑣌']), ('𑢭', &['𑣍']), ('𑢮', &['𑣎']), ('𑢯', &['𑣏']), ('𑢰', &['𑣐']), ('𑢱', &['𑣑']), ('𑢲', &['𑣒']), ('𑢳', &['𑣓']), ('𑢴', &['𑣔']), ('𑢵', &['𑣕']), ('𑢶', &['𑣖']), ('𑢷', &['𑣗']), ('𑢸', &['𑣘']), ('𑢹', &['𑣙']), ('𑢺', &['𑣚']), ('𑢻', &['𑣛']), ('𑢼', &['𑣜']), ('𑢽', &['𑣝']), ('𑢾', &['𑣞']), ('𑢿', &['𑣟']), ('𑣀', &['𑢠']), ('𑣁', &['𑢡']), ('𑣂', &['𑢢']), ('𑣃', &['𑢣']), ('𑣄', &['𑢤']), ('𑣅', &['𑢥']), ('𑣆', &['𑢦']), ('𑣇', &['𑢧']), ('𑣈', &['𑢨']), ('𑣉', &['𑢩']), ('𑣊', &['𑢪']), ('𑣋', &['𑢫']), ('𑣌', &['𑢬']), ('𑣍', &['𑢭']), ('𑣎', &['𑢮']), ('𑣏', &['𑢯']), ('𑣐', &['𑢰']), ('𑣑', &['𑢱']), ('𑣒', &['𑢲']), ('𑣓', &['𑢳']), ('𑣔', &['𑢴']), ('𑣕', &['𑢵']), ('𑣖', &['𑢶']), ('𑣗', &['𑢷']), ('𑣘', &['𑢸']), ('𑣙', &['𑢹']), ('𑣚', &['𑢺']), ('𑣛', &['𑢻']), ('𑣜', &['𑢼']), ('𑣝', &['𑢽']), ('𑣞', &['𑢾']), ('𑣟', &['𑢿']), ('𖹀', &['𖹠']), ('𖹁', &['𖹡']), ('𖹂', &['𖹢']), ('𖹃', &['𖹣']), ('𖹄', &['𖹤']), ('𖹅', &['𖹥']), ('𖹆', &['𖹦']), ('𖹇', 
&['𖹧']), ('𖹈', &['𖹨']), ('𖹉', &['𖹩']), ('𖹊', &['𖹪']), ('𖹋', &['𖹫']), ('𖹌', &['𖹬']), ('𖹍', &['𖹭']), ('𖹎', &['𖹮']), ('𖹏', &['𖹯']), ('𖹐', &['𖹰']), ('𖹑', &['𖹱']), ('𖹒', &['𖹲']), ('𖹓', &['𖹳']), ('𖹔', &['𖹴']), ('𖹕', &['𖹵']), ('𖹖', &['𖹶']), ('𖹗', &['𖹷']), ('𖹘', &['𖹸']), ('𖹙', &['𖹹']), ('𖹚', &['𖹺']), ('𖹛', &['𖹻']), ('𖹜', &['𖹼']), ('𖹝', &['𖹽']), ('𖹞', &['𖹾']), ('𖹟', &['𖹿']), ('𖹠', &['𖹀']), ('𖹡', &['𖹁']), ('𖹢', &['𖹂']), ('𖹣', &['𖹃']), ('𖹤', &['𖹄']), ('𖹥', &['𖹅']), ('𖹦', &['𖹆']), ('𖹧', &['𖹇']), ('𖹨', &['𖹈']), ('𖹩', &['𖹉']), ('𖹪', &['𖹊']), ('𖹫', &['𖹋']), ('𖹬', &['𖹌']), ('𖹭', &['𖹍']), ('𖹮', &['𖹎']), ('𖹯', &['𖹏']), ('𖹰', &['𖹐']), ('𖹱', &['𖹑']), ('𖹲', &['𖹒']), ('𖹳', &['𖹓']), ('𖹴', &['𖹔']), ('𖹵', &['𖹕']), ('𖹶', &['𖹖']), ('𖹷', &['𖹗']), ('𖹸', &['𖹘']), ('𖹹', &['𖹙']), ('𖹺', &['𖹚']), ('𖹻', &['𖹛']), ('𖹼', &['𖹜']), ('𖹽', &['𖹝']), ('𖹾', &['𖹞']), ('𖹿', &['𖹟']), ('𞤀', &['𞤢']), ('𞤁', &['𞤣']), ('𞤂', &['𞤤']), ('𞤃', &['𞤥']), ('𞤄', &['𞤦']), ('𞤅', &['𞤧']), ('𞤆', &['𞤨']), ('𞤇', &['𞤩']), ('𞤈', &['𞤪']), ('𞤉', &['𞤫']), ('𞤊', &['𞤬']), ('𞤋', &['𞤭']), ('𞤌', &['𞤮']), ('𞤍', &['𞤯']), ('𞤎', &['𞤰']), ('𞤏', &['𞤱']), ('𞤐', &['𞤲']), ('𞤑', &['𞤳']), ('𞤒', &['𞤴']), ('𞤓', &['𞤵']), ('𞤔', &['𞤶']), ('𞤕', &['𞤷']), ('𞤖', &['𞤸']), ('𞤗', &['𞤹']), ('𞤘', &['𞤺']), ('𞤙', &['𞤻']), ('𞤚', &['𞤼']), ('𞤛', &['𞤽']), ('𞤜', &['𞤾']), ('𞤝', &['𞤿']), ('𞤞', &['𞥀']), ('𞤟', &['𞥁']), ('𞤠', &['𞥂']), ('𞤡', &['𞥃']), ('𞤢', &['𞤀']), ('𞤣', &['𞤁']), ('𞤤', &['𞤂']), ('𞤥', &['𞤃']), ('𞤦', &['𞤄']), ('𞤧', &['𞤅']), ('𞤨', &['𞤆']), ('𞤩', &['𞤇']), ('𞤪', &['𞤈']), ('𞤫', &['𞤉']), ('𞤬', &['𞤊']), ('𞤭', &['𞤋']), ('𞤮', &['𞤌']), ('𞤯', &['𞤍']), ('𞤰', &['𞤎']), ('𞤱', &['𞤏']), ('𞤲', &['𞤐']), ('𞤳', &['𞤑']), ('𞤴', &['𞤒']), ('𞤵', &['𞤓']), ('𞤶', &['𞤔']), ('𞤷', &['𞤕']), ('𞤸', &['𞤖']), ('𞤹', &['𞤗']), ('𞤺', &['𞤘']), ('𞤻', &['𞤙']), ('𞤼', &['𞤚']), ('𞤽', &['𞤛']), ('𞤾', &['𞤜']), ('𞤿', &['𞤝']), ('𞥀', &['𞤞']), ('𞥁', &['𞤟']), ('𞥂', &['𞤠']), ('𞥃', &['𞤡']), ]; regex-syntax-0.8.2/src/unicode_tables/general_category.rs000064400000000000000000004546071046102023000217400ustar 
00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate general-category ucd-15.0.0 --chars --exclude surrogate // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ ("Cased_Letter", CASED_LETTER), ("Close_Punctuation", CLOSE_PUNCTUATION), ("Connector_Punctuation", CONNECTOR_PUNCTUATION), ("Control", CONTROL), ("Currency_Symbol", CURRENCY_SYMBOL), ("Dash_Punctuation", DASH_PUNCTUATION), ("Decimal_Number", DECIMAL_NUMBER), ("Enclosing_Mark", ENCLOSING_MARK), ("Final_Punctuation", FINAL_PUNCTUATION), ("Format", FORMAT), ("Initial_Punctuation", INITIAL_PUNCTUATION), ("Letter", LETTER), ("Letter_Number", LETTER_NUMBER), ("Line_Separator", LINE_SEPARATOR), ("Lowercase_Letter", LOWERCASE_LETTER), ("Mark", MARK), ("Math_Symbol", MATH_SYMBOL), ("Modifier_Letter", MODIFIER_LETTER), ("Modifier_Symbol", MODIFIER_SYMBOL), ("Nonspacing_Mark", NONSPACING_MARK), ("Number", NUMBER), ("Open_Punctuation", OPEN_PUNCTUATION), ("Other", OTHER), ("Other_Letter", OTHER_LETTER), ("Other_Number", OTHER_NUMBER), ("Other_Punctuation", OTHER_PUNCTUATION), ("Other_Symbol", OTHER_SYMBOL), ("Paragraph_Separator", PARAGRAPH_SEPARATOR), ("Private_Use", PRIVATE_USE), ("Punctuation", PUNCTUATION), ("Separator", SEPARATOR), ("Space_Separator", SPACE_SEPARATOR), ("Spacing_Mark", SPACING_MARK), ("Symbol", SYMBOL), ("Titlecase_Letter", TITLECASE_LETTER), ("Unassigned", UNASSIGNED), ("Uppercase_Letter", UPPERCASE_LETTER), ]; pub const CASED_LETTER: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ƺ'), ('Ƽ', 'ƿ'), ('DŽ', 'ʓ'), ('ʕ', 'ʯ'), ('Ͱ', 'ͳ'), ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՠ', 'ֈ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჽ', 'ჿ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᴀ', 
'ᴫ'), ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶚ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℴ'), ('ℹ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ↄ', 'ↄ'), ('Ⰰ', 'ⱻ'), ('Ȿ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('Ꙁ', 'ꙭ'), ('Ꚁ', 'ꚛ'), ('Ꜣ', 'ꝯ'), ('ꝱ', 'ꞇ'), ('Ꞌ', 'ꞎ'), ('Ꞑ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('Ꟶ', 'ꟶ'), ('ꟺ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭠ', 'ꭨ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('A', 'Z'), ('a', 'z'), ('𐐀', '𐑏'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑢠', '𑣟'), ('𖹀', '𖹿'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝼀', '𝼉'), ('𝼋', '𝼞'), ('𝼥', '𝼪'), ('𞤀', '𞥃'), ]; pub const CLOSE_PUNCTUATION: &'static [(char, char)] = &[ (')', ')'), (']', ']'), ('}', '}'), ('༻', '༻'), ('༽', '༽'), ('᚜', '᚜'), ('⁆', '⁆'), ('⁾', '⁾'), ('₎', '₎'), ('⌉', '⌉'), ('⌋', '⌋'), ('〉', '〉'), ('❩', '❩'), ('❫', '❫'), ('❭', '❭'), ('❯', '❯'), ('❱', '❱'), ('❳', '❳'), ('❵', '❵'), ('⟆', '⟆'), ('⟧', '⟧'), ('⟩', '⟩'), ('⟫', '⟫'), ('⟭', '⟭'), ('⟯', '⟯'), ('⦄', '⦄'), ('⦆', '⦆'), ('⦈', '⦈'), ('⦊', '⦊'), ('⦌', '⦌'), ('⦎', '⦎'), ('⦐', '⦐'), ('⦒', '⦒'), ('⦔', '⦔'), ('⦖', '⦖'), ('⦘', '⦘'), ('⧙', '⧙'), ('⧛', '⧛'), ('⧽', '⧽'), ('⸣', '⸣'), ('⸥', '⸥'), ('⸧', '⸧'), ('⸩', '⸩'), ('⹖', '⹖'), ('⹘', '⹘'), ('⹚', '⹚'), ('⹜', '⹜'), ('〉', '〉'), ('》', '》'), ('」', '」'), ('』', 
'』'), ('】', '】'), ('〕', '〕'), ('〗', '〗'), ('〙', '〙'), ('〛', '〛'), ('〞', '〟'), ('﴾', '﴾'), ('︘', '︘'), ('︶', '︶'), ('︸', '︸'), ('︺', '︺'), ('︼', '︼'), ('︾', '︾'), ('﹀', '﹀'), ('﹂', '﹂'), ('﹄', '﹄'), ('﹈', '﹈'), ('﹚', '﹚'), ('﹜', '﹜'), ('﹞', '﹞'), (')', ')'), (']', ']'), ('}', '}'), ('⦆', '⦆'), ('」', '」'), ]; pub const CONNECTOR_PUNCTUATION: &'static [(char, char)] = &[ ('_', '_'), ('‿', '⁀'), ('⁔', '⁔'), ('︳', '︴'), ('﹍', '﹏'), ('_', '_'), ]; pub const CONTROL: &'static [(char, char)] = &[('\0', '\u{1f}'), ('\u{7f}', '\u{9f}')]; pub const CURRENCY_SYMBOL: &'static [(char, char)] = &[ ('$', '$'), ('¢', '¥'), ('֏', '֏'), ('؋', '؋'), ('߾', '߿'), ('৲', '৳'), ('৻', '৻'), ('૱', '૱'), ('௹', '௹'), ('฿', '฿'), ('៛', '៛'), ('₠', '⃀'), ('꠸', '꠸'), ('﷼', '﷼'), ('﹩', '﹩'), ('$', '$'), ('¢', '£'), ('¥', '₩'), ('𑿝', '𑿠'), ('𞋿', '𞋿'), ('𞲰', '𞲰'), ]; pub const DASH_PUNCTUATION: &'static [(char, char)] = &[ ('-', '-'), ('֊', '֊'), ('־', '־'), ('᐀', '᐀'), ('᠆', '᠆'), ('‐', '―'), ('⸗', '⸗'), ('⸚', '⸚'), ('⸺', '⸻'), ('⹀', '⹀'), ('⹝', '⹝'), ('〜', '〜'), ('〰', '〰'), ('゠', '゠'), ('︱', '︲'), ('﹘', '﹘'), ('﹣', '﹣'), ('-', '-'), ('𐺭', '𐺭'), ]; pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ ('0', '9'), ('٠', '٩'), ('۰', '۹'), ('߀', '߉'), ('०', '९'), ('০', '৯'), ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), ('௦', '௯'), ('౦', '౯'), ('೦', '೯'), ('൦', '൯'), ('෦', '෯'), ('๐', '๙'), ('໐', '໙'), ('༠', '༩'), ('၀', '၉'), ('႐', '႙'), ('០', '៩'), ('᠐', '᠙'), ('᥆', '᥏'), ('᧐', '᧙'), ('᪀', '᪉'), ('᪐', '᪙'), ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), ('᱐', '᱙'), ('꘠', '꘩'), ('꣐', '꣙'), ('꤀', '꤉'), ('꧐', '꧙'), ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), ('0', '9'), ('𐒠', '𐒩'), ('𐴰', '𐴹'), ('𑁦', '𑁯'), ('𑃰', '𑃹'), ('𑄶', '𑄿'), ('𑇐', '𑇙'), ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), ('𑙐', '𑙙'), ('𑛀', '𑛉'), ('𑜰', '𑜹'), ('𑣠', '𑣩'), ('𑥐', '𑥙'), ('𑱐', '𑱙'), ('𑵐', '𑵙'), ('𑶠', '𑶩'), ('𑽐', '𑽙'), ('𖩠', '𖩩'), ('𖫀', '𖫉'), ('𖭐', '𖭙'), ('𝟎', '𝟿'), ('𞅀', '𞅉'), ('𞋰', '𞋹'), ('𞓰', '𞓹'), ('𞥐', '𞥙'), ('🯰', '🯹'), ]; pub const ENCLOSING_MARK: &'static 
[(char, char)] = &[ ('\u{488}', '\u{489}'), ('\u{1abe}', '\u{1abe}'), ('\u{20dd}', '\u{20e0}'), ('\u{20e2}', '\u{20e4}'), ('\u{a670}', '\u{a672}'), ]; pub const FINAL_PUNCTUATION: &'static [(char, char)] = &[ ('»', '»'), ('’', '’'), ('”', '”'), ('›', '›'), ('⸃', '⸃'), ('⸅', '⸅'), ('⸊', '⸊'), ('⸍', '⸍'), ('⸝', '⸝'), ('⸡', '⸡'), ]; pub const FORMAT: &'static [(char, char)] = &[ ('\u{ad}', '\u{ad}'), ('\u{600}', '\u{605}'), ('\u{61c}', '\u{61c}'), ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), ('\u{890}', '\u{891}'), ('\u{8e2}', '\u{8e2}'), ('\u{180e}', '\u{180e}'), ('\u{200b}', '\u{200f}'), ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{2064}'), ('\u{2066}', '\u{206f}'), ('\u{feff}', '\u{feff}'), ('\u{fff9}', '\u{fffb}'), ('\u{110bd}', '\u{110bd}'), ('\u{110cd}', '\u{110cd}'), ('\u{13430}', '\u{1343f}'), ('\u{1bca0}', '\u{1bca3}'), ('\u{1d173}', '\u{1d17a}'), ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), ]; pub const INITIAL_PUNCTUATION: &'static [(char, char)] = &[ ('«', '«'), ('‘', '‘'), ('‛', '“'), ('‟', '‟'), ('‹', '‹'), ('⸂', '⸂'), ('⸄', '⸄'), ('⸉', '⸉'), ('⸌', '⸌'), ('⸜', '⸜'), ('⸠', '⸠'), ]; pub const LETTER: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('א', 'ת'), ('ׯ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('ࢠ', 'ࣉ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), 
('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೝ', 'ೞ'), ('ೠ', 'ೡ'), ('ೱ', 'ೲ'), ('ഄ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛱ', 'ᛸ'), ('ᜀ', 'ᜑ'), ('ᜟ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᪧ', 'ᪧ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭌ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᳺ', 'ᳺ'), ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), 
('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ↄ', 'ↄ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ⸯ', 'ⸯ'), ('々', '〆'), ('〱', '〵'), ('〻', '〼'), ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), ('ꚠ', 'ꛥ'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣾ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍀'), ('𐍂', '𐍉'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), 
('𐔰', '𐕣'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '𐴣'), ('𐺀', '𐺩'), ('𐺰', '𐺱'), ('𐼀', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '𐽅'), ('𐽰', '𐾁'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀃', '𑀷'), ('𑁱', '𑁲'), ('𑁵', '𑁵'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅄', '𑅄'), ('𑅇', '𑅇'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑈿', '𑉀'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑑟', '𑑡'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑚸', '𑚸'), ('𑜀', '𑜚'), ('𑝀', '𑝆'), ('𑠀', '𑠫'), ('𑢠', '𑣟'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤯'), ('𑤿', '𑤿'), ('𑥁', '𑥁'), ('𑦠', '𑦧'), ('𑦪', '𑧐'), ('𑧡', '𑧡'), ('𑧣', '𑧣'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪉'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶉'), ('𑶘', '𑶘'), ('𑻠', '𑻲'), ('𑼂', '𑼂'), ('𑼄', '𑼐'), ('𑼒', '𑼳'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('𓑁', '𓑆'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩰', '𖪾'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '𖿣'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', 
'𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('𞄀', '𞄬'), ('𞄷', '𞄽'), ('𞅎', '𞅎'), ('𞊐', '𞊭'), ('𞋀', '𞋫'), ('𞓐', '𞓫'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), ('𞥋', '𞥋'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const LETTER_NUMBER: &'static [(char, char)] = &[ ('ᛮ', 'ᛰ'), ('Ⅰ', 'ↂ'), ('ↅ', 'ↈ'), ('〇', '〇'), ('〡', '〩'), ('〸', '〺'), ('ꛦ', 'ꛯ'), ('𐅀', '𐅴'), ('𐍁', '𐍁'), ('𐍊', '𐍊'), ('𐏑', '𐏕'), ('𒐀', '𒑮'), ]; pub const LINE_SEPARATOR: &'static [(char, char)] = &[('\u{2028}', '\u{2028}')]; pub const LOWERCASE_LETTER: &'static [(char, char)] = &[ ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), ('ĵ', 'ĵ'), ('ķ', 'ĸ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), 
('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƍ'), ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƛ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƪ', 'ƫ'), ('ƭ', 'ƭ'), ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƺ'), ('ƽ', 'ƿ'), ('dž', 'dž'), ('lj', 'lj'), ('nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȡ', 'ȡ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), ('ȳ', 'ȹ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ʓ'), ('ʕ', 'ʯ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϼ'), ('а', 'џ'), ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), ('ҹ', 'ҹ'), 
('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ՠ', 'ֈ'), ('ა', 'ჺ'), ('ჽ', 'ჿ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᴫ'), ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶚ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẝ'), ('ẟ', 'ẟ'), ('ạ', 'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 
'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾇ'), ('ᾐ', 'ᾗ'), ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), ('ᾶ', 'ᾷ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ℊ', 'ℊ'), ('ℎ', 'ℏ'), ('ℓ', 'ℓ'), ('ℯ', 'ℯ'), ('ℴ', 'ℴ'), ('ℹ', 'ℹ'), ('ℼ', 'ℽ'), ('ⅆ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('ↄ', 'ↄ'), ('ⰰ', 'ⱟ'), ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), ('ⱱ', 'ⱱ'), ('ⱳ', 'ⱴ'), ('ⱶ', 'ⱻ'), ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳤ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚛ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜱ'), ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), 
('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝱ', 'ꝸ'), ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞎ', 'ꞎ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞕ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞯ', 'ꞯ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), ('ꞹ', 'ꞹ'), ('ꞻ', 'ꞻ'), ('ꞽ', 'ꞽ'), ('ꞿ', 'ꞿ'), ('ꟁ', 'ꟁ'), ('ꟃ', 'ꟃ'), ('ꟈ', 'ꟈ'), ('ꟊ', 'ꟊ'), ('ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟕ'), ('ꟗ', 'ꟗ'), ('ꟙ', 'ꟙ'), ('ꟶ', 'ꟶ'), ('ꟺ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭠ', 'ꭨ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𖹠', '𖹿'), ('𝐚', '𝐳'), ('𝑎', '𝑔'), ('𝑖', '𝑧'), ('𝒂', '𝒛'), ('𝒶', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝓏'), ('𝓪', '𝔃'), ('𝔞', '𝔷'), ('𝕒', '𝕫'), ('𝖆', '𝖟'), ('𝖺', '𝗓'), ('𝗮', '𝘇'), ('𝘢', '𝘻'), ('𝙖', '𝙯'), ('𝚊', '𝚥'), ('𝛂', '𝛚'), ('𝛜', '𝛡'), ('𝛼', '𝜔'), ('𝜖', '𝜛'), ('𝜶', '𝝎'), ('𝝐', '𝝕'), ('𝝰', '𝞈'), ('𝞊', '𝞏'), ('𝞪', '𝟂'), ('𝟄', '𝟉'), ('𝟋', '𝟋'), ('𝼀', '𝼉'), ('𝼋', '𝼞'), ('𝼥', '𝼪'), ('𞤢', '𞥃'), ]; pub const MARK: &'static [(char, char)] = &[ ('\u{300}', '\u{36f}'), ('\u{483}', '\u{489}'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('\u{610}', '\u{61a}'), ('\u{64b}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dc}'), ('\u{6df}', '\u{6e4}'), ('\u{6e7}', '\u{6e8}'), ('\u{6ea}', '\u{6ed}'), ('\u{711}', '\u{711}'), ('\u{730}', '\u{74a}'), ('\u{7a6}', '\u{7b0}'), ('\u{7eb}', '\u{7f3}'), ('\u{7fd}', '\u{7fd}'), ('\u{816}', '\u{819}'), ('\u{81b}', '\u{823}'), ('\u{825}', '\u{827}'), ('\u{829}', '\u{82d}'), ('\u{859}', '\u{85b}'), 
('\u{898}', '\u{89f}'), ('\u{8ca}', '\u{8e1}'), ('\u{8e3}', 'ः'), ('\u{93a}', '\u{93c}'), ('ा', 'ॏ'), ('\u{951}', '\u{957}'), ('\u{962}', '\u{963}'), ('\u{981}', 'ঃ'), ('\u{9bc}', '\u{9bc}'), ('\u{9be}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', '\u{9cd}'), ('\u{9d7}', '\u{9d7}'), ('\u{9e2}', '\u{9e3}'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', 'ਃ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('\u{a70}', '\u{a71}'), ('\u{a75}', '\u{a75}'), ('\u{a81}', 'ઃ'), ('\u{abc}', '\u{abc}'), ('ા', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('\u{ae2}', '\u{ae3}'), ('\u{afa}', '\u{aff}'), ('\u{b01}', 'ଃ'), ('\u{b3c}', '\u{b3c}'), ('\u{b3e}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('\u{b62}', '\u{b63}'), ('\u{b82}', '\u{b82}'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('\u{bd7}', '\u{bd7}'), ('\u{c00}', '\u{c04}'), ('\u{c3c}', '\u{c3c}'), ('\u{c3e}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('\u{c62}', '\u{c63}'), ('\u{c81}', 'ಃ'), ('\u{cbc}', '\u{cbc}'), ('ಾ', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('\u{ce2}', '\u{ce3}'), ('ೳ', 'ೳ'), ('\u{d00}', 'ഃ'), ('\u{d3b}', '\u{d3c}'), ('\u{d3e}', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', '\u{d4d}'), ('\u{d57}', '\u{d57}'), ('\u{d62}', '\u{d63}'), ('\u{d81}', 'ඃ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('ෲ', 'ෳ'), ('\u{e31}', '\u{e31}'), ('\u{e34}', '\u{e3a}'), ('\u{e47}', '\u{e4e}'), ('\u{eb1}', '\u{eb1}'), ('\u{eb4}', '\u{ebc}'), ('\u{ec8}', '\u{ece}'), ('\u{f18}', '\u{f19}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('༾', '༿'), ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f87}'), ('\u{f8d}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('ါ', '\u{103e}'), ('ၖ', '\u{1059}'), ('\u{105e}', '\u{1060}'), ('ၢ', 'ၤ'), ('ၧ', 'ၭ'), ('\u{1071}', '\u{1074}'), ('\u{1082}', '\u{108d}'), ('ႏ', 'ႏ'), ('ႚ', 
'\u{109d}'), ('\u{135d}', '\u{135f}'), ('\u{1712}', '᜕'), ('\u{1732}', '᜴'), ('\u{1752}', '\u{1753}'), ('\u{1772}', '\u{1773}'), ('\u{17b4}', '\u{17d3}'), ('\u{17dd}', '\u{17dd}'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '\u{180f}'), ('\u{1885}', '\u{1886}'), ('\u{18a9}', '\u{18a9}'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('\u{1a17}', '\u{1a1b}'), ('ᩕ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a7f}'), ('\u{1ab0}', '\u{1ace}'), ('\u{1b00}', 'ᬄ'), ('\u{1b34}', '᭄'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', 'ᮂ'), ('ᮡ', '\u{1bad}'), ('\u{1be6}', '᯳'), ('ᰤ', '\u{1c37}'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('᳷', '\u{1cf9}'), ('\u{1dc0}', '\u{1dff}'), ('\u{20d0}', '\u{20f0}'), ('\u{2cef}', '\u{2cf1}'), ('\u{2d7f}', '\u{2d7f}'), ('\u{2de0}', '\u{2dff}'), ('\u{302a}', '\u{302f}'), ('\u{3099}', '\u{309a}'), ('\u{a66f}', '\u{a672}'), ('\u{a674}', '\u{a67d}'), ('\u{a69e}', '\u{a69f}'), ('\u{a6f0}', '\u{a6f1}'), ('\u{a802}', '\u{a802}'), ('\u{a806}', '\u{a806}'), ('\u{a80b}', '\u{a80b}'), ('ꠣ', 'ꠧ'), ('\u{a82c}', '\u{a82c}'), ('ꢀ', 'ꢁ'), ('ꢴ', '\u{a8c5}'), ('\u{a8e0}', '\u{a8f1}'), ('\u{a8ff}', '\u{a8ff}'), ('\u{a926}', '\u{a92d}'), ('\u{a947}', '꥓'), ('\u{a980}', 'ꦃ'), ('\u{a9b3}', '꧀'), ('\u{a9e5}', '\u{a9e5}'), ('\u{aa29}', '\u{aa36}'), ('\u{aa43}', '\u{aa43}'), ('\u{aa4c}', 'ꩍ'), ('ꩻ', 'ꩽ'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabf}'), ('\u{aac1}', '\u{aac1}'), ('ꫫ', 'ꫯ'), ('ꫵ', '\u{aaf6}'), ('ꯣ', 'ꯪ'), ('꯬', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('\u{101fd}', '\u{101fd}'), ('\u{102e0}', '\u{102e0}'), ('\u{10376}', '\u{1037a}'), ('\u{10a01}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '\u{10a0f}'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10ae5}', '\u{10ae6}'), ('\u{10d24}', '\u{10d27}'), ('\u{10eab}', '\u{10eac}'), ('\u{10efd}', '\u{10eff}'), 
('\u{10f46}', '\u{10f50}'), ('\u{10f82}', '\u{10f85}'), ('𑀀', '𑀂'), ('\u{11038}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{11073}', '\u{11074}'), ('\u{1107f}', '𑂂'), ('𑂰', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('\u{11100}', '\u{11102}'), ('\u{11127}', '\u{11134}'), ('𑅅', '𑅆'), ('\u{11173}', '\u{11173}'), ('\u{11180}', '𑆂'), ('𑆳', '𑇀'), ('\u{111c9}', '\u{111cc}'), ('𑇎', '\u{111cf}'), ('𑈬', '\u{11237}'), ('\u{1123e}', '\u{1123e}'), ('\u{11241}', '\u{11241}'), ('\u{112df}', '\u{112ea}'), ('\u{11300}', '𑌃'), ('\u{1133b}', '\u{1133c}'), ('\u{1133e}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('\u{11357}', '\u{11357}'), ('𑍢', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑐵', '\u{11446}'), ('\u{1145e}', '\u{1145e}'), ('\u{114b0}', '\u{114c3}'), ('\u{115af}', '\u{115b5}'), ('𑖸', '\u{115c0}'), ('\u{115dc}', '\u{115dd}'), ('𑘰', '\u{11640}'), ('\u{116ab}', '\u{116b7}'), ('\u{1171d}', '\u{1172b}'), ('𑠬', '\u{1183a}'), ('\u{11930}', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{1193e}'), ('𑥀', '𑥀'), ('𑥂', '\u{11943}'), ('𑧑', '\u{119d7}'), ('\u{119da}', '\u{119e0}'), ('𑧤', '𑧤'), ('\u{11a01}', '\u{11a0a}'), ('\u{11a33}', '𑨹'), ('\u{11a3b}', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a51}', '\u{11a5b}'), ('\u{11a8a}', '\u{11a99}'), ('𑰯', '\u{11c36}'), ('\u{11c38}', '\u{11c3f}'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('\u{11d31}', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d45}'), ('\u{11d47}', '\u{11d47}'), ('𑶊', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '\u{11d97}'), ('\u{11ef3}', '𑻶'), ('\u{11f00}', '\u{11f01}'), ('𑼃', '𑼃'), ('𑼴', '\u{11f3a}'), ('𑼾', '\u{11f42}'), ('\u{13440}', '\u{13440}'), ('\u{13447}', '\u{13455}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b30}', '\u{16b36}'), ('\u{16f4f}', '\u{16f4f}'), ('𖽑', '𖾇'), ('\u{16f8f}', '\u{16f92}'), ('\u{16fe4}', '\u{16fe4}'), ('𖿰', '𖿱'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d169}'), ('𝅭', '\u{1d172}'), 
('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('\u{1e08f}', '\u{1e08f}'), ('\u{1e130}', '\u{1e136}'), ('\u{1e2ae}', '\u{1e2ae}'), ('\u{1e2ec}', '\u{1e2ef}'), ('\u{1e4ec}', '\u{1e4ef}'), ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1e944}', '\u{1e94a}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const MATH_SYMBOL: &'static [(char, char)] = &[ ('+', '+'), ('<', '>'), ('|', '|'), ('~', '~'), ('¬', '¬'), ('±', '±'), ('×', '×'), ('÷', '÷'), ('϶', '϶'), ('؆', '؈'), ('⁄', '⁄'), ('⁒', '⁒'), ('⁺', '⁼'), ('₊', '₌'), ('℘', '℘'), ('⅀', '⅄'), ('⅋', '⅋'), ('←', '↔'), ('↚', '↛'), ('↠', '↠'), ('↣', '↣'), ('↦', '↦'), ('↮', '↮'), ('⇎', '⇏'), ('⇒', '⇒'), ('⇔', '⇔'), ('⇴', '⋿'), ('⌠', '⌡'), ('⍼', '⍼'), ('⎛', '⎳'), ('⏜', '⏡'), ('▷', '▷'), ('◁', '◁'), ('◸', '◿'), ('♯', '♯'), ('⟀', '⟄'), ('⟇', '⟥'), ('⟰', '⟿'), ('⤀', '⦂'), ('⦙', '⧗'), ('⧜', '⧻'), ('⧾', '⫿'), ('⬰', '⭄'), ('⭇', '⭌'), ('﬩', '﬩'), ('﹢', '﹢'), ('﹤', '﹦'), ('+', '+'), ('<', '>'), ('|', '|'), ('~', '~'), ('¬', '¬'), ('←', '↓'), ('𝛁', '𝛁'), ('𝛛', '𝛛'), ('𝛻', '𝛻'), ('𝜕', '𝜕'), ('𝜵', '𝜵'), ('𝝏', '𝝏'), ('𝝯', '𝝯'), ('𝞉', '𝞉'), ('𝞩', '𝞩'), ('𝟃', '𝟃'), ('𞻰', '𞻱'), ]; pub const MODIFIER_LETTER: &'static [(char, char)] = &[ ('ʰ', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('ʹ', 'ʹ'), ('ͺ', 'ͺ'), ('ՙ', 'ՙ'), ('ـ', 'ـ'), ('ۥ', 'ۦ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࣉ', 'ࣉ'), ('ॱ', 'ॱ'), ('ๆ', 'ๆ'), ('ໆ', 'ໆ'), ('ჼ', 'ჼ'), ('ៗ', 'ៗ'), ('ᡃ', 'ᡃ'), ('ᪧ', 'ᪧ'), ('ᱸ', 'ᱽ'), ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', 'ᶿ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ⱼ', 'ⱽ'), ('ⵯ', 'ⵯ'), ('ⸯ', 'ⸯ'), ('々', '々'), ('〱', '〵'), ('〻', '〻'), ('ゝ', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), ('ꓸ', 
'ꓽ'), ('ꘌ', 'ꘌ'), ('ꙿ', 'ꙿ'), ('ꚜ', 'ꚝ'), ('ꜗ', 'ꜟ'), ('ꝰ', 'ꝰ'), ('ꞈ', 'ꞈ'), ('ꟲ', 'ꟴ'), ('ꟸ', 'ꟹ'), ('ꧏ', 'ꧏ'), ('ꧦ', 'ꧦ'), ('ꩰ', 'ꩰ'), ('ꫝ', 'ꫝ'), ('ꫳ', 'ꫴ'), ('ꭜ', 'ꭟ'), ('ꭩ', 'ꭩ'), ('ー', 'ー'), ('\u{ff9e}', '\u{ff9f}'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𖭀', '𖭃'), ('𖾓', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '𖿣'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𞀰', '𞁭'), ('𞄷', '𞄽'), ('𞓫', '𞓫'), ('𞥋', '𞥋'), ]; pub const MODIFIER_SYMBOL: &'static [(char, char)] = &[ ('^', '^'), ('`', '`'), ('¨', '¨'), ('¯', '¯'), ('´', '´'), ('¸', '¸'), ('˂', '˅'), ('˒', '˟'), ('˥', '˫'), ('˭', '˭'), ('˯', '˿'), ('͵', '͵'), ('΄', '΅'), ('࢈', '࢈'), ('᾽', '᾽'), ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), ('῭', '`'), ('´', '῾'), ('゛', '゜'), ('꜀', '꜖'), ('꜠', '꜡'), ('꞉', '꞊'), ('꭛', '꭛'), ('꭪', '꭫'), ('﮲', '﯂'), ('^', '^'), ('`', '`'), (' ̄', ' ̄'), ('🏻', '🏿'), ]; pub const NONSPACING_MARK: &'static [(char, char)] = &[ ('\u{300}', '\u{36f}'), ('\u{483}', '\u{487}'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('\u{610}', '\u{61a}'), ('\u{64b}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dc}'), ('\u{6df}', '\u{6e4}'), ('\u{6e7}', '\u{6e8}'), ('\u{6ea}', '\u{6ed}'), ('\u{711}', '\u{711}'), ('\u{730}', '\u{74a}'), ('\u{7a6}', '\u{7b0}'), ('\u{7eb}', '\u{7f3}'), ('\u{7fd}', '\u{7fd}'), ('\u{816}', '\u{819}'), ('\u{81b}', '\u{823}'), ('\u{825}', '\u{827}'), ('\u{829}', '\u{82d}'), ('\u{859}', '\u{85b}'), ('\u{898}', '\u{89f}'), ('\u{8ca}', '\u{8e1}'), ('\u{8e3}', '\u{902}'), ('\u{93a}', '\u{93a}'), ('\u{93c}', '\u{93c}'), ('\u{941}', '\u{948}'), ('\u{94d}', '\u{94d}'), ('\u{951}', '\u{957}'), ('\u{962}', '\u{963}'), ('\u{981}', '\u{981}'), ('\u{9bc}', '\u{9bc}'), ('\u{9c1}', '\u{9c4}'), ('\u{9cd}', '\u{9cd}'), ('\u{9e2}', '\u{9e3}'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', '\u{a02}'), ('\u{a3c}', '\u{a3c}'), ('\u{a41}', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('\u{a70}', '\u{a71}'), ('\u{a75}', 
'\u{a75}'), ('\u{a81}', '\u{a82}'), ('\u{abc}', '\u{abc}'), ('\u{ac1}', '\u{ac5}'), ('\u{ac7}', '\u{ac8}'), ('\u{acd}', '\u{acd}'), ('\u{ae2}', '\u{ae3}'), ('\u{afa}', '\u{aff}'), ('\u{b01}', '\u{b01}'), ('\u{b3c}', '\u{b3c}'), ('\u{b3f}', '\u{b3f}'), ('\u{b41}', '\u{b44}'), ('\u{b4d}', '\u{b4d}'), ('\u{b55}', '\u{b56}'), ('\u{b62}', '\u{b63}'), ('\u{b82}', '\u{b82}'), ('\u{bc0}', '\u{bc0}'), ('\u{bcd}', '\u{bcd}'), ('\u{c00}', '\u{c00}'), ('\u{c04}', '\u{c04}'), ('\u{c3c}', '\u{c3c}'), ('\u{c3e}', '\u{c40}'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('\u{c62}', '\u{c63}'), ('\u{c81}', '\u{c81}'), ('\u{cbc}', '\u{cbc}'), ('\u{cbf}', '\u{cbf}'), ('\u{cc6}', '\u{cc6}'), ('\u{ccc}', '\u{ccd}'), ('\u{ce2}', '\u{ce3}'), ('\u{d00}', '\u{d01}'), ('\u{d3b}', '\u{d3c}'), ('\u{d41}', '\u{d44}'), ('\u{d4d}', '\u{d4d}'), ('\u{d62}', '\u{d63}'), ('\u{d81}', '\u{d81}'), ('\u{dca}', '\u{dca}'), ('\u{dd2}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('\u{e31}', '\u{e31}'), ('\u{e34}', '\u{e3a}'), ('\u{e47}', '\u{e4e}'), ('\u{eb1}', '\u{eb1}'), ('\u{eb4}', '\u{ebc}'), ('\u{ec8}', '\u{ece}'), ('\u{f18}', '\u{f19}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('\u{f71}', '\u{f7e}'), ('\u{f80}', '\u{f84}'), ('\u{f86}', '\u{f87}'), ('\u{f8d}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('\u{102d}', '\u{1030}'), ('\u{1032}', '\u{1037}'), ('\u{1039}', '\u{103a}'), ('\u{103d}', '\u{103e}'), ('\u{1058}', '\u{1059}'), ('\u{105e}', '\u{1060}'), ('\u{1071}', '\u{1074}'), ('\u{1082}', '\u{1082}'), ('\u{1085}', '\u{1086}'), ('\u{108d}', '\u{108d}'), ('\u{109d}', '\u{109d}'), ('\u{135d}', '\u{135f}'), ('\u{1712}', '\u{1714}'), ('\u{1732}', '\u{1733}'), ('\u{1752}', '\u{1753}'), ('\u{1772}', '\u{1773}'), ('\u{17b4}', '\u{17b5}'), ('\u{17b7}', '\u{17bd}'), ('\u{17c6}', '\u{17c6}'), ('\u{17c9}', '\u{17d3}'), ('\u{17dd}', '\u{17dd}'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '\u{180f}'), ('\u{1885}', '\u{1886}'), ('\u{18a9}', 
'\u{18a9}'), ('\u{1920}', '\u{1922}'), ('\u{1927}', '\u{1928}'), ('\u{1932}', '\u{1932}'), ('\u{1939}', '\u{193b}'), ('\u{1a17}', '\u{1a18}'), ('\u{1a1b}', '\u{1a1b}'), ('\u{1a56}', '\u{1a56}'), ('\u{1a58}', '\u{1a5e}'), ('\u{1a60}', '\u{1a60}'), ('\u{1a62}', '\u{1a62}'), ('\u{1a65}', '\u{1a6c}'), ('\u{1a73}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a7f}'), ('\u{1ab0}', '\u{1abd}'), ('\u{1abf}', '\u{1ace}'), ('\u{1b00}', '\u{1b03}'), ('\u{1b34}', '\u{1b34}'), ('\u{1b36}', '\u{1b3a}'), ('\u{1b3c}', '\u{1b3c}'), ('\u{1b42}', '\u{1b42}'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '\u{1b81}'), ('\u{1ba2}', '\u{1ba5}'), ('\u{1ba8}', '\u{1ba9}'), ('\u{1bab}', '\u{1bad}'), ('\u{1be6}', '\u{1be6}'), ('\u{1be8}', '\u{1be9}'), ('\u{1bed}', '\u{1bed}'), ('\u{1bef}', '\u{1bf1}'), ('\u{1c2c}', '\u{1c33}'), ('\u{1c36}', '\u{1c37}'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1ce0}'), ('\u{1ce2}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('\u{1cf8}', '\u{1cf9}'), ('\u{1dc0}', '\u{1dff}'), ('\u{20d0}', '\u{20dc}'), ('\u{20e1}', '\u{20e1}'), ('\u{20e5}', '\u{20f0}'), ('\u{2cef}', '\u{2cf1}'), ('\u{2d7f}', '\u{2d7f}'), ('\u{2de0}', '\u{2dff}'), ('\u{302a}', '\u{302d}'), ('\u{3099}', '\u{309a}'), ('\u{a66f}', '\u{a66f}'), ('\u{a674}', '\u{a67d}'), ('\u{a69e}', '\u{a69f}'), ('\u{a6f0}', '\u{a6f1}'), ('\u{a802}', '\u{a802}'), ('\u{a806}', '\u{a806}'), ('\u{a80b}', '\u{a80b}'), ('\u{a825}', '\u{a826}'), ('\u{a82c}', '\u{a82c}'), ('\u{a8c4}', '\u{a8c5}'), ('\u{a8e0}', '\u{a8f1}'), ('\u{a8ff}', '\u{a8ff}'), ('\u{a926}', '\u{a92d}'), ('\u{a947}', '\u{a951}'), ('\u{a980}', '\u{a982}'), ('\u{a9b3}', '\u{a9b3}'), ('\u{a9b6}', '\u{a9b9}'), ('\u{a9bc}', '\u{a9bd}'), ('\u{a9e5}', '\u{a9e5}'), ('\u{aa29}', '\u{aa2e}'), ('\u{aa31}', '\u{aa32}'), ('\u{aa35}', '\u{aa36}'), ('\u{aa43}', '\u{aa43}'), ('\u{aa4c}', '\u{aa4c}'), ('\u{aa7c}', '\u{aa7c}'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabf}'), ('\u{aac1}', '\u{aac1}'), 
('\u{aaec}', '\u{aaed}'), ('\u{aaf6}', '\u{aaf6}'), ('\u{abe5}', '\u{abe5}'), ('\u{abe8}', '\u{abe8}'), ('\u{abed}', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('\u{101fd}', '\u{101fd}'), ('\u{102e0}', '\u{102e0}'), ('\u{10376}', '\u{1037a}'), ('\u{10a01}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '\u{10a0f}'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10ae5}', '\u{10ae6}'), ('\u{10d24}', '\u{10d27}'), ('\u{10eab}', '\u{10eac}'), ('\u{10efd}', '\u{10eff}'), ('\u{10f46}', '\u{10f50}'), ('\u{10f82}', '\u{10f85}'), ('\u{11001}', '\u{11001}'), ('\u{11038}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{11073}', '\u{11074}'), ('\u{1107f}', '\u{11081}'), ('\u{110b3}', '\u{110b6}'), ('\u{110b9}', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('\u{11100}', '\u{11102}'), ('\u{11127}', '\u{1112b}'), ('\u{1112d}', '\u{11134}'), ('\u{11173}', '\u{11173}'), ('\u{11180}', '\u{11181}'), ('\u{111b6}', '\u{111be}'), ('\u{111c9}', '\u{111cc}'), ('\u{111cf}', '\u{111cf}'), ('\u{1122f}', '\u{11231}'), ('\u{11234}', '\u{11234}'), ('\u{11236}', '\u{11237}'), ('\u{1123e}', '\u{1123e}'), ('\u{11241}', '\u{11241}'), ('\u{112df}', '\u{112df}'), ('\u{112e3}', '\u{112ea}'), ('\u{11300}', '\u{11301}'), ('\u{1133b}', '\u{1133c}'), ('\u{11340}', '\u{11340}'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('\u{11438}', '\u{1143f}'), ('\u{11442}', '\u{11444}'), ('\u{11446}', '\u{11446}'), ('\u{1145e}', '\u{1145e}'), ('\u{114b3}', '\u{114b8}'), ('\u{114ba}', '\u{114ba}'), ('\u{114bf}', '\u{114c0}'), ('\u{114c2}', '\u{114c3}'), ('\u{115b2}', '\u{115b5}'), ('\u{115bc}', '\u{115bd}'), ('\u{115bf}', '\u{115c0}'), ('\u{115dc}', '\u{115dd}'), ('\u{11633}', '\u{1163a}'), ('\u{1163d}', '\u{1163d}'), ('\u{1163f}', '\u{11640}'), ('\u{116ab}', '\u{116ab}'), ('\u{116ad}', '\u{116ad}'), ('\u{116b0}', '\u{116b5}'), ('\u{116b7}', '\u{116b7}'), ('\u{1171d}', '\u{1171f}'), ('\u{11722}', '\u{11725}'), ('\u{11727}', '\u{1172b}'), 
('\u{1182f}', '\u{11837}'), ('\u{11839}', '\u{1183a}'), ('\u{1193b}', '\u{1193c}'), ('\u{1193e}', '\u{1193e}'), ('\u{11943}', '\u{11943}'), ('\u{119d4}', '\u{119d7}'), ('\u{119da}', '\u{119db}'), ('\u{119e0}', '\u{119e0}'), ('\u{11a01}', '\u{11a0a}'), ('\u{11a33}', '\u{11a38}'), ('\u{11a3b}', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a51}', '\u{11a56}'), ('\u{11a59}', '\u{11a5b}'), ('\u{11a8a}', '\u{11a96}'), ('\u{11a98}', '\u{11a99}'), ('\u{11c30}', '\u{11c36}'), ('\u{11c38}', '\u{11c3d}'), ('\u{11c3f}', '\u{11c3f}'), ('\u{11c92}', '\u{11ca7}'), ('\u{11caa}', '\u{11cb0}'), ('\u{11cb2}', '\u{11cb3}'), ('\u{11cb5}', '\u{11cb6}'), ('\u{11d31}', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d45}'), ('\u{11d47}', '\u{11d47}'), ('\u{11d90}', '\u{11d91}'), ('\u{11d95}', '\u{11d95}'), ('\u{11d97}', '\u{11d97}'), ('\u{11ef3}', '\u{11ef4}'), ('\u{11f00}', '\u{11f01}'), ('\u{11f36}', '\u{11f3a}'), ('\u{11f40}', '\u{11f40}'), ('\u{11f42}', '\u{11f42}'), ('\u{13440}', '\u{13440}'), ('\u{13447}', '\u{13455}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b30}', '\u{16b36}'), ('\u{16f4f}', '\u{16f4f}'), ('\u{16f8f}', '\u{16f92}'), ('\u{16fe4}', '\u{16fe4}'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d167}', '\u{1d169}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('\u{1e08f}', '\u{1e08f}'), ('\u{1e130}', '\u{1e136}'), ('\u{1e2ae}', '\u{1e2ae}'), ('\u{1e2ec}', '\u{1e2ef}'), ('\u{1e4ec}', '\u{1e4ef}'), ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1e944}', '\u{1e94a}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const NUMBER: &'static [(char, 
char)] = &[ ('0', '9'), ('²', '³'), ('¹', '¹'), ('¼', '¾'), ('٠', '٩'), ('۰', '۹'), ('߀', '߉'), ('०', '९'), ('০', '৯'), ('৴', '৹'), ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), ('୲', '୷'), ('௦', '௲'), ('౦', '౯'), ('౸', '౾'), ('೦', '೯'), ('൘', '൞'), ('൦', '൸'), ('෦', '෯'), ('๐', '๙'), ('໐', '໙'), ('༠', '༳'), ('၀', '၉'), ('႐', '႙'), ('፩', '፼'), ('ᛮ', 'ᛰ'), ('០', '៩'), ('៰', '៹'), ('᠐', '᠙'), ('᥆', '᥏'), ('᧐', '᧚'), ('᪀', '᪉'), ('᪐', '᪙'), ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), ('᱐', '᱙'), ('⁰', '⁰'), ('⁴', '⁹'), ('₀', '₉'), ('⅐', 'ↂ'), ('ↅ', '↉'), ('①', '⒛'), ('⓪', '⓿'), ('❶', '➓'), ('⳽', '⳽'), ('〇', '〇'), ('〡', '〩'), ('〸', '〺'), ('㆒', '㆕'), ('㈠', '㈩'), ('㉈', '㉏'), ('㉑', '㉟'), ('㊀', '㊉'), ('㊱', '㊿'), ('꘠', '꘩'), ('ꛦ', 'ꛯ'), ('꠰', '꠵'), ('꣐', '꣙'), ('꤀', '꤉'), ('꧐', '꧙'), ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), ('0', '9'), ('𐄇', '𐄳'), ('𐅀', '𐅸'), ('𐆊', '𐆋'), ('𐋡', '𐋻'), ('𐌠', '𐌣'), ('𐍁', '𐍁'), ('𐍊', '𐍊'), ('𐏑', '𐏕'), ('𐒠', '𐒩'), ('𐡘', '𐡟'), ('𐡹', '𐡿'), ('𐢧', '𐢯'), ('𐣻', '𐣿'), ('𐤖', '𐤛'), ('𐦼', '𐦽'), ('𐧀', '𐧏'), ('𐧒', '𐧿'), ('𐩀', '𐩈'), ('𐩽', '𐩾'), ('𐪝', '𐪟'), ('𐫫', '𐫯'), ('𐭘', '𐭟'), ('𐭸', '𐭿'), ('𐮩', '𐮯'), ('𐳺', '𐳿'), ('𐴰', '𐴹'), ('𐹠', '𐹾'), ('𐼝', '𐼦'), ('𐽑', '𐽔'), ('𐿅', '𐿋'), ('𑁒', '𑁯'), ('𑃰', '𑃹'), ('𑄶', '𑄿'), ('𑇐', '𑇙'), ('𑇡', '𑇴'), ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), ('𑙐', '𑙙'), ('𑛀', '𑛉'), ('𑜰', '𑜻'), ('𑣠', '𑣲'), ('𑥐', '𑥙'), ('𑱐', '𑱬'), ('𑵐', '𑵙'), ('𑶠', '𑶩'), ('𑽐', '𑽙'), ('𑿀', '𑿔'), ('𒐀', '𒑮'), ('𖩠', '𖩩'), ('𖫀', '𖫉'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖺀', '𖺖'), ('𝋀', '𝋓'), ('𝋠', '𝋳'), ('𝍠', '𝍸'), ('𝟎', '𝟿'), ('𞅀', '𞅉'), ('𞋰', '𞋹'), ('𞓰', '𞓹'), ('𞣇', '𞣏'), ('𞥐', '𞥙'), ('𞱱', '𞲫'), ('𞲭', '𞲯'), ('𞲱', '𞲴'), ('𞴁', '𞴭'), ('𞴯', '𞴽'), ('🄀', '🄌'), ('🯰', '🯹'), ]; pub const OPEN_PUNCTUATION: &'static [(char, char)] = &[ ('(', '('), ('[', '['), ('{', '{'), ('༺', '༺'), ('༼', '༼'), ('᚛', '᚛'), ('‚', '‚'), ('„', '„'), ('⁅', '⁅'), ('⁽', '⁽'), ('₍', '₍'), ('⌈', '⌈'), ('⌊', '⌊'), ('〈', '〈'), ('❨', '❨'), ('❪', '❪'), ('❬', '❬'), ('❮', '❮'), ('❰', '❰'), ('❲', '❲'), ('❴', '❴'), ('⟅', '⟅'), ('⟦', '⟦'), ('⟨', 
'⟨'), ('⟪', '⟪'), ('⟬', '⟬'), ('⟮', '⟮'), ('⦃', '⦃'), ('⦅', '⦅'), ('⦇', '⦇'), ('⦉', '⦉'), ('⦋', '⦋'), ('⦍', '⦍'), ('⦏', '⦏'), ('⦑', '⦑'), ('⦓', '⦓'), ('⦕', '⦕'), ('⦗', '⦗'), ('⧘', '⧘'), ('⧚', '⧚'), ('⧼', '⧼'), ('⸢', '⸢'), ('⸤', '⸤'), ('⸦', '⸦'), ('⸨', '⸨'), ('⹂', '⹂'), ('⹕', '⹕'), ('⹗', '⹗'), ('⹙', '⹙'), ('⹛', '⹛'), ('〈', '〈'), ('《', '《'), ('「', '「'), ('『', '『'), ('【', '【'), ('〔', '〔'), ('〖', '〖'), ('〘', '〘'), ('〚', '〚'), ('〝', '〝'), ('﴿', '﴿'), ('︗', '︗'), ('︵', '︵'), ('︷', '︷'), ('︹', '︹'), ('︻', '︻'), ('︽', '︽'), ('︿', '︿'), ('﹁', '﹁'), ('﹃', '﹃'), ('﹇', '﹇'), ('﹙', '﹙'), ('﹛', '﹛'), ('﹝', '﹝'), ('(', '('), ('[', '['), ('{', '{'), ('⦅', '⦅'), ('「', '「'), ]; pub const OTHER: &'static [(char, char)] = &[ ('\0', '\u{1f}'), ('\u{7f}', '\u{9f}'), ('\u{ad}', '\u{ad}'), ('\u{378}', '\u{379}'), ('\u{380}', '\u{383}'), ('\u{38b}', '\u{38b}'), ('\u{38d}', '\u{38d}'), ('\u{3a2}', '\u{3a2}'), ('\u{530}', '\u{530}'), ('\u{557}', '\u{558}'), ('\u{58b}', '\u{58c}'), ('\u{590}', '\u{590}'), ('\u{5c8}', '\u{5cf}'), ('\u{5eb}', '\u{5ee}'), ('\u{5f5}', '\u{605}'), ('\u{61c}', '\u{61c}'), ('\u{6dd}', '\u{6dd}'), ('\u{70e}', '\u{70f}'), ('\u{74b}', '\u{74c}'), ('\u{7b2}', '\u{7bf}'), ('\u{7fb}', '\u{7fc}'), ('\u{82e}', '\u{82f}'), ('\u{83f}', '\u{83f}'), ('\u{85c}', '\u{85d}'), ('\u{85f}', '\u{85f}'), ('\u{86b}', '\u{86f}'), ('\u{88f}', '\u{897}'), ('\u{8e2}', '\u{8e2}'), ('\u{984}', '\u{984}'), ('\u{98d}', '\u{98e}'), ('\u{991}', '\u{992}'), ('\u{9a9}', '\u{9a9}'), ('\u{9b1}', '\u{9b1}'), ('\u{9b3}', '\u{9b5}'), ('\u{9ba}', '\u{9bb}'), ('\u{9c5}', '\u{9c6}'), ('\u{9c9}', '\u{9ca}'), ('\u{9cf}', '\u{9d6}'), ('\u{9d8}', '\u{9db}'), ('\u{9de}', '\u{9de}'), ('\u{9e4}', '\u{9e5}'), ('\u{9ff}', '\u{a00}'), ('\u{a04}', '\u{a04}'), ('\u{a0b}', '\u{a0e}'), ('\u{a11}', '\u{a12}'), ('\u{a29}', '\u{a29}'), ('\u{a31}', '\u{a31}'), ('\u{a34}', '\u{a34}'), ('\u{a37}', '\u{a37}'), ('\u{a3a}', '\u{a3b}'), ('\u{a3d}', '\u{a3d}'), ('\u{a43}', '\u{a46}'), ('\u{a49}', '\u{a4a}'), ('\u{a4e}', 
'\u{a50}'), ('\u{a52}', '\u{a58}'), ('\u{a5d}', '\u{a5d}'), ('\u{a5f}', '\u{a65}'), ('\u{a77}', '\u{a80}'), ('\u{a84}', '\u{a84}'), ('\u{a8e}', '\u{a8e}'), ('\u{a92}', '\u{a92}'), ('\u{aa9}', '\u{aa9}'), ('\u{ab1}', '\u{ab1}'), ('\u{ab4}', '\u{ab4}'), ('\u{aba}', '\u{abb}'), ('\u{ac6}', '\u{ac6}'), ('\u{aca}', '\u{aca}'), ('\u{ace}', '\u{acf}'), ('\u{ad1}', '\u{adf}'), ('\u{ae4}', '\u{ae5}'), ('\u{af2}', '\u{af8}'), ('\u{b00}', '\u{b00}'), ('\u{b04}', '\u{b04}'), ('\u{b0d}', '\u{b0e}'), ('\u{b11}', '\u{b12}'), ('\u{b29}', '\u{b29}'), ('\u{b31}', '\u{b31}'), ('\u{b34}', '\u{b34}'), ('\u{b3a}', '\u{b3b}'), ('\u{b45}', '\u{b46}'), ('\u{b49}', '\u{b4a}'), ('\u{b4e}', '\u{b54}'), ('\u{b58}', '\u{b5b}'), ('\u{b5e}', '\u{b5e}'), ('\u{b64}', '\u{b65}'), ('\u{b78}', '\u{b81}'), ('\u{b84}', '\u{b84}'), ('\u{b8b}', '\u{b8d}'), ('\u{b91}', '\u{b91}'), ('\u{b96}', '\u{b98}'), ('\u{b9b}', '\u{b9b}'), ('\u{b9d}', '\u{b9d}'), ('\u{ba0}', '\u{ba2}'), ('\u{ba5}', '\u{ba7}'), ('\u{bab}', '\u{bad}'), ('\u{bba}', '\u{bbd}'), ('\u{bc3}', '\u{bc5}'), ('\u{bc9}', '\u{bc9}'), ('\u{bce}', '\u{bcf}'), ('\u{bd1}', '\u{bd6}'), ('\u{bd8}', '\u{be5}'), ('\u{bfb}', '\u{bff}'), ('\u{c0d}', '\u{c0d}'), ('\u{c11}', '\u{c11}'), ('\u{c29}', '\u{c29}'), ('\u{c3a}', '\u{c3b}'), ('\u{c45}', '\u{c45}'), ('\u{c49}', '\u{c49}'), ('\u{c4e}', '\u{c54}'), ('\u{c57}', '\u{c57}'), ('\u{c5b}', '\u{c5c}'), ('\u{c5e}', '\u{c5f}'), ('\u{c64}', '\u{c65}'), ('\u{c70}', '\u{c76}'), ('\u{c8d}', '\u{c8d}'), ('\u{c91}', '\u{c91}'), ('\u{ca9}', '\u{ca9}'), ('\u{cb4}', '\u{cb4}'), ('\u{cba}', '\u{cbb}'), ('\u{cc5}', '\u{cc5}'), ('\u{cc9}', '\u{cc9}'), ('\u{cce}', '\u{cd4}'), ('\u{cd7}', '\u{cdc}'), ('\u{cdf}', '\u{cdf}'), ('\u{ce4}', '\u{ce5}'), ('\u{cf0}', '\u{cf0}'), ('\u{cf4}', '\u{cff}'), ('\u{d0d}', '\u{d0d}'), ('\u{d11}', '\u{d11}'), ('\u{d45}', '\u{d45}'), ('\u{d49}', '\u{d49}'), ('\u{d50}', '\u{d53}'), ('\u{d64}', '\u{d65}'), ('\u{d80}', '\u{d80}'), ('\u{d84}', '\u{d84}'), ('\u{d97}', '\u{d99}'), ('\u{db2}', 
'\u{db2}'), ('\u{dbc}', '\u{dbc}'), ('\u{dbe}', '\u{dbf}'), ('\u{dc7}', '\u{dc9}'), ('\u{dcb}', '\u{dce}'), ('\u{dd5}', '\u{dd5}'), ('\u{dd7}', '\u{dd7}'), ('\u{de0}', '\u{de5}'), ('\u{df0}', '\u{df1}'), ('\u{df5}', '\u{e00}'), ('\u{e3b}', '\u{e3e}'), ('\u{e5c}', '\u{e80}'), ('\u{e83}', '\u{e83}'), ('\u{e85}', '\u{e85}'), ('\u{e8b}', '\u{e8b}'), ('\u{ea4}', '\u{ea4}'), ('\u{ea6}', '\u{ea6}'), ('\u{ebe}', '\u{ebf}'), ('\u{ec5}', '\u{ec5}'), ('\u{ec7}', '\u{ec7}'), ('\u{ecf}', '\u{ecf}'), ('\u{eda}', '\u{edb}'), ('\u{ee0}', '\u{eff}'), ('\u{f48}', '\u{f48}'), ('\u{f6d}', '\u{f70}'), ('\u{f98}', '\u{f98}'), ('\u{fbd}', '\u{fbd}'), ('\u{fcd}', '\u{fcd}'), ('\u{fdb}', '\u{fff}'), ('\u{10c6}', '\u{10c6}'), ('\u{10c8}', '\u{10cc}'), ('\u{10ce}', '\u{10cf}'), ('\u{1249}', '\u{1249}'), ('\u{124e}', '\u{124f}'), ('\u{1257}', '\u{1257}'), ('\u{1259}', '\u{1259}'), ('\u{125e}', '\u{125f}'), ('\u{1289}', '\u{1289}'), ('\u{128e}', '\u{128f}'), ('\u{12b1}', '\u{12b1}'), ('\u{12b6}', '\u{12b7}'), ('\u{12bf}', '\u{12bf}'), ('\u{12c1}', '\u{12c1}'), ('\u{12c6}', '\u{12c7}'), ('\u{12d7}', '\u{12d7}'), ('\u{1311}', '\u{1311}'), ('\u{1316}', '\u{1317}'), ('\u{135b}', '\u{135c}'), ('\u{137d}', '\u{137f}'), ('\u{139a}', '\u{139f}'), ('\u{13f6}', '\u{13f7}'), ('\u{13fe}', '\u{13ff}'), ('\u{169d}', '\u{169f}'), ('\u{16f9}', '\u{16ff}'), ('\u{1716}', '\u{171e}'), ('\u{1737}', '\u{173f}'), ('\u{1754}', '\u{175f}'), ('\u{176d}', '\u{176d}'), ('\u{1771}', '\u{1771}'), ('\u{1774}', '\u{177f}'), ('\u{17de}', '\u{17df}'), ('\u{17ea}', '\u{17ef}'), ('\u{17fa}', '\u{17ff}'), ('\u{180e}', '\u{180e}'), ('\u{181a}', '\u{181f}'), ('\u{1879}', '\u{187f}'), ('\u{18ab}', '\u{18af}'), ('\u{18f6}', '\u{18ff}'), ('\u{191f}', '\u{191f}'), ('\u{192c}', '\u{192f}'), ('\u{193c}', '\u{193f}'), ('\u{1941}', '\u{1943}'), ('\u{196e}', '\u{196f}'), ('\u{1975}', '\u{197f}'), ('\u{19ac}', '\u{19af}'), ('\u{19ca}', '\u{19cf}'), ('\u{19db}', '\u{19dd}'), ('\u{1a1c}', '\u{1a1d}'), ('\u{1a5f}', '\u{1a5f}'), ('\u{1a7d}', 
'\u{1a7e}'), ('\u{1a8a}', '\u{1a8f}'), ('\u{1a9a}', '\u{1a9f}'), ('\u{1aae}', '\u{1aaf}'), ('\u{1acf}', '\u{1aff}'), ('\u{1b4d}', '\u{1b4f}'), ('\u{1b7f}', '\u{1b7f}'), ('\u{1bf4}', '\u{1bfb}'), ('\u{1c38}', '\u{1c3a}'), ('\u{1c4a}', '\u{1c4c}'), ('\u{1c89}', '\u{1c8f}'), ('\u{1cbb}', '\u{1cbc}'), ('\u{1cc8}', '\u{1ccf}'), ('\u{1cfb}', '\u{1cff}'), ('\u{1f16}', '\u{1f17}'), ('\u{1f1e}', '\u{1f1f}'), ('\u{1f46}', '\u{1f47}'), ('\u{1f4e}', '\u{1f4f}'), ('\u{1f58}', '\u{1f58}'), ('\u{1f5a}', '\u{1f5a}'), ('\u{1f5c}', '\u{1f5c}'), ('\u{1f5e}', '\u{1f5e}'), ('\u{1f7e}', '\u{1f7f}'), ('\u{1fb5}', '\u{1fb5}'), ('\u{1fc5}', '\u{1fc5}'), ('\u{1fd4}', '\u{1fd5}'), ('\u{1fdc}', '\u{1fdc}'), ('\u{1ff0}', '\u{1ff1}'), ('\u{1ff5}', '\u{1ff5}'), ('\u{1fff}', '\u{1fff}'), ('\u{200b}', '\u{200f}'), ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{206f}'), ('\u{2072}', '\u{2073}'), ('\u{208f}', '\u{208f}'), ('\u{209d}', '\u{209f}'), ('\u{20c1}', '\u{20cf}'), ('\u{20f1}', '\u{20ff}'), ('\u{218c}', '\u{218f}'), ('\u{2427}', '\u{243f}'), ('\u{244b}', '\u{245f}'), ('\u{2b74}', '\u{2b75}'), ('\u{2b96}', '\u{2b96}'), ('\u{2cf4}', '\u{2cf8}'), ('\u{2d26}', '\u{2d26}'), ('\u{2d28}', '\u{2d2c}'), ('\u{2d2e}', '\u{2d2f}'), ('\u{2d68}', '\u{2d6e}'), ('\u{2d71}', '\u{2d7e}'), ('\u{2d97}', '\u{2d9f}'), ('\u{2da7}', '\u{2da7}'), ('\u{2daf}', '\u{2daf}'), ('\u{2db7}', '\u{2db7}'), ('\u{2dbf}', '\u{2dbf}'), ('\u{2dc7}', '\u{2dc7}'), ('\u{2dcf}', '\u{2dcf}'), ('\u{2dd7}', '\u{2dd7}'), ('\u{2ddf}', '\u{2ddf}'), ('\u{2e5e}', '\u{2e7f}'), ('\u{2e9a}', '\u{2e9a}'), ('\u{2ef4}', '\u{2eff}'), ('\u{2fd6}', '\u{2fef}'), ('\u{2ffc}', '\u{2fff}'), ('\u{3040}', '\u{3040}'), ('\u{3097}', '\u{3098}'), ('\u{3100}', '\u{3104}'), ('\u{3130}', '\u{3130}'), ('\u{318f}', '\u{318f}'), ('\u{31e4}', '\u{31ef}'), ('\u{321f}', '\u{321f}'), ('\u{a48d}', '\u{a48f}'), ('\u{a4c7}', '\u{a4cf}'), ('\u{a62c}', '\u{a63f}'), ('\u{a6f8}', '\u{a6ff}'), ('\u{a7cb}', '\u{a7cf}'), ('\u{a7d2}', '\u{a7d2}'), ('\u{a7d4}', '\u{a7d4}'), 
('\u{a7da}', '\u{a7f1}'), ('\u{a82d}', '\u{a82f}'), ('\u{a83a}', '\u{a83f}'), ('\u{a878}', '\u{a87f}'), ('\u{a8c6}', '\u{a8cd}'), ('\u{a8da}', '\u{a8df}'), ('\u{a954}', '\u{a95e}'), ('\u{a97d}', '\u{a97f}'), ('\u{a9ce}', '\u{a9ce}'), ('\u{a9da}', '\u{a9dd}'), ('\u{a9ff}', '\u{a9ff}'), ('\u{aa37}', '\u{aa3f}'), ('\u{aa4e}', '\u{aa4f}'), ('\u{aa5a}', '\u{aa5b}'), ('\u{aac3}', '\u{aada}'), ('\u{aaf7}', '\u{ab00}'), ('\u{ab07}', '\u{ab08}'), ('\u{ab0f}', '\u{ab10}'), ('\u{ab17}', '\u{ab1f}'), ('\u{ab27}', '\u{ab27}'), ('\u{ab2f}', '\u{ab2f}'), ('\u{ab6c}', '\u{ab6f}'), ('\u{abee}', '\u{abef}'), ('\u{abfa}', '\u{abff}'), ('\u{d7a4}', '\u{d7af}'), ('\u{d7c7}', '\u{d7ca}'), ('\u{d7fc}', '\u{f8ff}'), ('\u{fa6e}', '\u{fa6f}'), ('\u{fada}', '\u{faff}'), ('\u{fb07}', '\u{fb12}'), ('\u{fb18}', '\u{fb1c}'), ('\u{fb37}', '\u{fb37}'), ('\u{fb3d}', '\u{fb3d}'), ('\u{fb3f}', '\u{fb3f}'), ('\u{fb42}', '\u{fb42}'), ('\u{fb45}', '\u{fb45}'), ('\u{fbc3}', '\u{fbd2}'), ('\u{fd90}', '\u{fd91}'), ('\u{fdc8}', '\u{fdce}'), ('\u{fdd0}', '\u{fdef}'), ('\u{fe1a}', '\u{fe1f}'), ('\u{fe53}', '\u{fe53}'), ('\u{fe67}', '\u{fe67}'), ('\u{fe6c}', '\u{fe6f}'), ('\u{fe75}', '\u{fe75}'), ('\u{fefd}', '\u{ff00}'), ('\u{ffbf}', '\u{ffc1}'), ('\u{ffc8}', '\u{ffc9}'), ('\u{ffd0}', '\u{ffd1}'), ('\u{ffd8}', '\u{ffd9}'), ('\u{ffdd}', '\u{ffdf}'), ('\u{ffe7}', '\u{ffe7}'), ('\u{ffef}', '\u{fffb}'), ('\u{fffe}', '\u{ffff}'), ('\u{1000c}', '\u{1000c}'), ('\u{10027}', '\u{10027}'), ('\u{1003b}', '\u{1003b}'), ('\u{1003e}', '\u{1003e}'), ('\u{1004e}', '\u{1004f}'), ('\u{1005e}', '\u{1007f}'), ('\u{100fb}', '\u{100ff}'), ('\u{10103}', '\u{10106}'), ('\u{10134}', '\u{10136}'), ('\u{1018f}', '\u{1018f}'), ('\u{1019d}', '\u{1019f}'), ('\u{101a1}', '\u{101cf}'), ('\u{101fe}', '\u{1027f}'), ('\u{1029d}', '\u{1029f}'), ('\u{102d1}', '\u{102df}'), ('\u{102fc}', '\u{102ff}'), ('\u{10324}', '\u{1032c}'), ('\u{1034b}', '\u{1034f}'), ('\u{1037b}', '\u{1037f}'), ('\u{1039e}', '\u{1039e}'), ('\u{103c4}', '\u{103c7}'), 
('\u{103d6}', '\u{103ff}'), ('\u{1049e}', '\u{1049f}'), ('\u{104aa}', '\u{104af}'), ('\u{104d4}', '\u{104d7}'), ('\u{104fc}', '\u{104ff}'), ('\u{10528}', '\u{1052f}'), ('\u{10564}', '\u{1056e}'), ('\u{1057b}', '\u{1057b}'), ('\u{1058b}', '\u{1058b}'), ('\u{10593}', '\u{10593}'), ('\u{10596}', '\u{10596}'), ('\u{105a2}', '\u{105a2}'), ('\u{105b2}', '\u{105b2}'), ('\u{105ba}', '\u{105ba}'), ('\u{105bd}', '\u{105ff}'), ('\u{10737}', '\u{1073f}'), ('\u{10756}', '\u{1075f}'), ('\u{10768}', '\u{1077f}'), ('\u{10786}', '\u{10786}'), ('\u{107b1}', '\u{107b1}'), ('\u{107bb}', '\u{107ff}'), ('\u{10806}', '\u{10807}'), ('\u{10809}', '\u{10809}'), ('\u{10836}', '\u{10836}'), ('\u{10839}', '\u{1083b}'), ('\u{1083d}', '\u{1083e}'), ('\u{10856}', '\u{10856}'), ('\u{1089f}', '\u{108a6}'), ('\u{108b0}', '\u{108df}'), ('\u{108f3}', '\u{108f3}'), ('\u{108f6}', '\u{108fa}'), ('\u{1091c}', '\u{1091e}'), ('\u{1093a}', '\u{1093e}'), ('\u{10940}', '\u{1097f}'), ('\u{109b8}', '\u{109bb}'), ('\u{109d0}', '\u{109d1}'), ('\u{10a04}', '\u{10a04}'), ('\u{10a07}', '\u{10a0b}'), ('\u{10a14}', '\u{10a14}'), ('\u{10a18}', '\u{10a18}'), ('\u{10a36}', '\u{10a37}'), ('\u{10a3b}', '\u{10a3e}'), ('\u{10a49}', '\u{10a4f}'), ('\u{10a59}', '\u{10a5f}'), ('\u{10aa0}', '\u{10abf}'), ('\u{10ae7}', '\u{10aea}'), ('\u{10af7}', '\u{10aff}'), ('\u{10b36}', '\u{10b38}'), ('\u{10b56}', '\u{10b57}'), ('\u{10b73}', '\u{10b77}'), ('\u{10b92}', '\u{10b98}'), ('\u{10b9d}', '\u{10ba8}'), ('\u{10bb0}', '\u{10bff}'), ('\u{10c49}', '\u{10c7f}'), ('\u{10cb3}', '\u{10cbf}'), ('\u{10cf3}', '\u{10cf9}'), ('\u{10d28}', '\u{10d2f}'), ('\u{10d3a}', '\u{10e5f}'), ('\u{10e7f}', '\u{10e7f}'), ('\u{10eaa}', '\u{10eaa}'), ('\u{10eae}', '\u{10eaf}'), ('\u{10eb2}', '\u{10efc}'), ('\u{10f28}', '\u{10f2f}'), ('\u{10f5a}', '\u{10f6f}'), ('\u{10f8a}', '\u{10faf}'), ('\u{10fcc}', '\u{10fdf}'), ('\u{10ff7}', '\u{10fff}'), ('\u{1104e}', '\u{11051}'), ('\u{11076}', '\u{1107e}'), ('\u{110bd}', '\u{110bd}'), ('\u{110c3}', '\u{110cf}'), 
('\u{110e9}', '\u{110ef}'), ('\u{110fa}', '\u{110ff}'), ('\u{11135}', '\u{11135}'), ('\u{11148}', '\u{1114f}'), ('\u{11177}', '\u{1117f}'), ('\u{111e0}', '\u{111e0}'), ('\u{111f5}', '\u{111ff}'), ('\u{11212}', '\u{11212}'), ('\u{11242}', '\u{1127f}'), ('\u{11287}', '\u{11287}'), ('\u{11289}', '\u{11289}'), ('\u{1128e}', '\u{1128e}'), ('\u{1129e}', '\u{1129e}'), ('\u{112aa}', '\u{112af}'), ('\u{112eb}', '\u{112ef}'), ('\u{112fa}', '\u{112ff}'), ('\u{11304}', '\u{11304}'), ('\u{1130d}', '\u{1130e}'), ('\u{11311}', '\u{11312}'), ('\u{11329}', '\u{11329}'), ('\u{11331}', '\u{11331}'), ('\u{11334}', '\u{11334}'), ('\u{1133a}', '\u{1133a}'), ('\u{11345}', '\u{11346}'), ('\u{11349}', '\u{1134a}'), ('\u{1134e}', '\u{1134f}'), ('\u{11351}', '\u{11356}'), ('\u{11358}', '\u{1135c}'), ('\u{11364}', '\u{11365}'), ('\u{1136d}', '\u{1136f}'), ('\u{11375}', '\u{113ff}'), ('\u{1145c}', '\u{1145c}'), ('\u{11462}', '\u{1147f}'), ('\u{114c8}', '\u{114cf}'), ('\u{114da}', '\u{1157f}'), ('\u{115b6}', '\u{115b7}'), ('\u{115de}', '\u{115ff}'), ('\u{11645}', '\u{1164f}'), ('\u{1165a}', '\u{1165f}'), ('\u{1166d}', '\u{1167f}'), ('\u{116ba}', '\u{116bf}'), ('\u{116ca}', '\u{116ff}'), ('\u{1171b}', '\u{1171c}'), ('\u{1172c}', '\u{1172f}'), ('\u{11747}', '\u{117ff}'), ('\u{1183c}', '\u{1189f}'), ('\u{118f3}', '\u{118fe}'), ('\u{11907}', '\u{11908}'), ('\u{1190a}', '\u{1190b}'), ('\u{11914}', '\u{11914}'), ('\u{11917}', '\u{11917}'), ('\u{11936}', '\u{11936}'), ('\u{11939}', '\u{1193a}'), ('\u{11947}', '\u{1194f}'), ('\u{1195a}', '\u{1199f}'), ('\u{119a8}', '\u{119a9}'), ('\u{119d8}', '\u{119d9}'), ('\u{119e5}', '\u{119ff}'), ('\u{11a48}', '\u{11a4f}'), ('\u{11aa3}', '\u{11aaf}'), ('\u{11af9}', '\u{11aff}'), ('\u{11b0a}', '\u{11bff}'), ('\u{11c09}', '\u{11c09}'), ('\u{11c37}', '\u{11c37}'), ('\u{11c46}', '\u{11c4f}'), ('\u{11c6d}', '\u{11c6f}'), ('\u{11c90}', '\u{11c91}'), ('\u{11ca8}', '\u{11ca8}'), ('\u{11cb7}', '\u{11cff}'), ('\u{11d07}', '\u{11d07}'), ('\u{11d0a}', '\u{11d0a}'), 
('\u{11d37}', '\u{11d39}'), ('\u{11d3b}', '\u{11d3b}'), ('\u{11d3e}', '\u{11d3e}'), ('\u{11d48}', '\u{11d4f}'), ('\u{11d5a}', '\u{11d5f}'), ('\u{11d66}', '\u{11d66}'), ('\u{11d69}', '\u{11d69}'), ('\u{11d8f}', '\u{11d8f}'), ('\u{11d92}', '\u{11d92}'), ('\u{11d99}', '\u{11d9f}'), ('\u{11daa}', '\u{11edf}'), ('\u{11ef9}', '\u{11eff}'), ('\u{11f11}', '\u{11f11}'), ('\u{11f3b}', '\u{11f3d}'), ('\u{11f5a}', '\u{11faf}'), ('\u{11fb1}', '\u{11fbf}'), ('\u{11ff2}', '\u{11ffe}'), ('\u{1239a}', '\u{123ff}'), ('\u{1246f}', '\u{1246f}'), ('\u{12475}', '\u{1247f}'), ('\u{12544}', '\u{12f8f}'), ('\u{12ff3}', '\u{12fff}'), ('\u{13430}', '\u{1343f}'), ('\u{13456}', '\u{143ff}'), ('\u{14647}', '\u{167ff}'), ('\u{16a39}', '\u{16a3f}'), ('\u{16a5f}', '\u{16a5f}'), ('\u{16a6a}', '\u{16a6d}'), ('\u{16abf}', '\u{16abf}'), ('\u{16aca}', '\u{16acf}'), ('\u{16aee}', '\u{16aef}'), ('\u{16af6}', '\u{16aff}'), ('\u{16b46}', '\u{16b4f}'), ('\u{16b5a}', '\u{16b5a}'), ('\u{16b62}', '\u{16b62}'), ('\u{16b78}', '\u{16b7c}'), ('\u{16b90}', '\u{16e3f}'), ('\u{16e9b}', '\u{16eff}'), ('\u{16f4b}', '\u{16f4e}'), ('\u{16f88}', '\u{16f8e}'), ('\u{16fa0}', '\u{16fdf}'), ('\u{16fe5}', '\u{16fef}'), ('\u{16ff2}', '\u{16fff}'), ('\u{187f8}', '\u{187ff}'), ('\u{18cd6}', '\u{18cff}'), ('\u{18d09}', '\u{1afef}'), ('\u{1aff4}', '\u{1aff4}'), ('\u{1affc}', '\u{1affc}'), ('\u{1afff}', '\u{1afff}'), ('\u{1b123}', '\u{1b131}'), ('\u{1b133}', '\u{1b14f}'), ('\u{1b153}', '\u{1b154}'), ('\u{1b156}', '\u{1b163}'), ('\u{1b168}', '\u{1b16f}'), ('\u{1b2fc}', '\u{1bbff}'), ('\u{1bc6b}', '\u{1bc6f}'), ('\u{1bc7d}', '\u{1bc7f}'), ('\u{1bc89}', '\u{1bc8f}'), ('\u{1bc9a}', '\u{1bc9b}'), ('\u{1bca0}', '\u{1ceff}'), ('\u{1cf2e}', '\u{1cf2f}'), ('\u{1cf47}', '\u{1cf4f}'), ('\u{1cfc4}', '\u{1cfff}'), ('\u{1d0f6}', '\u{1d0ff}'), ('\u{1d127}', '\u{1d128}'), ('\u{1d173}', '\u{1d17a}'), ('\u{1d1eb}', '\u{1d1ff}'), ('\u{1d246}', '\u{1d2bf}'), ('\u{1d2d4}', '\u{1d2df}'), ('\u{1d2f4}', '\u{1d2ff}'), ('\u{1d357}', '\u{1d35f}'), 
('\u{1d379}', '\u{1d3ff}'), ('\u{1d455}', '\u{1d455}'), ('\u{1d49d}', '\u{1d49d}'), ('\u{1d4a0}', '\u{1d4a1}'), ('\u{1d4a3}', '\u{1d4a4}'), ('\u{1d4a7}', '\u{1d4a8}'), ('\u{1d4ad}', '\u{1d4ad}'), ('\u{1d4ba}', '\u{1d4ba}'), ('\u{1d4bc}', '\u{1d4bc}'), ('\u{1d4c4}', '\u{1d4c4}'), ('\u{1d506}', '\u{1d506}'), ('\u{1d50b}', '\u{1d50c}'), ('\u{1d515}', '\u{1d515}'), ('\u{1d51d}', '\u{1d51d}'), ('\u{1d53a}', '\u{1d53a}'), ('\u{1d53f}', '\u{1d53f}'), ('\u{1d545}', '\u{1d545}'), ('\u{1d547}', '\u{1d549}'), ('\u{1d551}', '\u{1d551}'), ('\u{1d6a6}', '\u{1d6a7}'), ('\u{1d7cc}', '\u{1d7cd}'), ('\u{1da8c}', '\u{1da9a}'), ('\u{1daa0}', '\u{1daa0}'), ('\u{1dab0}', '\u{1deff}'), ('\u{1df1f}', '\u{1df24}'), ('\u{1df2b}', '\u{1dfff}'), ('\u{1e007}', '\u{1e007}'), ('\u{1e019}', '\u{1e01a}'), ('\u{1e022}', '\u{1e022}'), ('\u{1e025}', '\u{1e025}'), ('\u{1e02b}', '\u{1e02f}'), ('\u{1e06e}', '\u{1e08e}'), ('\u{1e090}', '\u{1e0ff}'), ('\u{1e12d}', '\u{1e12f}'), ('\u{1e13e}', '\u{1e13f}'), ('\u{1e14a}', '\u{1e14d}'), ('\u{1e150}', '\u{1e28f}'), ('\u{1e2af}', '\u{1e2bf}'), ('\u{1e2fa}', '\u{1e2fe}'), ('\u{1e300}', '\u{1e4cf}'), ('\u{1e4fa}', '\u{1e7df}'), ('\u{1e7e7}', '\u{1e7e7}'), ('\u{1e7ec}', '\u{1e7ec}'), ('\u{1e7ef}', '\u{1e7ef}'), ('\u{1e7ff}', '\u{1e7ff}'), ('\u{1e8c5}', '\u{1e8c6}'), ('\u{1e8d7}', '\u{1e8ff}'), ('\u{1e94c}', '\u{1e94f}'), ('\u{1e95a}', '\u{1e95d}'), ('\u{1e960}', '\u{1ec70}'), ('\u{1ecb5}', '\u{1ed00}'), ('\u{1ed3e}', '\u{1edff}'), ('\u{1ee04}', '\u{1ee04}'), ('\u{1ee20}', '\u{1ee20}'), ('\u{1ee23}', '\u{1ee23}'), ('\u{1ee25}', '\u{1ee26}'), ('\u{1ee28}', '\u{1ee28}'), ('\u{1ee33}', '\u{1ee33}'), ('\u{1ee38}', '\u{1ee38}'), ('\u{1ee3a}', '\u{1ee3a}'), ('\u{1ee3c}', '\u{1ee41}'), ('\u{1ee43}', '\u{1ee46}'), ('\u{1ee48}', '\u{1ee48}'), ('\u{1ee4a}', '\u{1ee4a}'), ('\u{1ee4c}', '\u{1ee4c}'), ('\u{1ee50}', '\u{1ee50}'), ('\u{1ee53}', '\u{1ee53}'), ('\u{1ee55}', '\u{1ee56}'), ('\u{1ee58}', '\u{1ee58}'), ('\u{1ee5a}', '\u{1ee5a}'), ('\u{1ee5c}', '\u{1ee5c}'), 
('\u{1ee5e}', '\u{1ee5e}'), ('\u{1ee60}', '\u{1ee60}'), ('\u{1ee63}', '\u{1ee63}'), ('\u{1ee65}', '\u{1ee66}'), ('\u{1ee6b}', '\u{1ee6b}'), ('\u{1ee73}', '\u{1ee73}'), ('\u{1ee78}', '\u{1ee78}'), ('\u{1ee7d}', '\u{1ee7d}'), ('\u{1ee7f}', '\u{1ee7f}'), ('\u{1ee8a}', '\u{1ee8a}'), ('\u{1ee9c}', '\u{1eea0}'), ('\u{1eea4}', '\u{1eea4}'), ('\u{1eeaa}', '\u{1eeaa}'), ('\u{1eebc}', '\u{1eeef}'), ('\u{1eef2}', '\u{1efff}'), ('\u{1f02c}', '\u{1f02f}'), ('\u{1f094}', '\u{1f09f}'), ('\u{1f0af}', '\u{1f0b0}'), ('\u{1f0c0}', '\u{1f0c0}'), ('\u{1f0d0}', '\u{1f0d0}'), ('\u{1f0f6}', '\u{1f0ff}'), ('\u{1f1ae}', '\u{1f1e5}'), ('\u{1f203}', '\u{1f20f}'), ('\u{1f23c}', '\u{1f23f}'), ('\u{1f249}', '\u{1f24f}'), ('\u{1f252}', '\u{1f25f}'), ('\u{1f266}', '\u{1f2ff}'), ('\u{1f6d8}', '\u{1f6db}'), ('\u{1f6ed}', '\u{1f6ef}'), ('\u{1f6fd}', '\u{1f6ff}'), ('\u{1f777}', '\u{1f77a}'), ('\u{1f7da}', '\u{1f7df}'), ('\u{1f7ec}', '\u{1f7ef}'), ('\u{1f7f1}', '\u{1f7ff}'), ('\u{1f80c}', '\u{1f80f}'), ('\u{1f848}', '\u{1f84f}'), ('\u{1f85a}', '\u{1f85f}'), ('\u{1f888}', '\u{1f88f}'), ('\u{1f8ae}', '\u{1f8af}'), ('\u{1f8b2}', '\u{1f8ff}'), ('\u{1fa54}', '\u{1fa5f}'), ('\u{1fa6e}', '\u{1fa6f}'), ('\u{1fa7d}', '\u{1fa7f}'), ('\u{1fa89}', '\u{1fa8f}'), ('\u{1fabe}', '\u{1fabe}'), ('\u{1fac6}', '\u{1facd}'), ('\u{1fadc}', '\u{1fadf}'), ('\u{1fae9}', '\u{1faef}'), ('\u{1faf9}', '\u{1faff}'), ('\u{1fb93}', '\u{1fb93}'), ('\u{1fbcb}', '\u{1fbef}'), ('\u{1fbfa}', '\u{1ffff}'), ('\u{2a6e0}', '\u{2a6ff}'), ('\u{2b73a}', '\u{2b73f}'), ('\u{2b81e}', '\u{2b81f}'), ('\u{2cea2}', '\u{2ceaf}'), ('\u{2ebe1}', '\u{2f7ff}'), ('\u{2fa1e}', '\u{2ffff}'), ('\u{3134b}', '\u{3134f}'), ('\u{323b0}', '\u{e00ff}'), ('\u{e01f0}', '\u{10ffff}'), ]; pub const OTHER_LETTER: &'static [(char, char)] = &[ ('ª', 'ª'), ('º', 'º'), ('ƻ', 'ƻ'), ('ǀ', 'ǃ'), ('ʔ', 'ʔ'), ('א', 'ת'), ('ׯ', 'ײ'), ('ؠ', 'ؿ'), ('ف', 'ي'), ('ٮ', 'ٯ'), ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('ߊ', 
'ߪ'), ('ࠀ', 'ࠕ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('ࢠ', 'ࣈ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॲ', 'ঀ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೝ', 'ೞ'), ('ೠ', 'ೡ'), ('ೱ', 'ೲ'), ('ഄ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), ('เ', 'ๅ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('ᄀ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛱ', 'ᛸ'), ('ᜀ', 'ᜑ'), ('ᜟ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៜ', 'ៜ'), ('ᠠ', 'ᡂ'), ('ᡄ', 'ᡸ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭌ'), 
('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱷ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᳺ', 'ᳺ'), ('ℵ', 'ℸ'), ('ⴰ', 'ⵧ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('〆', '〆'), ('〼', '〼'), ('ぁ', 'ゖ'), ('ゟ', 'ゟ'), ('ァ', 'ヺ'), ('ヿ', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꀔ'), ('ꀖ', 'ꒌ'), ('ꓐ', 'ꓷ'), ('ꔀ', 'ꘋ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('ꙮ', 'ꙮ'), ('ꚠ', 'ꛥ'), ('ꞏ', 'ꞏ'), ('ꟷ', 'ꟷ'), ('ꟻ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣾ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧠ', 'ꧤ'), ('ꧧ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩯ'), ('ꩱ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫜ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫲ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꯀ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('יִ', 'יִ'), ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('ヲ', 'ッ'), ('ア', 'ン'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍀'), ('𐍂', '𐍉'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐑐', '𐒝'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐴀', '𐴣'), ('𐺀', '𐺩'), ('𐺰', '𐺱'), ('𐼀', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '𐽅'), ('𐽰', '𐾁'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀃', '𑀷'), ('𑁱', '𑁲'), ('𑁵', 
'𑁵'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅄', '𑅄'), ('𑅇', '𑅇'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑈿', '𑉀'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑑟', '𑑡'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑚸', '𑚸'), ('𑜀', '𑜚'), ('𑝀', '𑝆'), ('𑠀', '𑠫'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤯'), ('𑤿', '𑤿'), ('𑥁', '𑥁'), ('𑦠', '𑦧'), ('𑦪', '𑧐'), ('𑧡', '𑧡'), ('𑧣', '𑧣'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪉'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶉'), ('𑶘', '𑶘'), ('𑻠', '𑻲'), ('𑼂', '𑼂'), ('𑼄', '𑼐'), ('𑼒', '𑼳'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('𓑁', '𓑆'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩰', '𖪾'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖼀', '𖽊'), ('𖽐', '𖽐'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𝼊', '𝼊'), ('𞄀', '𞄬'), ('𞅎', '𞅎'), ('𞊐', '𞊭'), ('𞋀', '𞋫'), ('𞓐', '𞓪'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const OTHER_NUMBER: &'static [(char, char)] = &[ ('²', '³'), ('¹', '¹'), ('¼', 
'¾'), ('৴', '৹'), ('୲', '୷'), ('௰', '௲'), ('౸', '౾'), ('൘', '൞'), ('൰', '൸'), ('༪', '༳'), ('፩', '፼'), ('៰', '៹'), ('᧚', '᧚'), ('⁰', '⁰'), ('⁴', '⁹'), ('₀', '₉'), ('⅐', '⅟'), ('↉', '↉'), ('①', '⒛'), ('⓪', '⓿'), ('❶', '➓'), ('⳽', '⳽'), ('㆒', '㆕'), ('㈠', '㈩'), ('㉈', '㉏'), ('㉑', '㉟'), ('㊀', '㊉'), ('㊱', '㊿'), ('꠰', '꠵'), ('𐄇', '𐄳'), ('𐅵', '𐅸'), ('𐆊', '𐆋'), ('𐋡', '𐋻'), ('𐌠', '𐌣'), ('𐡘', '𐡟'), ('𐡹', '𐡿'), ('𐢧', '𐢯'), ('𐣻', '𐣿'), ('𐤖', '𐤛'), ('𐦼', '𐦽'), ('𐧀', '𐧏'), ('𐧒', '𐧿'), ('𐩀', '𐩈'), ('𐩽', '𐩾'), ('𐪝', '𐪟'), ('𐫫', '𐫯'), ('𐭘', '𐭟'), ('𐭸', '𐭿'), ('𐮩', '𐮯'), ('𐳺', '𐳿'), ('𐹠', '𐹾'), ('𐼝', '𐼦'), ('𐽑', '𐽔'), ('𐿅', '𐿋'), ('𑁒', '𑁥'), ('𑇡', '𑇴'), ('𑜺', '𑜻'), ('𑣪', '𑣲'), ('𑱚', '𑱬'), ('𑿀', '𑿔'), ('𖭛', '𖭡'), ('𖺀', '𖺖'), ('𝋀', '𝋓'), ('𝋠', '𝋳'), ('𝍠', '𝍸'), ('𞣇', '𞣏'), ('𞱱', '𞲫'), ('𞲭', '𞲯'), ('𞲱', '𞲴'), ('𞴁', '𞴭'), ('𞴯', '𞴽'), ('🄀', '🄌'), ]; pub const OTHER_PUNCTUATION: &'static [(char, char)] = &[ ('!', '#'), ('%', '\''), ('*', '*'), (',', ','), ('.', '/'), (':', ';'), ('?', '@'), ('\\', '\\'), ('¡', '¡'), ('§', '§'), ('¶', '·'), ('¿', '¿'), (';', ';'), ('·', '·'), ('՚', '՟'), ('։', '։'), ('׀', '׀'), ('׃', '׃'), ('׆', '׆'), ('׳', '״'), ('؉', '؊'), ('،', '؍'), ('؛', '؛'), ('؝', '؟'), ('٪', '٭'), ('۔', '۔'), ('܀', '܍'), ('߷', '߹'), ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), ('॰', '॰'), ('৽', '৽'), ('੶', '੶'), ('૰', '૰'), ('౷', '౷'), ('಄', '಄'), ('෴', '෴'), ('๏', '๏'), ('๚', '๛'), ('༄', '༒'), ('༔', '༔'), ('྅', '྅'), ('࿐', '࿔'), ('࿙', '࿚'), ('၊', '၏'), ('჻', '჻'), ('፠', '፨'), ('᙮', '᙮'), ('᛫', '᛭'), ('᜵', '᜶'), ('។', '៖'), ('៘', '៚'), ('᠀', '᠅'), ('᠇', '᠊'), ('᥄', '᥅'), ('᨞', '᨟'), ('᪠', '᪦'), ('᪨', '᪭'), ('᭚', '᭠'), ('᭽', '᭾'), ('᯼', '᯿'), ('᰻', '᰿'), ('᱾', '᱿'), ('᳀', '᳇'), ('᳓', '᳓'), ('‖', '‗'), ('†', '‧'), ('‰', '‸'), ('※', '‾'), ('⁁', '⁃'), ('⁇', '⁑'), ('⁓', '⁓'), ('⁕', '⁞'), ('⳹', '⳼'), ('⳾', '⳿'), ('⵰', '⵰'), ('⸀', '⸁'), ('⸆', '⸈'), ('⸋', '⸋'), ('⸎', '⸖'), ('⸘', '⸙'), ('⸛', '⸛'), ('⸞', '⸟'), ('⸪', '⸮'), ('⸰', '⸹'), ('⸼', '⸿'), ('⹁', '⹁'), ('⹃', '⹏'), ('⹒', '⹔'), ('、', '〃'), ('〽', 
'〽'), ('・', '・'), ('꓾', '꓿'), ('꘍', '꘏'), ('꙳', '꙳'), ('꙾', '꙾'), ('꛲', '꛷'), ('꡴', '꡷'), ('꣎', '꣏'), ('꣸', '꣺'), ('꣼', '꣼'), ('꤮', '꤯'), ('꥟', '꥟'), ('꧁', '꧍'), ('꧞', '꧟'), ('꩜', '꩟'), ('꫞', '꫟'), ('꫰', '꫱'), ('꯫', '꯫'), ('︐', '︖'), ('︙', '︙'), ('︰', '︰'), ('﹅', '﹆'), ('﹉', '﹌'), ('﹐', '﹒'), ('﹔', '﹗'), ('﹟', '﹡'), ('﹨', '﹨'), ('﹪', '﹫'), ('!', '#'), ('%', '''), ('*', '*'), (',', ','), ('.', '/'), (':', ';'), ('?', '@'), ('\', '\'), ('。', '。'), ('、', '・'), ('𐄀', '𐄂'), ('𐎟', '𐎟'), ('𐏐', '𐏐'), ('𐕯', '𐕯'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), ('𐤿', '𐤿'), ('𐩐', '𐩘'), ('𐩿', '𐩿'), ('𐫰', '𐫶'), ('𐬹', '𐬿'), ('𐮙', '𐮜'), ('𐽕', '𐽙'), ('𐾆', '𐾉'), ('𑁇', '𑁍'), ('𑂻', '𑂼'), ('𑂾', '𑃁'), ('𑅀', '𑅃'), ('𑅴', '𑅵'), ('𑇅', '𑇈'), ('𑇍', '𑇍'), ('𑇛', '𑇛'), ('𑇝', '𑇟'), ('𑈸', '𑈽'), ('𑊩', '𑊩'), ('𑑋', '𑑏'), ('𑑚', '𑑛'), ('𑑝', '𑑝'), ('𑓆', '𑓆'), ('𑗁', '𑗗'), ('𑙁', '𑙃'), ('𑙠', '𑙬'), ('𑚹', '𑚹'), ('𑜼', '𑜾'), ('𑠻', '𑠻'), ('𑥄', '𑥆'), ('𑧢', '𑧢'), ('𑨿', '𑩆'), ('𑪚', '𑪜'), ('𑪞', '𑪢'), ('𑬀', '𑬉'), ('𑱁', '𑱅'), ('𑱰', '𑱱'), ('𑻷', '𑻸'), ('𑽃', '𑽏'), ('𑿿', '𑿿'), ('𒑰', '𒑴'), ('𒿱', '𒿲'), ('𖩮', '𖩯'), ('𖫵', '𖫵'), ('𖬷', '𖬻'), ('𖭄', '𖭄'), ('𖺗', '𖺚'), ('𖿢', '𖿢'), ('𛲟', '𛲟'), ('𝪇', '𝪋'), ('𞥞', '𞥟'), ]; pub const OTHER_SYMBOL: &'static [(char, char)] = &[ ('¦', '¦'), ('©', '©'), ('®', '®'), ('°', '°'), ('҂', '҂'), ('֍', '֎'), ('؎', '؏'), ('۞', '۞'), ('۩', '۩'), ('۽', '۾'), ('߶', '߶'), ('৺', '৺'), ('୰', '୰'), ('௳', '௸'), ('௺', '௺'), ('౿', '౿'), ('൏', '൏'), ('൹', '൹'), ('༁', '༃'), ('༓', '༓'), ('༕', '༗'), ('༚', '༟'), ('༴', '༴'), ('༶', '༶'), ('༸', '༸'), ('྾', '࿅'), ('࿇', '࿌'), ('࿎', '࿏'), ('࿕', '࿘'), ('႞', '႟'), ('᎐', '᎙'), ('᙭', '᙭'), ('᥀', '᥀'), ('᧞', '᧿'), ('᭡', '᭪'), ('᭴', '᭼'), ('℀', '℁'), ('℃', '℆'), ('℈', '℉'), ('℔', '℔'), ('№', '℗'), ('℞', '℣'), ('℥', '℥'), ('℧', '℧'), ('℩', '℩'), ('℮', '℮'), ('℺', '℻'), ('⅊', '⅊'), ('⅌', '⅍'), ('⅏', '⅏'), ('↊', '↋'), ('↕', '↙'), ('↜', '↟'), ('↡', '↢'), ('↤', '↥'), ('↧', '↭'), ('↯', '⇍'), ('⇐', '⇑'), ('⇓', '⇓'), ('⇕', '⇳'), ('⌀', '⌇'), ('⌌', '⌟'), ('⌢', '⌨'), ('⌫', '⍻'), ('⍽', '⎚'), ('⎴', '⏛'), ('⏢', 
'␦'), ('⑀', '⑊'), ('⒜', 'ⓩ'), ('─', '▶'), ('▸', '◀'), ('◂', '◷'), ('☀', '♮'), ('♰', '❧'), ('➔', '➿'), ('⠀', '⣿'), ('⬀', '⬯'), ('⭅', '⭆'), ('⭍', '⭳'), ('⭶', '⮕'), ('⮗', '⯿'), ('⳥', '⳪'), ('⹐', '⹑'), ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), ('〄', '〄'), ('〒', '〓'), ('〠', '〠'), ('〶', '〷'), ('〾', '〿'), ('㆐', '㆑'), ('㆖', '㆟'), ('㇀', '㇣'), ('㈀', '㈞'), ('㈪', '㉇'), ('㉐', '㉐'), ('㉠', '㉿'), ('㊊', '㊰'), ('㋀', '㏿'), ('䷀', '䷿'), ('꒐', '꓆'), ('꠨', '꠫'), ('꠶', '꠷'), ('꠹', '꠹'), ('꩷', '꩹'), ('﵀', '﵏'), ('﷏', '﷏'), ('﷽', '﷿'), ('¦', '¦'), ('│', '│'), ('■', '○'), ('', '�'), ('𐄷', '𐄿'), ('𐅹', '𐆉'), ('𐆌', '𐆎'), ('𐆐', '𐆜'), ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐡷', '𐡸'), ('𐫈', '𐫈'), ('𑜿', '𑜿'), ('𑿕', '𑿜'), ('𑿡', '𑿱'), ('𖬼', '𖬿'), ('𖭅', '𖭅'), ('𛲜', '𛲜'), ('𜽐', '𜿃'), ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), ('𝅪', '𝅬'), ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇪'), ('𝈀', '𝉁'), ('𝉅', '𝉅'), ('𝌀', '𝍖'), ('𝠀', '𝧿'), ('𝨷', '𝨺'), ('𝩭', '𝩴'), ('𝩶', '𝪃'), ('𝪅', '𝪆'), ('𞅏', '𞅏'), ('𞲬', '𞲬'), ('𞴮', '𞴮'), ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), ('🃑', '🃵'), ('🄍', '🆭'), ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), ('🉠', '🉥'), ('🌀', '🏺'), ('🐀', '🛗'), ('🛜', '🛬'), ('🛰', '🛼'), ('🜀', '🝶'), ('🝻', '🟙'), ('🟠', '🟫'), ('🟰', '🟰'), ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🢰', '🢱'), ('🤀', '🩓'), ('🩠', '🩭'), ('🩰', '🩼'), ('🪀', '🪈'), ('🪐', '🪽'), ('🪿', '🫅'), ('🫎', '🫛'), ('🫠', '🫨'), ('🫰', '🫸'), ('🬀', '🮒'), ('🮔', '🯊'), ]; pub const PARAGRAPH_SEPARATOR: &'static [(char, char)] = &[('\u{2029}', '\u{2029}')]; pub const PRIVATE_USE: &'static [(char, char)] = &[ ('\u{e000}', '\u{f8ff}'), ('\u{f0000}', '\u{ffffd}'), ('\u{100000}', '\u{10fffd}'), ]; pub const PUNCTUATION: &'static [(char, char)] = &[ ('!', '#'), ('%', '*'), (',', '/'), (':', ';'), ('?', '@'), ('[', ']'), ('_', '_'), ('{', '{'), ('}', '}'), ('¡', '¡'), ('§', '§'), ('«', '«'), ('¶', '·'), ('»', '»'), ('¿', '¿'), (';', ';'), ('·', '·'), ('՚', '՟'), ('։', '֊'), ('־', '־'), ('׀', '׀'), ('׃', '׃'), ('׆', '׆'), ('׳', '״'), ('؉', '؊'), ('،', 
'؍'), ('؛', '؛'), ('؝', '؟'), ('٪', '٭'), ('۔', '۔'), ('܀', '܍'), ('߷', '߹'), ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), ('॰', '॰'), ('৽', '৽'), ('੶', '੶'), ('૰', '૰'), ('౷', '౷'), ('಄', '಄'), ('෴', '෴'), ('๏', '๏'), ('๚', '๛'), ('༄', '༒'), ('༔', '༔'), ('༺', '༽'), ('྅', '྅'), ('࿐', '࿔'), ('࿙', '࿚'), ('၊', '၏'), ('჻', '჻'), ('፠', '፨'), ('᐀', '᐀'), ('᙮', '᙮'), ('᚛', '᚜'), ('᛫', '᛭'), ('᜵', '᜶'), ('។', '៖'), ('៘', '៚'), ('᠀', '᠊'), ('᥄', '᥅'), ('᨞', '᨟'), ('᪠', '᪦'), ('᪨', '᪭'), ('᭚', '᭠'), ('᭽', '᭾'), ('᯼', '᯿'), ('᰻', '᰿'), ('᱾', '᱿'), ('᳀', '᳇'), ('᳓', '᳓'), ('‐', '‧'), ('‰', '⁃'), ('⁅', '⁑'), ('⁓', '⁞'), ('⁽', '⁾'), ('₍', '₎'), ('⌈', '⌋'), ('〈', '〉'), ('❨', '❵'), ('⟅', '⟆'), ('⟦', '⟯'), ('⦃', '⦘'), ('⧘', '⧛'), ('⧼', '⧽'), ('⳹', '⳼'), ('⳾', '⳿'), ('⵰', '⵰'), ('⸀', '⸮'), ('⸰', '⹏'), ('⹒', '⹝'), ('、', '〃'), ('〈', '】'), ('〔', '〟'), ('〰', '〰'), ('〽', '〽'), ('゠', '゠'), ('・', '・'), ('꓾', '꓿'), ('꘍', '꘏'), ('꙳', '꙳'), ('꙾', '꙾'), ('꛲', '꛷'), ('꡴', '꡷'), ('꣎', '꣏'), ('꣸', '꣺'), ('꣼', '꣼'), ('꤮', '꤯'), ('꥟', '꥟'), ('꧁', '꧍'), ('꧞', '꧟'), ('꩜', '꩟'), ('꫞', '꫟'), ('꫰', '꫱'), ('꯫', '꯫'), ('﴾', '﴿'), ('︐', '︙'), ('︰', '﹒'), ('﹔', '﹡'), ('﹣', '﹣'), ('﹨', '﹨'), ('﹪', '﹫'), ('!', '#'), ('%', '*'), (',', '/'), (':', ';'), ('?', '@'), ('[', ']'), ('_', '_'), ('{', '{'), ('}', '}'), ('⦅', '・'), ('𐄀', '𐄂'), ('𐎟', '𐎟'), ('𐏐', '𐏐'), ('𐕯', '𐕯'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), ('𐤿', '𐤿'), ('𐩐', '𐩘'), ('𐩿', '𐩿'), ('𐫰', '𐫶'), ('𐬹', '𐬿'), ('𐮙', '𐮜'), ('𐺭', '𐺭'), ('𐽕', '𐽙'), ('𐾆', '𐾉'), ('𑁇', '𑁍'), ('𑂻', '𑂼'), ('𑂾', '𑃁'), ('𑅀', '𑅃'), ('𑅴', '𑅵'), ('𑇅', '𑇈'), ('𑇍', '𑇍'), ('𑇛', '𑇛'), ('𑇝', '𑇟'), ('𑈸', '𑈽'), ('𑊩', '𑊩'), ('𑑋', '𑑏'), ('𑑚', '𑑛'), ('𑑝', '𑑝'), ('𑓆', '𑓆'), ('𑗁', '𑗗'), ('𑙁', '𑙃'), ('𑙠', '𑙬'), ('𑚹', '𑚹'), ('𑜼', '𑜾'), ('𑠻', '𑠻'), ('𑥄', '𑥆'), ('𑧢', '𑧢'), ('𑨿', '𑩆'), ('𑪚', '𑪜'), ('𑪞', '𑪢'), ('𑬀', '𑬉'), ('𑱁', '𑱅'), ('𑱰', '𑱱'), ('𑻷', '𑻸'), ('𑽃', '𑽏'), ('𑿿', '𑿿'), ('𒑰', '𒑴'), ('𒿱', '𒿲'), ('𖩮', '𖩯'), ('𖫵', '𖫵'), ('𖬷', '𖬻'), ('𖭄', '𖭄'), ('𖺗', '𖺚'), ('𖿢', '𖿢'), ('𛲟', '𛲟'), ('𝪇', '𝪋'), ('𞥞', '𞥟'), ]; pub const 
SEPARATOR: &'static [(char, char)] = &[ (' ', ' '), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), ]; pub const SPACE_SEPARATOR: &'static [(char, char)] = &[ (' ', ' '), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{202f}', '\u{202f}'), ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), ]; pub const SPACING_MARK: &'static [(char, char)] = &[ ('ः', 'ः'), ('ऻ', 'ऻ'), ('ा', 'ी'), ('ॉ', 'ौ'), ('ॎ', 'ॏ'), ('ং', 'ঃ'), ('\u{9be}', 'ী'), ('ে', 'ৈ'), ('ো', 'ৌ'), ('\u{9d7}', '\u{9d7}'), ('ਃ', 'ਃ'), ('ਾ', 'ੀ'), ('ઃ', 'ઃ'), ('ા', 'ી'), ('ૉ', 'ૉ'), ('ો', 'ૌ'), ('ଂ', 'ଃ'), ('\u{b3e}', '\u{b3e}'), ('ୀ', 'ୀ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), ('\u{b57}', '\u{b57}'), ('\u{bbe}', 'ி'), ('ு', 'ூ'), ('ெ', 'ை'), ('ொ', 'ௌ'), ('\u{bd7}', '\u{bd7}'), ('ఁ', 'ః'), ('ు', 'ౄ'), ('ಂ', 'ಃ'), ('ಾ', 'ಾ'), ('ೀ', 'ೄ'), ('ೇ', 'ೈ'), ('ೊ', 'ೋ'), ('\u{cd5}', '\u{cd6}'), ('ೳ', 'ೳ'), ('ം', 'ഃ'), ('\u{d3e}', 'ീ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('\u{d57}', '\u{d57}'), ('ං', 'ඃ'), ('\u{dcf}', 'ෑ'), ('ෘ', '\u{ddf}'), ('ෲ', 'ෳ'), ('༾', '༿'), ('ཿ', 'ཿ'), ('ါ', 'ာ'), ('ေ', 'ေ'), ('း', 'း'), ('ျ', 'ြ'), ('ၖ', 'ၗ'), ('ၢ', 'ၤ'), ('ၧ', 'ၭ'), ('ႃ', 'ႄ'), ('ႇ', 'ႌ'), ('ႏ', 'ႏ'), ('ႚ', 'ႜ'), ('᜕', '᜕'), ('᜴', '᜴'), ('ា', 'ា'), ('ើ', 'ៅ'), ('ះ', 'ៈ'), ('ᤣ', 'ᤦ'), ('ᤩ', 'ᤫ'), ('ᤰ', 'ᤱ'), ('ᤳ', 'ᤸ'), ('ᨙ', 'ᨚ'), ('ᩕ', 'ᩕ'), ('ᩗ', 'ᩗ'), ('ᩡ', 'ᩡ'), ('ᩣ', 'ᩤ'), ('ᩭ', 'ᩲ'), ('ᬄ', 'ᬄ'), ('\u{1b35}', '\u{1b35}'), ('ᬻ', 'ᬻ'), ('ᬽ', 'ᭁ'), ('ᭃ', '᭄'), ('ᮂ', 'ᮂ'), ('ᮡ', 'ᮡ'), ('ᮦ', 'ᮧ'), ('᮪', '᮪'), ('ᯧ', 'ᯧ'), ('ᯪ', 'ᯬ'), ('ᯮ', 'ᯮ'), ('᯲', '᯳'), ('ᰤ', 'ᰫ'), ('ᰴ', 'ᰵ'), ('᳡', '᳡'), ('᳷', '᳷'), ('\u{302e}', '\u{302f}'), ('ꠣ', 'ꠤ'), ('ꠧ', 'ꠧ'), ('ꢀ', 'ꢁ'), ('ꢴ', 'ꣃ'), ('ꥒ', '꥓'), ('ꦃ', 'ꦃ'), ('ꦴ', 'ꦵ'), ('ꦺ', 'ꦻ'), ('ꦾ', '꧀'), ('ꨯ', 'ꨰ'), ('ꨳ', 'ꨴ'), ('ꩍ', 'ꩍ'), ('ꩻ', 'ꩻ'), ('ꩽ', 'ꩽ'), ('ꫫ', 'ꫫ'), ('ꫮ', 'ꫯ'), ('ꫵ', 'ꫵ'), ('ꯣ', 'ꯤ'), ('ꯦ', 'ꯧ'), ('ꯩ', 'ꯪ'), ('꯬', '꯬'), ('𑀀', '𑀀'), 
('𑀂', '𑀂'), ('𑂂', '𑂂'), ('𑂰', '𑂲'), ('𑂷', '𑂸'), ('𑄬', '𑄬'), ('𑅅', '𑅆'), ('𑆂', '𑆂'), ('𑆳', '𑆵'), ('𑆿', '𑇀'), ('𑇎', '𑇎'), ('𑈬', '𑈮'), ('𑈲', '𑈳'), ('𑈵', '𑈵'), ('𑋠', '𑋢'), ('𑌂', '𑌃'), ('\u{1133e}', '𑌿'), ('𑍁', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('\u{11357}', '\u{11357}'), ('𑍢', '𑍣'), ('𑐵', '𑐷'), ('𑑀', '𑑁'), ('𑑅', '𑑅'), ('\u{114b0}', '𑒲'), ('𑒹', '𑒹'), ('𑒻', '𑒾'), ('𑓁', '𑓁'), ('\u{115af}', '𑖱'), ('𑖸', '𑖻'), ('𑖾', '𑖾'), ('𑘰', '𑘲'), ('𑘻', '𑘼'), ('𑘾', '𑘾'), ('𑚬', '𑚬'), ('𑚮', '𑚯'), ('𑚶', '𑚶'), ('𑜠', '𑜡'), ('𑜦', '𑜦'), ('𑠬', '𑠮'), ('𑠸', '𑠸'), ('\u{11930}', '𑤵'), ('𑤷', '𑤸'), ('𑤽', '𑤽'), ('𑥀', '𑥀'), ('𑥂', '𑥂'), ('𑧑', '𑧓'), ('𑧜', '𑧟'), ('𑧤', '𑧤'), ('𑨹', '𑨹'), ('𑩗', '𑩘'), ('𑪗', '𑪗'), ('𑰯', '𑰯'), ('𑰾', '𑰾'), ('𑲩', '𑲩'), ('𑲱', '𑲱'), ('𑲴', '𑲴'), ('𑶊', '𑶎'), ('𑶓', '𑶔'), ('𑶖', '𑶖'), ('𑻵', '𑻶'), ('𑼃', '𑼃'), ('𑼴', '𑼵'), ('𑼾', '𑼿'), ('𑽁', '𑽁'), ('𖽑', '𖾇'), ('𖿰', '𖿱'), ('\u{1d165}', '𝅦'), ('𝅭', '\u{1d172}'), ]; pub const SYMBOL: &'static [(char, char)] = &[ ('$', '$'), ('+', '+'), ('<', '>'), ('^', '^'), ('`', '`'), ('|', '|'), ('~', '~'), ('¢', '¦'), ('¨', '©'), ('¬', '¬'), ('®', '±'), ('´', '´'), ('¸', '¸'), ('×', '×'), ('÷', '÷'), ('˂', '˅'), ('˒', '˟'), ('˥', '˫'), ('˭', '˭'), ('˯', '˿'), ('͵', '͵'), ('΄', '΅'), ('϶', '϶'), ('҂', '҂'), ('֍', '֏'), ('؆', '؈'), ('؋', '؋'), ('؎', '؏'), ('۞', '۞'), ('۩', '۩'), ('۽', '۾'), ('߶', '߶'), ('߾', '߿'), ('࢈', '࢈'), ('৲', '৳'), ('৺', '৻'), ('૱', '૱'), ('୰', '୰'), ('௳', '௺'), ('౿', '౿'), ('൏', '൏'), ('൹', '൹'), ('฿', '฿'), ('༁', '༃'), ('༓', '༓'), ('༕', '༗'), ('༚', '༟'), ('༴', '༴'), ('༶', '༶'), ('༸', '༸'), ('྾', '࿅'), ('࿇', '࿌'), ('࿎', '࿏'), ('࿕', '࿘'), ('႞', '႟'), ('᎐', '᎙'), ('᙭', '᙭'), ('៛', '៛'), ('᥀', '᥀'), ('᧞', '᧿'), ('᭡', '᭪'), ('᭴', '᭼'), ('᾽', '᾽'), ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), ('῭', '`'), ('´', '῾'), ('⁄', '⁄'), ('⁒', '⁒'), ('⁺', '⁼'), ('₊', '₌'), ('₠', '⃀'), ('℀', '℁'), ('℃', '℆'), ('℈', '℉'), ('℔', '℔'), ('№', '℘'), ('℞', '℣'), ('℥', '℥'), ('℧', '℧'), ('℩', '℩'), ('℮', '℮'), ('℺', '℻'), ('⅀', '⅄'), ('⅊', '⅍'), ('⅏', '⅏'), ('↊', '↋'), 
('←', '⌇'), ('⌌', '⌨'), ('⌫', '␦'), ('⑀', '⑊'), ('⒜', 'ⓩ'), ('─', '❧'), ('➔', '⟄'), ('⟇', '⟥'), ('⟰', '⦂'), ('⦙', '⧗'), ('⧜', '⧻'), ('⧾', '⭳'), ('⭶', '⮕'), ('⮗', '⯿'), ('⳥', '⳪'), ('⹐', '⹑'), ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), ('〄', '〄'), ('〒', '〓'), ('〠', '〠'), ('〶', '〷'), ('〾', '〿'), ('゛', '゜'), ('㆐', '㆑'), ('㆖', '㆟'), ('㇀', '㇣'), ('㈀', '㈞'), ('㈪', '㉇'), ('㉐', '㉐'), ('㉠', '㉿'), ('㊊', '㊰'), ('㋀', '㏿'), ('䷀', '䷿'), ('꒐', '꓆'), ('꜀', '꜖'), ('꜠', '꜡'), ('꞉', '꞊'), ('꠨', '꠫'), ('꠶', '꠹'), ('꩷', '꩹'), ('꭛', '꭛'), ('꭪', '꭫'), ('﬩', '﬩'), ('﮲', '﯂'), ('﵀', '﵏'), ('﷏', '﷏'), ('﷼', '﷿'), ('﹢', '﹢'), ('﹤', '﹦'), ('﹩', '﹩'), ('$', '$'), ('+', '+'), ('<', '>'), ('^', '^'), ('`', '`'), ('|', '|'), ('~', '~'), ('¢', '₩'), ('│', '○'), ('', '�'), ('𐄷', '𐄿'), ('𐅹', '𐆉'), ('𐆌', '𐆎'), ('𐆐', '𐆜'), ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐡷', '𐡸'), ('𐫈', '𐫈'), ('𑜿', '𑜿'), ('𑿕', '𑿱'), ('𖬼', '𖬿'), ('𖭅', '𖭅'), ('𛲜', '𛲜'), ('𜽐', '𜿃'), ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), ('𝅪', '𝅬'), ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇪'), ('𝈀', '𝉁'), ('𝉅', '𝉅'), ('𝌀', '𝍖'), ('𝛁', '𝛁'), ('𝛛', '𝛛'), ('𝛻', '𝛻'), ('𝜕', '𝜕'), ('𝜵', '𝜵'), ('𝝏', '𝝏'), ('𝝯', '𝝯'), ('𝞉', '𝞉'), ('𝞩', '𝞩'), ('𝟃', '𝟃'), ('𝠀', '𝧿'), ('𝨷', '𝨺'), ('𝩭', '𝩴'), ('𝩶', '𝪃'), ('𝪅', '𝪆'), ('𞅏', '𞅏'), ('𞋿', '𞋿'), ('𞲬', '𞲬'), ('𞲰', '𞲰'), ('𞴮', '𞴮'), ('𞻰', '𞻱'), ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), ('🃑', '🃵'), ('🄍', '🆭'), ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), ('🉠', '🉥'), ('🌀', '🛗'), ('🛜', '🛬'), ('🛰', '🛼'), ('🜀', '🝶'), ('🝻', '🟙'), ('🟠', '🟫'), ('🟰', '🟰'), ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🢰', '🢱'), ('🤀', '🩓'), ('🩠', '🩭'), ('🩰', '🩼'), ('🪀', '🪈'), ('🪐', '🪽'), ('🪿', '🫅'), ('🫎', '🫛'), ('🫠', '🫨'), ('🫰', '🫸'), ('🬀', '🮒'), ('🮔', '🯊'), ]; pub const TITLECASE_LETTER: &'static [(char, char)] = &[ ('Dž', 'Dž'), ('Lj', 'Lj'), ('Nj', 'Nj'), ('Dz', 'Dz'), ('ᾈ', 'ᾏ'), ('ᾘ', 'ᾟ'), ('ᾨ', 'ᾯ'), ('ᾼ', 'ᾼ'), ('ῌ', 'ῌ'), ('ῼ', 'ῼ'), ]; pub const UNASSIGNED: &'static [(char, char)] = &[ ('\u{378}', '\u{379}'), 
('\u{380}', '\u{383}'), ('\u{38b}', '\u{38b}'), ('\u{38d}', '\u{38d}'), ('\u{3a2}', '\u{3a2}'), ('\u{530}', '\u{530}'), ('\u{557}', '\u{558}'), ('\u{58b}', '\u{58c}'), ('\u{590}', '\u{590}'), ('\u{5c8}', '\u{5cf}'), ('\u{5eb}', '\u{5ee}'), ('\u{5f5}', '\u{5ff}'), ('\u{70e}', '\u{70e}'), ('\u{74b}', '\u{74c}'), ('\u{7b2}', '\u{7bf}'), ('\u{7fb}', '\u{7fc}'), ('\u{82e}', '\u{82f}'), ('\u{83f}', '\u{83f}'), ('\u{85c}', '\u{85d}'), ('\u{85f}', '\u{85f}'), ('\u{86b}', '\u{86f}'), ('\u{88f}', '\u{88f}'), ('\u{892}', '\u{897}'), ('\u{984}', '\u{984}'), ('\u{98d}', '\u{98e}'), ('\u{991}', '\u{992}'), ('\u{9a9}', '\u{9a9}'), ('\u{9b1}', '\u{9b1}'), ('\u{9b3}', '\u{9b5}'), ('\u{9ba}', '\u{9bb}'), ('\u{9c5}', '\u{9c6}'), ('\u{9c9}', '\u{9ca}'), ('\u{9cf}', '\u{9d6}'), ('\u{9d8}', '\u{9db}'), ('\u{9de}', '\u{9de}'), ('\u{9e4}', '\u{9e5}'), ('\u{9ff}', '\u{a00}'), ('\u{a04}', '\u{a04}'), ('\u{a0b}', '\u{a0e}'), ('\u{a11}', '\u{a12}'), ('\u{a29}', '\u{a29}'), ('\u{a31}', '\u{a31}'), ('\u{a34}', '\u{a34}'), ('\u{a37}', '\u{a37}'), ('\u{a3a}', '\u{a3b}'), ('\u{a3d}', '\u{a3d}'), ('\u{a43}', '\u{a46}'), ('\u{a49}', '\u{a4a}'), ('\u{a4e}', '\u{a50}'), ('\u{a52}', '\u{a58}'), ('\u{a5d}', '\u{a5d}'), ('\u{a5f}', '\u{a65}'), ('\u{a77}', '\u{a80}'), ('\u{a84}', '\u{a84}'), ('\u{a8e}', '\u{a8e}'), ('\u{a92}', '\u{a92}'), ('\u{aa9}', '\u{aa9}'), ('\u{ab1}', '\u{ab1}'), ('\u{ab4}', '\u{ab4}'), ('\u{aba}', '\u{abb}'), ('\u{ac6}', '\u{ac6}'), ('\u{aca}', '\u{aca}'), ('\u{ace}', '\u{acf}'), ('\u{ad1}', '\u{adf}'), ('\u{ae4}', '\u{ae5}'), ('\u{af2}', '\u{af8}'), ('\u{b00}', '\u{b00}'), ('\u{b04}', '\u{b04}'), ('\u{b0d}', '\u{b0e}'), ('\u{b11}', '\u{b12}'), ('\u{b29}', '\u{b29}'), ('\u{b31}', '\u{b31}'), ('\u{b34}', '\u{b34}'), ('\u{b3a}', '\u{b3b}'), ('\u{b45}', '\u{b46}'), ('\u{b49}', '\u{b4a}'), ('\u{b4e}', '\u{b54}'), ('\u{b58}', '\u{b5b}'), ('\u{b5e}', '\u{b5e}'), ('\u{b64}', '\u{b65}'), ('\u{b78}', '\u{b81}'), ('\u{b84}', '\u{b84}'), ('\u{b8b}', '\u{b8d}'), ('\u{b91}', '\u{b91}'), 
('\u{b96}', '\u{b98}'), ('\u{b9b}', '\u{b9b}'), ('\u{b9d}', '\u{b9d}'), ('\u{ba0}', '\u{ba2}'), ('\u{ba5}', '\u{ba7}'), ('\u{bab}', '\u{bad}'), ('\u{bba}', '\u{bbd}'), ('\u{bc3}', '\u{bc5}'), ('\u{bc9}', '\u{bc9}'), ('\u{bce}', '\u{bcf}'), ('\u{bd1}', '\u{bd6}'), ('\u{bd8}', '\u{be5}'), ('\u{bfb}', '\u{bff}'), ('\u{c0d}', '\u{c0d}'), ('\u{c11}', '\u{c11}'), ('\u{c29}', '\u{c29}'), ('\u{c3a}', '\u{c3b}'), ('\u{c45}', '\u{c45}'), ('\u{c49}', '\u{c49}'), ('\u{c4e}', '\u{c54}'), ('\u{c57}', '\u{c57}'), ('\u{c5b}', '\u{c5c}'), ('\u{c5e}', '\u{c5f}'), ('\u{c64}', '\u{c65}'), ('\u{c70}', '\u{c76}'), ('\u{c8d}', '\u{c8d}'), ('\u{c91}', '\u{c91}'), ('\u{ca9}', '\u{ca9}'), ('\u{cb4}', '\u{cb4}'), ('\u{cba}', '\u{cbb}'), ('\u{cc5}', '\u{cc5}'), ('\u{cc9}', '\u{cc9}'), ('\u{cce}', '\u{cd4}'), ('\u{cd7}', '\u{cdc}'), ('\u{cdf}', '\u{cdf}'), ('\u{ce4}', '\u{ce5}'), ('\u{cf0}', '\u{cf0}'), ('\u{cf4}', '\u{cff}'), ('\u{d0d}', '\u{d0d}'), ('\u{d11}', '\u{d11}'), ('\u{d45}', '\u{d45}'), ('\u{d49}', '\u{d49}'), ('\u{d50}', '\u{d53}'), ('\u{d64}', '\u{d65}'), ('\u{d80}', '\u{d80}'), ('\u{d84}', '\u{d84}'), ('\u{d97}', '\u{d99}'), ('\u{db2}', '\u{db2}'), ('\u{dbc}', '\u{dbc}'), ('\u{dbe}', '\u{dbf}'), ('\u{dc7}', '\u{dc9}'), ('\u{dcb}', '\u{dce}'), ('\u{dd5}', '\u{dd5}'), ('\u{dd7}', '\u{dd7}'), ('\u{de0}', '\u{de5}'), ('\u{df0}', '\u{df1}'), ('\u{df5}', '\u{e00}'), ('\u{e3b}', '\u{e3e}'), ('\u{e5c}', '\u{e80}'), ('\u{e83}', '\u{e83}'), ('\u{e85}', '\u{e85}'), ('\u{e8b}', '\u{e8b}'), ('\u{ea4}', '\u{ea4}'), ('\u{ea6}', '\u{ea6}'), ('\u{ebe}', '\u{ebf}'), ('\u{ec5}', '\u{ec5}'), ('\u{ec7}', '\u{ec7}'), ('\u{ecf}', '\u{ecf}'), ('\u{eda}', '\u{edb}'), ('\u{ee0}', '\u{eff}'), ('\u{f48}', '\u{f48}'), ('\u{f6d}', '\u{f70}'), ('\u{f98}', '\u{f98}'), ('\u{fbd}', '\u{fbd}'), ('\u{fcd}', '\u{fcd}'), ('\u{fdb}', '\u{fff}'), ('\u{10c6}', '\u{10c6}'), ('\u{10c8}', '\u{10cc}'), ('\u{10ce}', '\u{10cf}'), ('\u{1249}', '\u{1249}'), ('\u{124e}', '\u{124f}'), ('\u{1257}', '\u{1257}'), ('\u{1259}', 
'\u{1259}'), ('\u{125e}', '\u{125f}'), ('\u{1289}', '\u{1289}'), ('\u{128e}', '\u{128f}'), ('\u{12b1}', '\u{12b1}'), ('\u{12b6}', '\u{12b7}'), ('\u{12bf}', '\u{12bf}'), ('\u{12c1}', '\u{12c1}'), ('\u{12c6}', '\u{12c7}'), ('\u{12d7}', '\u{12d7}'), ('\u{1311}', '\u{1311}'), ('\u{1316}', '\u{1317}'), ('\u{135b}', '\u{135c}'), ('\u{137d}', '\u{137f}'), ('\u{139a}', '\u{139f}'), ('\u{13f6}', '\u{13f7}'), ('\u{13fe}', '\u{13ff}'), ('\u{169d}', '\u{169f}'), ('\u{16f9}', '\u{16ff}'), ('\u{1716}', '\u{171e}'), ('\u{1737}', '\u{173f}'), ('\u{1754}', '\u{175f}'), ('\u{176d}', '\u{176d}'), ('\u{1771}', '\u{1771}'), ('\u{1774}', '\u{177f}'), ('\u{17de}', '\u{17df}'), ('\u{17ea}', '\u{17ef}'), ('\u{17fa}', '\u{17ff}'), ('\u{181a}', '\u{181f}'), ('\u{1879}', '\u{187f}'), ('\u{18ab}', '\u{18af}'), ('\u{18f6}', '\u{18ff}'), ('\u{191f}', '\u{191f}'), ('\u{192c}', '\u{192f}'), ('\u{193c}', '\u{193f}'), ('\u{1941}', '\u{1943}'), ('\u{196e}', '\u{196f}'), ('\u{1975}', '\u{197f}'), ('\u{19ac}', '\u{19af}'), ('\u{19ca}', '\u{19cf}'), ('\u{19db}', '\u{19dd}'), ('\u{1a1c}', '\u{1a1d}'), ('\u{1a5f}', '\u{1a5f}'), ('\u{1a7d}', '\u{1a7e}'), ('\u{1a8a}', '\u{1a8f}'), ('\u{1a9a}', '\u{1a9f}'), ('\u{1aae}', '\u{1aaf}'), ('\u{1acf}', '\u{1aff}'), ('\u{1b4d}', '\u{1b4f}'), ('\u{1b7f}', '\u{1b7f}'), ('\u{1bf4}', '\u{1bfb}'), ('\u{1c38}', '\u{1c3a}'), ('\u{1c4a}', '\u{1c4c}'), ('\u{1c89}', '\u{1c8f}'), ('\u{1cbb}', '\u{1cbc}'), ('\u{1cc8}', '\u{1ccf}'), ('\u{1cfb}', '\u{1cff}'), ('\u{1f16}', '\u{1f17}'), ('\u{1f1e}', '\u{1f1f}'), ('\u{1f46}', '\u{1f47}'), ('\u{1f4e}', '\u{1f4f}'), ('\u{1f58}', '\u{1f58}'), ('\u{1f5a}', '\u{1f5a}'), ('\u{1f5c}', '\u{1f5c}'), ('\u{1f5e}', '\u{1f5e}'), ('\u{1f7e}', '\u{1f7f}'), ('\u{1fb5}', '\u{1fb5}'), ('\u{1fc5}', '\u{1fc5}'), ('\u{1fd4}', '\u{1fd5}'), ('\u{1fdc}', '\u{1fdc}'), ('\u{1ff0}', '\u{1ff1}'), ('\u{1ff5}', '\u{1ff5}'), ('\u{1fff}', '\u{1fff}'), ('\u{2065}', '\u{2065}'), ('\u{2072}', '\u{2073}'), ('\u{208f}', '\u{208f}'), ('\u{209d}', '\u{209f}'), 
('\u{20c1}', '\u{20cf}'), ('\u{20f1}', '\u{20ff}'), ('\u{218c}', '\u{218f}'), ('\u{2427}', '\u{243f}'), ('\u{244b}', '\u{245f}'), ('\u{2b74}', '\u{2b75}'), ('\u{2b96}', '\u{2b96}'), ('\u{2cf4}', '\u{2cf8}'), ('\u{2d26}', '\u{2d26}'), ('\u{2d28}', '\u{2d2c}'), ('\u{2d2e}', '\u{2d2f}'), ('\u{2d68}', '\u{2d6e}'), ('\u{2d71}', '\u{2d7e}'), ('\u{2d97}', '\u{2d9f}'), ('\u{2da7}', '\u{2da7}'), ('\u{2daf}', '\u{2daf}'), ('\u{2db7}', '\u{2db7}'), ('\u{2dbf}', '\u{2dbf}'), ('\u{2dc7}', '\u{2dc7}'), ('\u{2dcf}', '\u{2dcf}'), ('\u{2dd7}', '\u{2dd7}'), ('\u{2ddf}', '\u{2ddf}'), ('\u{2e5e}', '\u{2e7f}'), ('\u{2e9a}', '\u{2e9a}'), ('\u{2ef4}', '\u{2eff}'), ('\u{2fd6}', '\u{2fef}'), ('\u{2ffc}', '\u{2fff}'), ('\u{3040}', '\u{3040}'), ('\u{3097}', '\u{3098}'), ('\u{3100}', '\u{3104}'), ('\u{3130}', '\u{3130}'), ('\u{318f}', '\u{318f}'), ('\u{31e4}', '\u{31ef}'), ('\u{321f}', '\u{321f}'), ('\u{a48d}', '\u{a48f}'), ('\u{a4c7}', '\u{a4cf}'), ('\u{a62c}', '\u{a63f}'), ('\u{a6f8}', '\u{a6ff}'), ('\u{a7cb}', '\u{a7cf}'), ('\u{a7d2}', '\u{a7d2}'), ('\u{a7d4}', '\u{a7d4}'), ('\u{a7da}', '\u{a7f1}'), ('\u{a82d}', '\u{a82f}'), ('\u{a83a}', '\u{a83f}'), ('\u{a878}', '\u{a87f}'), ('\u{a8c6}', '\u{a8cd}'), ('\u{a8da}', '\u{a8df}'), ('\u{a954}', '\u{a95e}'), ('\u{a97d}', '\u{a97f}'), ('\u{a9ce}', '\u{a9ce}'), ('\u{a9da}', '\u{a9dd}'), ('\u{a9ff}', '\u{a9ff}'), ('\u{aa37}', '\u{aa3f}'), ('\u{aa4e}', '\u{aa4f}'), ('\u{aa5a}', '\u{aa5b}'), ('\u{aac3}', '\u{aada}'), ('\u{aaf7}', '\u{ab00}'), ('\u{ab07}', '\u{ab08}'), ('\u{ab0f}', '\u{ab10}'), ('\u{ab17}', '\u{ab1f}'), ('\u{ab27}', '\u{ab27}'), ('\u{ab2f}', '\u{ab2f}'), ('\u{ab6c}', '\u{ab6f}'), ('\u{abee}', '\u{abef}'), ('\u{abfa}', '\u{abff}'), ('\u{d7a4}', '\u{d7af}'), ('\u{d7c7}', '\u{d7ca}'), ('\u{d7fc}', '\u{d7ff}'), ('\u{fa6e}', '\u{fa6f}'), ('\u{fada}', '\u{faff}'), ('\u{fb07}', '\u{fb12}'), ('\u{fb18}', '\u{fb1c}'), ('\u{fb37}', '\u{fb37}'), ('\u{fb3d}', '\u{fb3d}'), ('\u{fb3f}', '\u{fb3f}'), ('\u{fb42}', '\u{fb42}'), ('\u{fb45}', 
'\u{fb45}'), ('\u{fbc3}', '\u{fbd2}'), ('\u{fd90}', '\u{fd91}'), ('\u{fdc8}', '\u{fdce}'), ('\u{fdd0}', '\u{fdef}'), ('\u{fe1a}', '\u{fe1f}'), ('\u{fe53}', '\u{fe53}'), ('\u{fe67}', '\u{fe67}'), ('\u{fe6c}', '\u{fe6f}'), ('\u{fe75}', '\u{fe75}'), ('\u{fefd}', '\u{fefe}'), ('\u{ff00}', '\u{ff00}'), ('\u{ffbf}', '\u{ffc1}'), ('\u{ffc8}', '\u{ffc9}'), ('\u{ffd0}', '\u{ffd1}'), ('\u{ffd8}', '\u{ffd9}'), ('\u{ffdd}', '\u{ffdf}'), ('\u{ffe7}', '\u{ffe7}'), ('\u{ffef}', '\u{fff8}'), ('\u{fffe}', '\u{ffff}'), ('\u{1000c}', '\u{1000c}'), ('\u{10027}', '\u{10027}'), ('\u{1003b}', '\u{1003b}'), ('\u{1003e}', '\u{1003e}'), ('\u{1004e}', '\u{1004f}'), ('\u{1005e}', '\u{1007f}'), ('\u{100fb}', '\u{100ff}'), ('\u{10103}', '\u{10106}'), ('\u{10134}', '\u{10136}'), ('\u{1018f}', '\u{1018f}'), ('\u{1019d}', '\u{1019f}'), ('\u{101a1}', '\u{101cf}'), ('\u{101fe}', '\u{1027f}'), ('\u{1029d}', '\u{1029f}'), ('\u{102d1}', '\u{102df}'), ('\u{102fc}', '\u{102ff}'), ('\u{10324}', '\u{1032c}'), ('\u{1034b}', '\u{1034f}'), ('\u{1037b}', '\u{1037f}'), ('\u{1039e}', '\u{1039e}'), ('\u{103c4}', '\u{103c7}'), ('\u{103d6}', '\u{103ff}'), ('\u{1049e}', '\u{1049f}'), ('\u{104aa}', '\u{104af}'), ('\u{104d4}', '\u{104d7}'), ('\u{104fc}', '\u{104ff}'), ('\u{10528}', '\u{1052f}'), ('\u{10564}', '\u{1056e}'), ('\u{1057b}', '\u{1057b}'), ('\u{1058b}', '\u{1058b}'), ('\u{10593}', '\u{10593}'), ('\u{10596}', '\u{10596}'), ('\u{105a2}', '\u{105a2}'), ('\u{105b2}', '\u{105b2}'), ('\u{105ba}', '\u{105ba}'), ('\u{105bd}', '\u{105ff}'), ('\u{10737}', '\u{1073f}'), ('\u{10756}', '\u{1075f}'), ('\u{10768}', '\u{1077f}'), ('\u{10786}', '\u{10786}'), ('\u{107b1}', '\u{107b1}'), ('\u{107bb}', '\u{107ff}'), ('\u{10806}', '\u{10807}'), ('\u{10809}', '\u{10809}'), ('\u{10836}', '\u{10836}'), ('\u{10839}', '\u{1083b}'), ('\u{1083d}', '\u{1083e}'), ('\u{10856}', '\u{10856}'), ('\u{1089f}', '\u{108a6}'), ('\u{108b0}', '\u{108df}'), ('\u{108f3}', '\u{108f3}'), ('\u{108f6}', '\u{108fa}'), ('\u{1091c}', '\u{1091e}'), 
('\u{1093a}', '\u{1093e}'), ('\u{10940}', '\u{1097f}'), ('\u{109b8}', '\u{109bb}'), ('\u{109d0}', '\u{109d1}'), ('\u{10a04}', '\u{10a04}'), ('\u{10a07}', '\u{10a0b}'), ('\u{10a14}', '\u{10a14}'), ('\u{10a18}', '\u{10a18}'), ('\u{10a36}', '\u{10a37}'), ('\u{10a3b}', '\u{10a3e}'), ('\u{10a49}', '\u{10a4f}'), ('\u{10a59}', '\u{10a5f}'), ('\u{10aa0}', '\u{10abf}'), ('\u{10ae7}', '\u{10aea}'), ('\u{10af7}', '\u{10aff}'), ('\u{10b36}', '\u{10b38}'), ('\u{10b56}', '\u{10b57}'), ('\u{10b73}', '\u{10b77}'), ('\u{10b92}', '\u{10b98}'), ('\u{10b9d}', '\u{10ba8}'), ('\u{10bb0}', '\u{10bff}'), ('\u{10c49}', '\u{10c7f}'), ('\u{10cb3}', '\u{10cbf}'), ('\u{10cf3}', '\u{10cf9}'), ('\u{10d28}', '\u{10d2f}'), ('\u{10d3a}', '\u{10e5f}'), ('\u{10e7f}', '\u{10e7f}'), ('\u{10eaa}', '\u{10eaa}'), ('\u{10eae}', '\u{10eaf}'), ('\u{10eb2}', '\u{10efc}'), ('\u{10f28}', '\u{10f2f}'), ('\u{10f5a}', '\u{10f6f}'), ('\u{10f8a}', '\u{10faf}'), ('\u{10fcc}', '\u{10fdf}'), ('\u{10ff7}', '\u{10fff}'), ('\u{1104e}', '\u{11051}'), ('\u{11076}', '\u{1107e}'), ('\u{110c3}', '\u{110cc}'), ('\u{110ce}', '\u{110cf}'), ('\u{110e9}', '\u{110ef}'), ('\u{110fa}', '\u{110ff}'), ('\u{11135}', '\u{11135}'), ('\u{11148}', '\u{1114f}'), ('\u{11177}', '\u{1117f}'), ('\u{111e0}', '\u{111e0}'), ('\u{111f5}', '\u{111ff}'), ('\u{11212}', '\u{11212}'), ('\u{11242}', '\u{1127f}'), ('\u{11287}', '\u{11287}'), ('\u{11289}', '\u{11289}'), ('\u{1128e}', '\u{1128e}'), ('\u{1129e}', '\u{1129e}'), ('\u{112aa}', '\u{112af}'), ('\u{112eb}', '\u{112ef}'), ('\u{112fa}', '\u{112ff}'), ('\u{11304}', '\u{11304}'), ('\u{1130d}', '\u{1130e}'), ('\u{11311}', '\u{11312}'), ('\u{11329}', '\u{11329}'), ('\u{11331}', '\u{11331}'), ('\u{11334}', '\u{11334}'), ('\u{1133a}', '\u{1133a}'), ('\u{11345}', '\u{11346}'), ('\u{11349}', '\u{1134a}'), ('\u{1134e}', '\u{1134f}'), ('\u{11351}', '\u{11356}'), ('\u{11358}', '\u{1135c}'), ('\u{11364}', '\u{11365}'), ('\u{1136d}', '\u{1136f}'), ('\u{11375}', '\u{113ff}'), ('\u{1145c}', '\u{1145c}'), 
('\u{11462}', '\u{1147f}'), ('\u{114c8}', '\u{114cf}'), ('\u{114da}', '\u{1157f}'), ('\u{115b6}', '\u{115b7}'), ('\u{115de}', '\u{115ff}'), ('\u{11645}', '\u{1164f}'), ('\u{1165a}', '\u{1165f}'), ('\u{1166d}', '\u{1167f}'), ('\u{116ba}', '\u{116bf}'), ('\u{116ca}', '\u{116ff}'), ('\u{1171b}', '\u{1171c}'), ('\u{1172c}', '\u{1172f}'), ('\u{11747}', '\u{117ff}'), ('\u{1183c}', '\u{1189f}'), ('\u{118f3}', '\u{118fe}'), ('\u{11907}', '\u{11908}'), ('\u{1190a}', '\u{1190b}'), ('\u{11914}', '\u{11914}'), ('\u{11917}', '\u{11917}'), ('\u{11936}', '\u{11936}'), ('\u{11939}', '\u{1193a}'), ('\u{11947}', '\u{1194f}'), ('\u{1195a}', '\u{1199f}'), ('\u{119a8}', '\u{119a9}'), ('\u{119d8}', '\u{119d9}'), ('\u{119e5}', '\u{119ff}'), ('\u{11a48}', '\u{11a4f}'), ('\u{11aa3}', '\u{11aaf}'), ('\u{11af9}', '\u{11aff}'), ('\u{11b0a}', '\u{11bff}'), ('\u{11c09}', '\u{11c09}'), ('\u{11c37}', '\u{11c37}'), ('\u{11c46}', '\u{11c4f}'), ('\u{11c6d}', '\u{11c6f}'), ('\u{11c90}', '\u{11c91}'), ('\u{11ca8}', '\u{11ca8}'), ('\u{11cb7}', '\u{11cff}'), ('\u{11d07}', '\u{11d07}'), ('\u{11d0a}', '\u{11d0a}'), ('\u{11d37}', '\u{11d39}'), ('\u{11d3b}', '\u{11d3b}'), ('\u{11d3e}', '\u{11d3e}'), ('\u{11d48}', '\u{11d4f}'), ('\u{11d5a}', '\u{11d5f}'), ('\u{11d66}', '\u{11d66}'), ('\u{11d69}', '\u{11d69}'), ('\u{11d8f}', '\u{11d8f}'), ('\u{11d92}', '\u{11d92}'), ('\u{11d99}', '\u{11d9f}'), ('\u{11daa}', '\u{11edf}'), ('\u{11ef9}', '\u{11eff}'), ('\u{11f11}', '\u{11f11}'), ('\u{11f3b}', '\u{11f3d}'), ('\u{11f5a}', '\u{11faf}'), ('\u{11fb1}', '\u{11fbf}'), ('\u{11ff2}', '\u{11ffe}'), ('\u{1239a}', '\u{123ff}'), ('\u{1246f}', '\u{1246f}'), ('\u{12475}', '\u{1247f}'), ('\u{12544}', '\u{12f8f}'), ('\u{12ff3}', '\u{12fff}'), ('\u{13456}', '\u{143ff}'), ('\u{14647}', '\u{167ff}'), ('\u{16a39}', '\u{16a3f}'), ('\u{16a5f}', '\u{16a5f}'), ('\u{16a6a}', '\u{16a6d}'), ('\u{16abf}', '\u{16abf}'), ('\u{16aca}', '\u{16acf}'), ('\u{16aee}', '\u{16aef}'), ('\u{16af6}', '\u{16aff}'), ('\u{16b46}', '\u{16b4f}'), 
('\u{16b5a}', '\u{16b5a}'), ('\u{16b62}', '\u{16b62}'), ('\u{16b78}', '\u{16b7c}'), ('\u{16b90}', '\u{16e3f}'), ('\u{16e9b}', '\u{16eff}'), ('\u{16f4b}', '\u{16f4e}'), ('\u{16f88}', '\u{16f8e}'), ('\u{16fa0}', '\u{16fdf}'), ('\u{16fe5}', '\u{16fef}'), ('\u{16ff2}', '\u{16fff}'), ('\u{187f8}', '\u{187ff}'), ('\u{18cd6}', '\u{18cff}'), ('\u{18d09}', '\u{1afef}'), ('\u{1aff4}', '\u{1aff4}'), ('\u{1affc}', '\u{1affc}'), ('\u{1afff}', '\u{1afff}'), ('\u{1b123}', '\u{1b131}'), ('\u{1b133}', '\u{1b14f}'), ('\u{1b153}', '\u{1b154}'), ('\u{1b156}', '\u{1b163}'), ('\u{1b168}', '\u{1b16f}'), ('\u{1b2fc}', '\u{1bbff}'), ('\u{1bc6b}', '\u{1bc6f}'), ('\u{1bc7d}', '\u{1bc7f}'), ('\u{1bc89}', '\u{1bc8f}'), ('\u{1bc9a}', '\u{1bc9b}'), ('\u{1bca4}', '\u{1ceff}'), ('\u{1cf2e}', '\u{1cf2f}'), ('\u{1cf47}', '\u{1cf4f}'), ('\u{1cfc4}', '\u{1cfff}'), ('\u{1d0f6}', '\u{1d0ff}'), ('\u{1d127}', '\u{1d128}'), ('\u{1d1eb}', '\u{1d1ff}'), ('\u{1d246}', '\u{1d2bf}'), ('\u{1d2d4}', '\u{1d2df}'), ('\u{1d2f4}', '\u{1d2ff}'), ('\u{1d357}', '\u{1d35f}'), ('\u{1d379}', '\u{1d3ff}'), ('\u{1d455}', '\u{1d455}'), ('\u{1d49d}', '\u{1d49d}'), ('\u{1d4a0}', '\u{1d4a1}'), ('\u{1d4a3}', '\u{1d4a4}'), ('\u{1d4a7}', '\u{1d4a8}'), ('\u{1d4ad}', '\u{1d4ad}'), ('\u{1d4ba}', '\u{1d4ba}'), ('\u{1d4bc}', '\u{1d4bc}'), ('\u{1d4c4}', '\u{1d4c4}'), ('\u{1d506}', '\u{1d506}'), ('\u{1d50b}', '\u{1d50c}'), ('\u{1d515}', '\u{1d515}'), ('\u{1d51d}', '\u{1d51d}'), ('\u{1d53a}', '\u{1d53a}'), ('\u{1d53f}', '\u{1d53f}'), ('\u{1d545}', '\u{1d545}'), ('\u{1d547}', '\u{1d549}'), ('\u{1d551}', '\u{1d551}'), ('\u{1d6a6}', '\u{1d6a7}'), ('\u{1d7cc}', '\u{1d7cd}'), ('\u{1da8c}', '\u{1da9a}'), ('\u{1daa0}', '\u{1daa0}'), ('\u{1dab0}', '\u{1deff}'), ('\u{1df1f}', '\u{1df24}'), ('\u{1df2b}', '\u{1dfff}'), ('\u{1e007}', '\u{1e007}'), ('\u{1e019}', '\u{1e01a}'), ('\u{1e022}', '\u{1e022}'), ('\u{1e025}', '\u{1e025}'), ('\u{1e02b}', '\u{1e02f}'), ('\u{1e06e}', '\u{1e08e}'), ('\u{1e090}', '\u{1e0ff}'), ('\u{1e12d}', '\u{1e12f}'), 
('\u{1e13e}', '\u{1e13f}'), ('\u{1e14a}', '\u{1e14d}'), ('\u{1e150}', '\u{1e28f}'), ('\u{1e2af}', '\u{1e2bf}'), ('\u{1e2fa}', '\u{1e2fe}'), ('\u{1e300}', '\u{1e4cf}'), ('\u{1e4fa}', '\u{1e7df}'), ('\u{1e7e7}', '\u{1e7e7}'), ('\u{1e7ec}', '\u{1e7ec}'), ('\u{1e7ef}', '\u{1e7ef}'), ('\u{1e7ff}', '\u{1e7ff}'), ('\u{1e8c5}', '\u{1e8c6}'), ('\u{1e8d7}', '\u{1e8ff}'), ('\u{1e94c}', '\u{1e94f}'), ('\u{1e95a}', '\u{1e95d}'), ('\u{1e960}', '\u{1ec70}'), ('\u{1ecb5}', '\u{1ed00}'), ('\u{1ed3e}', '\u{1edff}'), ('\u{1ee04}', '\u{1ee04}'), ('\u{1ee20}', '\u{1ee20}'), ('\u{1ee23}', '\u{1ee23}'), ('\u{1ee25}', '\u{1ee26}'), ('\u{1ee28}', '\u{1ee28}'), ('\u{1ee33}', '\u{1ee33}'), ('\u{1ee38}', '\u{1ee38}'), ('\u{1ee3a}', '\u{1ee3a}'), ('\u{1ee3c}', '\u{1ee41}'), ('\u{1ee43}', '\u{1ee46}'), ('\u{1ee48}', '\u{1ee48}'), ('\u{1ee4a}', '\u{1ee4a}'), ('\u{1ee4c}', '\u{1ee4c}'), ('\u{1ee50}', '\u{1ee50}'), ('\u{1ee53}', '\u{1ee53}'), ('\u{1ee55}', '\u{1ee56}'), ('\u{1ee58}', '\u{1ee58}'), ('\u{1ee5a}', '\u{1ee5a}'), ('\u{1ee5c}', '\u{1ee5c}'), ('\u{1ee5e}', '\u{1ee5e}'), ('\u{1ee60}', '\u{1ee60}'), ('\u{1ee63}', '\u{1ee63}'), ('\u{1ee65}', '\u{1ee66}'), ('\u{1ee6b}', '\u{1ee6b}'), ('\u{1ee73}', '\u{1ee73}'), ('\u{1ee78}', '\u{1ee78}'), ('\u{1ee7d}', '\u{1ee7d}'), ('\u{1ee7f}', '\u{1ee7f}'), ('\u{1ee8a}', '\u{1ee8a}'), ('\u{1ee9c}', '\u{1eea0}'), ('\u{1eea4}', '\u{1eea4}'), ('\u{1eeaa}', '\u{1eeaa}'), ('\u{1eebc}', '\u{1eeef}'), ('\u{1eef2}', '\u{1efff}'), ('\u{1f02c}', '\u{1f02f}'), ('\u{1f094}', '\u{1f09f}'), ('\u{1f0af}', '\u{1f0b0}'), ('\u{1f0c0}', '\u{1f0c0}'), ('\u{1f0d0}', '\u{1f0d0}'), ('\u{1f0f6}', '\u{1f0ff}'), ('\u{1f1ae}', '\u{1f1e5}'), ('\u{1f203}', '\u{1f20f}'), ('\u{1f23c}', '\u{1f23f}'), ('\u{1f249}', '\u{1f24f}'), ('\u{1f252}', '\u{1f25f}'), ('\u{1f266}', '\u{1f2ff}'), ('\u{1f6d8}', '\u{1f6db}'), ('\u{1f6ed}', '\u{1f6ef}'), ('\u{1f6fd}', '\u{1f6ff}'), ('\u{1f777}', '\u{1f77a}'), ('\u{1f7da}', '\u{1f7df}'), ('\u{1f7ec}', '\u{1f7ef}'), ('\u{1f7f1}', '\u{1f7ff}'), 
('\u{1f80c}', '\u{1f80f}'), ('\u{1f848}', '\u{1f84f}'), ('\u{1f85a}', '\u{1f85f}'), ('\u{1f888}', '\u{1f88f}'), ('\u{1f8ae}', '\u{1f8af}'), ('\u{1f8b2}', '\u{1f8ff}'), ('\u{1fa54}', '\u{1fa5f}'), ('\u{1fa6e}', '\u{1fa6f}'), ('\u{1fa7d}', '\u{1fa7f}'), ('\u{1fa89}', '\u{1fa8f}'), ('\u{1fabe}', '\u{1fabe}'), ('\u{1fac6}', '\u{1facd}'), ('\u{1fadc}', '\u{1fadf}'), ('\u{1fae9}', '\u{1faef}'), ('\u{1faf9}', '\u{1faff}'), ('\u{1fb93}', '\u{1fb93}'), ('\u{1fbcb}', '\u{1fbef}'), ('\u{1fbfa}', '\u{1ffff}'), ('\u{2a6e0}', '\u{2a6ff}'), ('\u{2b73a}', '\u{2b73f}'), ('\u{2b81e}', '\u{2b81f}'), ('\u{2cea2}', '\u{2ceaf}'), ('\u{2ebe1}', '\u{2f7ff}'), ('\u{2fa1e}', '\u{2ffff}'), ('\u{3134b}', '\u{3134f}'), ('\u{323b0}', '\u{e0000}'), ('\u{e0002}', '\u{e001f}'), ('\u{e0080}', '\u{e00ff}'), ('\u{e01f0}', '\u{effff}'), ('\u{ffffe}', '\u{fffff}'), ('\u{10fffe}', '\u{10ffff}'), ]; pub const UPPERCASE_LETTER: &'static [(char, char)] = &[ ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'DŽ'), ('LJ', 'LJ'), ('NJ', 'NJ'), ('Ǎ', 'Ǎ'), ('Ǐ', 
'Ǐ'), ('Ǒ', 'Ǒ'), ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'DZ'), ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), ('ϒ', 'ϔ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), 
('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('Ꭰ', 'Ᏽ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('Ᾰ', 'Ά'), ('Ὲ', 'Ή'), ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'Ώ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℋ', 'ℍ'), ('ℐ', 'ℒ'), ('ℕ', 
'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℰ', 'ℳ'), ('ℾ', 'ℿ'), ('ⅅ', 'ⅅ'), ('Ↄ', 'Ↄ'), ('Ⰰ', 'Ⱟ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), 
('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('Ꞹ', 'Ꞹ'), ('Ꞻ', 'Ꞻ'), ('Ꞽ', 'Ꞽ'), ('Ꞿ', 'Ꞿ'), ('Ꟁ', 'Ꟁ'), ('Ꟃ', 'Ꟃ'), ('Ꞔ', 'Ꟈ'), ('Ꟊ', 'Ꟊ'), ('Ꟑ', 'Ꟑ'), ('Ꟗ', 'Ꟗ'), ('Ꟙ', 'Ꟙ'), ('Ꟶ', 'Ꟶ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𖹀', '𖹟'), ('𝐀', '𝐙'), ('𝐴', '𝑍'), ('𝑨', '𝒁'), ('𝒜', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒵'), ('𝓐', '𝓩'), ('𝔄', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔸', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕬', '𝖅'), ('𝖠', '𝖹'), ('𝗔', '𝗭'), ('𝘈', '𝘡'), ('𝘼', '𝙕'), ('𝙰', '𝚉'), ('𝚨', '𝛀'), ('𝛢', '𝛺'), ('𝜜', '𝜴'), ('𝝖', '𝝮'), ('𝞐', '𝞨'), ('𝟊', '𝟊'), ('𞤀', '𞤡'), ]; regex-syntax-0.8.2/src/unicode_tables/grapheme_cluster_break.rs000064400000000000000000000776301046102023000231200ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate grapheme-cluster-break ucd-15.0.0 --chars // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. 
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ ("CR", CR), ("Control", CONTROL), ("Extend", EXTEND), ("L", L), ("LF", LF), ("LV", LV), ("LVT", LVT), ("Prepend", PREPEND), ("Regional_Indicator", REGIONAL_INDICATOR), ("SpacingMark", SPACINGMARK), ("T", T), ("V", V), ("ZWJ", ZWJ), ]; pub const CR: &'static [(char, char)] = &[('\r', '\r')]; pub const CONTROL: &'static [(char, char)] = &[ ('\0', '\t'), ('\u{b}', '\u{c}'), ('\u{e}', '\u{1f}'), ('\u{7f}', '\u{9f}'), ('\u{ad}', '\u{ad}'), ('\u{61c}', '\u{61c}'), ('\u{180e}', '\u{180e}'), ('\u{200b}', '\u{200b}'), ('\u{200e}', '\u{200f}'), ('\u{2028}', '\u{202e}'), ('\u{2060}', '\u{206f}'), ('\u{feff}', '\u{feff}'), ('\u{fff0}', '\u{fffb}'), ('\u{13430}', '\u{1343f}'), ('\u{1bca0}', '\u{1bca3}'), ('\u{1d173}', '\u{1d17a}'), ('\u{e0000}', '\u{e001f}'), ('\u{e0080}', '\u{e00ff}'), ('\u{e01f0}', '\u{e0fff}'), ]; pub const EXTEND: &'static [(char, char)] = &[ ('\u{300}', '\u{36f}'), ('\u{483}', '\u{489}'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('\u{610}', '\u{61a}'), ('\u{64b}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dc}'), ('\u{6df}', '\u{6e4}'), ('\u{6e7}', '\u{6e8}'), ('\u{6ea}', '\u{6ed}'), ('\u{711}', '\u{711}'), ('\u{730}', '\u{74a}'), ('\u{7a6}', '\u{7b0}'), ('\u{7eb}', '\u{7f3}'), ('\u{7fd}', '\u{7fd}'), ('\u{816}', '\u{819}'), ('\u{81b}', '\u{823}'), ('\u{825}', '\u{827}'), ('\u{829}', '\u{82d}'), ('\u{859}', '\u{85b}'), ('\u{898}', '\u{89f}'), ('\u{8ca}', '\u{8e1}'), ('\u{8e3}', '\u{902}'), ('\u{93a}', '\u{93a}'), ('\u{93c}', '\u{93c}'), ('\u{941}', '\u{948}'), ('\u{94d}', '\u{94d}'), ('\u{951}', '\u{957}'), ('\u{962}', '\u{963}'), ('\u{981}', '\u{981}'), ('\u{9bc}', '\u{9bc}'), ('\u{9be}', '\u{9be}'), ('\u{9c1}', '\u{9c4}'), ('\u{9cd}', '\u{9cd}'), ('\u{9d7}', '\u{9d7}'), ('\u{9e2}', '\u{9e3}'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', '\u{a02}'), ('\u{a3c}', '\u{a3c}'), ('\u{a41}', '\u{a42}'), 
('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('\u{a70}', '\u{a71}'), ('\u{a75}', '\u{a75}'), ('\u{a81}', '\u{a82}'), ('\u{abc}', '\u{abc}'), ('\u{ac1}', '\u{ac5}'), ('\u{ac7}', '\u{ac8}'), ('\u{acd}', '\u{acd}'), ('\u{ae2}', '\u{ae3}'), ('\u{afa}', '\u{aff}'), ('\u{b01}', '\u{b01}'), ('\u{b3c}', '\u{b3c}'), ('\u{b3e}', '\u{b3f}'), ('\u{b41}', '\u{b44}'), ('\u{b4d}', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('\u{b62}', '\u{b63}'), ('\u{b82}', '\u{b82}'), ('\u{bbe}', '\u{bbe}'), ('\u{bc0}', '\u{bc0}'), ('\u{bcd}', '\u{bcd}'), ('\u{bd7}', '\u{bd7}'), ('\u{c00}', '\u{c00}'), ('\u{c04}', '\u{c04}'), ('\u{c3c}', '\u{c3c}'), ('\u{c3e}', '\u{c40}'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('\u{c62}', '\u{c63}'), ('\u{c81}', '\u{c81}'), ('\u{cbc}', '\u{cbc}'), ('\u{cbf}', '\u{cbf}'), ('\u{cc2}', '\u{cc2}'), ('\u{cc6}', '\u{cc6}'), ('\u{ccc}', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('\u{ce2}', '\u{ce3}'), ('\u{d00}', '\u{d01}'), ('\u{d3b}', '\u{d3c}'), ('\u{d3e}', '\u{d3e}'), ('\u{d41}', '\u{d44}'), ('\u{d4d}', '\u{d4d}'), ('\u{d57}', '\u{d57}'), ('\u{d62}', '\u{d63}'), ('\u{d81}', '\u{d81}'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dcf}'), ('\u{dd2}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('\u{ddf}', '\u{ddf}'), ('\u{e31}', '\u{e31}'), ('\u{e34}', '\u{e3a}'), ('\u{e47}', '\u{e4e}'), ('\u{eb1}', '\u{eb1}'), ('\u{eb4}', '\u{ebc}'), ('\u{ec8}', '\u{ece}'), ('\u{f18}', '\u{f19}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('\u{f71}', '\u{f7e}'), ('\u{f80}', '\u{f84}'), ('\u{f86}', '\u{f87}'), ('\u{f8d}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('\u{102d}', '\u{1030}'), ('\u{1032}', '\u{1037}'), ('\u{1039}', '\u{103a}'), ('\u{103d}', '\u{103e}'), ('\u{1058}', '\u{1059}'), ('\u{105e}', '\u{1060}'), ('\u{1071}', '\u{1074}'), ('\u{1082}', '\u{1082}'), ('\u{1085}', '\u{1086}'), ('\u{108d}', '\u{108d}'), ('\u{109d}', '\u{109d}'), ('\u{135d}', '\u{135f}'), ('\u{1712}', '\u{1714}'), 
('\u{1732}', '\u{1733}'), ('\u{1752}', '\u{1753}'), ('\u{1772}', '\u{1773}'), ('\u{17b4}', '\u{17b5}'), ('\u{17b7}', '\u{17bd}'), ('\u{17c6}', '\u{17c6}'), ('\u{17c9}', '\u{17d3}'), ('\u{17dd}', '\u{17dd}'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '\u{180f}'), ('\u{1885}', '\u{1886}'), ('\u{18a9}', '\u{18a9}'), ('\u{1920}', '\u{1922}'), ('\u{1927}', '\u{1928}'), ('\u{1932}', '\u{1932}'), ('\u{1939}', '\u{193b}'), ('\u{1a17}', '\u{1a18}'), ('\u{1a1b}', '\u{1a1b}'), ('\u{1a56}', '\u{1a56}'), ('\u{1a58}', '\u{1a5e}'), ('\u{1a60}', '\u{1a60}'), ('\u{1a62}', '\u{1a62}'), ('\u{1a65}', '\u{1a6c}'), ('\u{1a73}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a7f}'), ('\u{1ab0}', '\u{1ace}'), ('\u{1b00}', '\u{1b03}'), ('\u{1b34}', '\u{1b3a}'), ('\u{1b3c}', '\u{1b3c}'), ('\u{1b42}', '\u{1b42}'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '\u{1b81}'), ('\u{1ba2}', '\u{1ba5}'), ('\u{1ba8}', '\u{1ba9}'), ('\u{1bab}', '\u{1bad}'), ('\u{1be6}', '\u{1be6}'), ('\u{1be8}', '\u{1be9}'), ('\u{1bed}', '\u{1bed}'), ('\u{1bef}', '\u{1bf1}'), ('\u{1c2c}', '\u{1c33}'), ('\u{1c36}', '\u{1c37}'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1ce0}'), ('\u{1ce2}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('\u{1cf8}', '\u{1cf9}'), ('\u{1dc0}', '\u{1dff}'), ('\u{200c}', '\u{200c}'), ('\u{20d0}', '\u{20f0}'), ('\u{2cef}', '\u{2cf1}'), ('\u{2d7f}', '\u{2d7f}'), ('\u{2de0}', '\u{2dff}'), ('\u{302a}', '\u{302f}'), ('\u{3099}', '\u{309a}'), ('\u{a66f}', '\u{a672}'), ('\u{a674}', '\u{a67d}'), ('\u{a69e}', '\u{a69f}'), ('\u{a6f0}', '\u{a6f1}'), ('\u{a802}', '\u{a802}'), ('\u{a806}', '\u{a806}'), ('\u{a80b}', '\u{a80b}'), ('\u{a825}', '\u{a826}'), ('\u{a82c}', '\u{a82c}'), ('\u{a8c4}', '\u{a8c5}'), ('\u{a8e0}', '\u{a8f1}'), ('\u{a8ff}', '\u{a8ff}'), ('\u{a926}', '\u{a92d}'), ('\u{a947}', '\u{a951}'), ('\u{a980}', '\u{a982}'), ('\u{a9b3}', '\u{a9b3}'), ('\u{a9b6}', '\u{a9b9}'), ('\u{a9bc}', '\u{a9bd}'), ('\u{a9e5}', '\u{a9e5}'), ('\u{aa29}', '\u{aa2e}'), ('\u{aa31}', '\u{aa32}'), ('\u{aa35}', 
'\u{aa36}'), ('\u{aa43}', '\u{aa43}'), ('\u{aa4c}', '\u{aa4c}'), ('\u{aa7c}', '\u{aa7c}'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabf}'), ('\u{aac1}', '\u{aac1}'), ('\u{aaec}', '\u{aaed}'), ('\u{aaf6}', '\u{aaf6}'), ('\u{abe5}', '\u{abe5}'), ('\u{abe8}', '\u{abe8}'), ('\u{abed}', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('\u{ff9e}', '\u{ff9f}'), ('\u{101fd}', '\u{101fd}'), ('\u{102e0}', '\u{102e0}'), ('\u{10376}', '\u{1037a}'), ('\u{10a01}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '\u{10a0f}'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10ae5}', '\u{10ae6}'), ('\u{10d24}', '\u{10d27}'), ('\u{10eab}', '\u{10eac}'), ('\u{10efd}', '\u{10eff}'), ('\u{10f46}', '\u{10f50}'), ('\u{10f82}', '\u{10f85}'), ('\u{11001}', '\u{11001}'), ('\u{11038}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{11073}', '\u{11074}'), ('\u{1107f}', '\u{11081}'), ('\u{110b3}', '\u{110b6}'), ('\u{110b9}', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('\u{11100}', '\u{11102}'), ('\u{11127}', '\u{1112b}'), ('\u{1112d}', '\u{11134}'), ('\u{11173}', '\u{11173}'), ('\u{11180}', '\u{11181}'), ('\u{111b6}', '\u{111be}'), ('\u{111c9}', '\u{111cc}'), ('\u{111cf}', '\u{111cf}'), ('\u{1122f}', '\u{11231}'), ('\u{11234}', '\u{11234}'), ('\u{11236}', '\u{11237}'), ('\u{1123e}', '\u{1123e}'), ('\u{11241}', '\u{11241}'), ('\u{112df}', '\u{112df}'), ('\u{112e3}', '\u{112ea}'), ('\u{11300}', '\u{11301}'), ('\u{1133b}', '\u{1133c}'), ('\u{1133e}', '\u{1133e}'), ('\u{11340}', '\u{11340}'), ('\u{11357}', '\u{11357}'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('\u{11438}', '\u{1143f}'), ('\u{11442}', '\u{11444}'), ('\u{11446}', '\u{11446}'), ('\u{1145e}', '\u{1145e}'), ('\u{114b0}', '\u{114b0}'), ('\u{114b3}', '\u{114b8}'), ('\u{114ba}', '\u{114ba}'), ('\u{114bd}', '\u{114bd}'), ('\u{114bf}', '\u{114c0}'), ('\u{114c2}', '\u{114c3}'), ('\u{115af}', '\u{115af}'), 
('\u{115b2}', '\u{115b5}'), ('\u{115bc}', '\u{115bd}'), ('\u{115bf}', '\u{115c0}'), ('\u{115dc}', '\u{115dd}'), ('\u{11633}', '\u{1163a}'), ('\u{1163d}', '\u{1163d}'), ('\u{1163f}', '\u{11640}'), ('\u{116ab}', '\u{116ab}'), ('\u{116ad}', '\u{116ad}'), ('\u{116b0}', '\u{116b5}'), ('\u{116b7}', '\u{116b7}'), ('\u{1171d}', '\u{1171f}'), ('\u{11722}', '\u{11725}'), ('\u{11727}', '\u{1172b}'), ('\u{1182f}', '\u{11837}'), ('\u{11839}', '\u{1183a}'), ('\u{11930}', '\u{11930}'), ('\u{1193b}', '\u{1193c}'), ('\u{1193e}', '\u{1193e}'), ('\u{11943}', '\u{11943}'), ('\u{119d4}', '\u{119d7}'), ('\u{119da}', '\u{119db}'), ('\u{119e0}', '\u{119e0}'), ('\u{11a01}', '\u{11a0a}'), ('\u{11a33}', '\u{11a38}'), ('\u{11a3b}', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a51}', '\u{11a56}'), ('\u{11a59}', '\u{11a5b}'), ('\u{11a8a}', '\u{11a96}'), ('\u{11a98}', '\u{11a99}'), ('\u{11c30}', '\u{11c36}'), ('\u{11c38}', '\u{11c3d}'), ('\u{11c3f}', '\u{11c3f}'), ('\u{11c92}', '\u{11ca7}'), ('\u{11caa}', '\u{11cb0}'), ('\u{11cb2}', '\u{11cb3}'), ('\u{11cb5}', '\u{11cb6}'), ('\u{11d31}', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d45}'), ('\u{11d47}', '\u{11d47}'), ('\u{11d90}', '\u{11d91}'), ('\u{11d95}', '\u{11d95}'), ('\u{11d97}', '\u{11d97}'), ('\u{11ef3}', '\u{11ef4}'), ('\u{11f00}', '\u{11f01}'), ('\u{11f36}', '\u{11f3a}'), ('\u{11f40}', '\u{11f40}'), ('\u{11f42}', '\u{11f42}'), ('\u{13440}', '\u{13440}'), ('\u{13447}', '\u{13455}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b30}', '\u{16b36}'), ('\u{16f4f}', '\u{16f4f}'), ('\u{16f8f}', '\u{16f92}'), ('\u{16fe4}', '\u{16fe4}'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d165}'), ('\u{1d167}', '\u{1d169}'), ('\u{1d16e}', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), 
('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('\u{1e08f}', '\u{1e08f}'), ('\u{1e130}', '\u{1e136}'), ('\u{1e2ae}', '\u{1e2ae}'), ('\u{1e2ec}', '\u{1e2ef}'), ('\u{1e4ec}', '\u{1e4ef}'), ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1e944}', '\u{1e94a}'), ('🏻', '🏿'), ('\u{e0020}', '\u{e007f}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const L: &'static [(char, char)] = &[('ᄀ', 'ᅟ'), ('ꥠ', 'ꥼ')]; pub const LF: &'static [(char, char)] = &[('\n', '\n')]; pub const LV: &'static [(char, char)] = &[ ('가', '가'), ('개', '개'), ('갸', '갸'), ('걔', '걔'), ('거', '거'), ('게', '게'), ('겨', '겨'), ('계', '계'), ('고', '고'), ('과', '과'), ('괘', '괘'), ('괴', '괴'), ('교', '교'), ('구', '구'), ('궈', '궈'), ('궤', '궤'), ('귀', '귀'), ('규', '규'), ('그', '그'), ('긔', '긔'), ('기', '기'), ('까', '까'), ('깨', '깨'), ('꺄', '꺄'), ('꺠', '꺠'), ('꺼', '꺼'), ('께', '께'), ('껴', '껴'), ('꼐', '꼐'), ('꼬', '꼬'), ('꽈', '꽈'), ('꽤', '꽤'), ('꾀', '꾀'), ('꾜', '꾜'), ('꾸', '꾸'), ('꿔', '꿔'), ('꿰', '꿰'), ('뀌', '뀌'), ('뀨', '뀨'), ('끄', '끄'), ('끠', '끠'), ('끼', '끼'), ('나', '나'), ('내', '내'), ('냐', '냐'), ('냬', '냬'), ('너', '너'), ('네', '네'), ('녀', '녀'), ('녜', '녜'), ('노', '노'), ('놔', '놔'), ('놰', '놰'), ('뇌', '뇌'), ('뇨', '뇨'), ('누', '누'), ('눠', '눠'), ('눼', '눼'), ('뉘', '뉘'), ('뉴', '뉴'), ('느', '느'), ('늬', '늬'), ('니', '니'), ('다', '다'), ('대', '대'), ('댜', '댜'), ('댸', '댸'), ('더', '더'), ('데', '데'), ('뎌', '뎌'), ('뎨', '뎨'), ('도', '도'), ('돠', '돠'), ('돼', '돼'), ('되', '되'), ('됴', '됴'), ('두', '두'), ('둬', '둬'), ('뒈', '뒈'), ('뒤', '뒤'), ('듀', '듀'), ('드', '드'), ('듸', '듸'), ('디', '디'), ('따', '따'), ('때', '때'), ('땨', '땨'), ('떄', '떄'), ('떠', '떠'), ('떼', '떼'), ('뗘', '뗘'), ('뗴', '뗴'), ('또', '또'), ('똬', '똬'), ('뙈', '뙈'), ('뙤', '뙤'), ('뚀', '뚀'), ('뚜', '뚜'), ('뚸', '뚸'), ('뛔', '뛔'), ('뛰', '뛰'), ('뜌', '뜌'), ('뜨', '뜨'), ('띄', '띄'), ('띠', '띠'), ('라', '라'), ('래', '래'), ('랴', '랴'), ('럐', '럐'), ('러', '러'), ('레', '레'), ('려', 
'려'), ('례', '례'), ('로', '로'), ('롸', '롸'), ('뢔', '뢔'), ('뢰', '뢰'), ('료', '료'), ('루', '루'), ('뤄', '뤄'), ('뤠', '뤠'), ('뤼', '뤼'), ('류', '류'), ('르', '르'), ('릐', '릐'), ('리', '리'), ('마', '마'), ('매', '매'), ('먀', '먀'), ('먜', '먜'), ('머', '머'), ('메', '메'), ('며', '며'), ('몌', '몌'), ('모', '모'), ('뫄', '뫄'), ('뫠', '뫠'), ('뫼', '뫼'), ('묘', '묘'), ('무', '무'), ('뭐', '뭐'), ('뭬', '뭬'), ('뮈', '뮈'), ('뮤', '뮤'), ('므', '므'), ('믜', '믜'), ('미', '미'), ('바', '바'), ('배', '배'), ('뱌', '뱌'), ('뱨', '뱨'), ('버', '버'), ('베', '베'), ('벼', '벼'), ('볘', '볘'), ('보', '보'), ('봐', '봐'), ('봬', '봬'), ('뵈', '뵈'), ('뵤', '뵤'), ('부', '부'), ('붜', '붜'), ('붸', '붸'), ('뷔', '뷔'), ('뷰', '뷰'), ('브', '브'), ('븨', '븨'), ('비', '비'), ('빠', '빠'), ('빼', '빼'), ('뺘', '뺘'), ('뺴', '뺴'), ('뻐', '뻐'), ('뻬', '뻬'), ('뼈', '뼈'), ('뼤', '뼤'), ('뽀', '뽀'), ('뽜', '뽜'), ('뽸', '뽸'), ('뾔', '뾔'), ('뾰', '뾰'), ('뿌', '뿌'), ('뿨', '뿨'), ('쀄', '쀄'), ('쀠', '쀠'), ('쀼', '쀼'), ('쁘', '쁘'), ('쁴', '쁴'), ('삐', '삐'), ('사', '사'), ('새', '새'), ('샤', '샤'), ('섀', '섀'), ('서', '서'), ('세', '세'), ('셔', '셔'), ('셰', '셰'), ('소', '소'), ('솨', '솨'), ('쇄', '쇄'), ('쇠', '쇠'), ('쇼', '쇼'), ('수', '수'), ('숴', '숴'), ('쉐', '쉐'), ('쉬', '쉬'), ('슈', '슈'), ('스', '스'), ('싀', '싀'), ('시', '시'), ('싸', '싸'), ('쌔', '쌔'), ('쌰', '쌰'), ('썌', '썌'), ('써', '써'), ('쎄', '쎄'), ('쎠', '쎠'), ('쎼', '쎼'), ('쏘', '쏘'), ('쏴', '쏴'), ('쐐', '쐐'), ('쐬', '쐬'), ('쑈', '쑈'), ('쑤', '쑤'), ('쒀', '쒀'), ('쒜', '쒜'), ('쒸', '쒸'), ('쓔', '쓔'), ('쓰', '쓰'), ('씌', '씌'), ('씨', '씨'), ('아', '아'), ('애', '애'), ('야', '야'), ('얘', '얘'), ('어', '어'), ('에', '에'), ('여', '여'), ('예', '예'), ('오', '오'), ('와', '와'), ('왜', '왜'), ('외', '외'), ('요', '요'), ('우', '우'), ('워', '워'), ('웨', '웨'), ('위', '위'), ('유', '유'), ('으', '으'), ('의', '의'), ('이', '이'), ('자', '자'), ('재', '재'), ('쟈', '쟈'), ('쟤', '쟤'), ('저', '저'), ('제', '제'), ('져', '져'), ('졔', '졔'), ('조', '조'), ('좌', '좌'), ('좨', '좨'), ('죄', '죄'), ('죠', '죠'), ('주', '주'), ('줘', '줘'), ('줴', '줴'), ('쥐', '쥐'), ('쥬', '쥬'), ('즈', '즈'), ('즤', '즤'), ('지', '지'), ('짜', '짜'), ('째', '째'), ('쨔', '쨔'), ('쨰', '쨰'), ('쩌', '쩌'), 
('쩨', '쩨'), ('쪄', '쪄'), ('쪠', '쪠'), ('쪼', '쪼'), ('쫘', '쫘'), ('쫴', '쫴'), ('쬐', '쬐'), ('쬬', '쬬'), ('쭈', '쭈'), ('쭤', '쭤'), ('쮀', '쮀'), ('쮜', '쮜'), ('쮸', '쮸'), ('쯔', '쯔'), ('쯰', '쯰'), ('찌', '찌'), ('차', '차'), ('채', '채'), ('챠', '챠'), ('챼', '챼'), ('처', '처'), ('체', '체'), ('쳐', '쳐'), ('쳬', '쳬'), ('초', '초'), ('촤', '촤'), ('쵀', '쵀'), ('최', '최'), ('쵸', '쵸'), ('추', '추'), ('춰', '춰'), ('췌', '췌'), ('취', '취'), ('츄', '츄'), ('츠', '츠'), ('츼', '츼'), ('치', '치'), ('카', '카'), ('캐', '캐'), ('캬', '캬'), ('컈', '컈'), ('커', '커'), ('케', '케'), ('켜', '켜'), ('켸', '켸'), ('코', '코'), ('콰', '콰'), ('쾌', '쾌'), ('쾨', '쾨'), ('쿄', '쿄'), ('쿠', '쿠'), ('쿼', '쿼'), ('퀘', '퀘'), ('퀴', '퀴'), ('큐', '큐'), ('크', '크'), ('킈', '킈'), ('키', '키'), ('타', '타'), ('태', '태'), ('탸', '탸'), ('턔', '턔'), ('터', '터'), ('테', '테'), ('텨', '텨'), ('톄', '톄'), ('토', '토'), ('톼', '톼'), ('퇘', '퇘'), ('퇴', '퇴'), ('툐', '툐'), ('투', '투'), ('퉈', '퉈'), ('퉤', '퉤'), ('튀', '튀'), ('튜', '튜'), ('트', '트'), ('틔', '틔'), ('티', '티'), ('파', '파'), ('패', '패'), ('퍄', '퍄'), ('퍠', '퍠'), ('퍼', '퍼'), ('페', '페'), ('펴', '펴'), ('폐', '폐'), ('포', '포'), ('퐈', '퐈'), ('퐤', '퐤'), ('푀', '푀'), ('표', '표'), ('푸', '푸'), ('풔', '풔'), ('풰', '풰'), ('퓌', '퓌'), ('퓨', '퓨'), ('프', '프'), ('픠', '픠'), ('피', '피'), ('하', '하'), ('해', '해'), ('햐', '햐'), ('햬', '햬'), ('허', '허'), ('헤', '헤'), ('혀', '혀'), ('혜', '혜'), ('호', '호'), ('화', '화'), ('홰', '홰'), ('회', '회'), ('효', '효'), ('후', '후'), ('훠', '훠'), ('훼', '훼'), ('휘', '휘'), ('휴', '휴'), ('흐', '흐'), ('희', '희'), ('히', '히'), ]; pub const LVT: &'static [(char, char)] = &[ ('각', '갛'), ('객', '갷'), ('갹', '걓'), ('걕', '걯'), ('걱', '겋'), ('겍', '겧'), ('격', '곃'), ('곅', '곟'), ('곡', '곻'), ('곽', '괗'), ('괙', '괳'), ('괵', '굏'), ('굑', '굫'), ('국', '궇'), ('궉', '궣'), ('궥', '궿'), ('귁', '귛'), ('귝', '귷'), ('극', '긓'), ('긕', '긯'), ('긱', '깋'), ('깍', '깧'), ('깩', '꺃'), ('꺅', '꺟'), ('꺡', '꺻'), ('꺽', '껗'), ('껙', '껳'), ('껵', '꼏'), ('꼑', '꼫'), ('꼭', '꽇'), ('꽉', '꽣'), ('꽥', '꽿'), ('꾁', '꾛'), ('꾝', '꾷'), ('꾹', '꿓'), ('꿕', '꿯'), ('꿱', '뀋'), ('뀍', '뀧'), ('뀩', '끃'), ('끅', '끟'), ('끡', '끻'), ('끽', 
'낗'), ('낙', '낳'), ('낵', '냏'), ('냑', '냫'), ('냭', '넇'), ('넉', '넣'), ('넥', '넿'), ('녁', '녛'), ('녝', '녷'), ('녹', '놓'), ('놕', '놯'), ('놱', '뇋'), ('뇍', '뇧'), ('뇩', '눃'), ('눅', '눟'), ('눡', '눻'), ('눽', '뉗'), ('뉙', '뉳'), ('뉵', '늏'), ('늑', '늫'), ('늭', '닇'), ('닉', '닣'), ('닥', '닿'), ('댁', '댛'), ('댝', '댷'), ('댹', '덓'), ('덕', '덯'), ('덱', '뎋'), ('뎍', '뎧'), ('뎩', '돃'), ('독', '돟'), ('돡', '돻'), ('돽', '됗'), ('됙', '됳'), ('됵', '둏'), ('둑', '둫'), ('둭', '뒇'), ('뒉', '뒣'), ('뒥', '뒿'), ('듁', '듛'), ('득', '듷'), ('듹', '딓'), ('딕', '딯'), ('딱', '땋'), ('땍', '땧'), ('땩', '떃'), ('떅', '떟'), ('떡', '떻'), ('떽', '뗗'), ('뗙', '뗳'), ('뗵', '똏'), ('똑', '똫'), ('똭', '뙇'), ('뙉', '뙣'), ('뙥', '뙿'), ('뚁', '뚛'), ('뚝', '뚷'), ('뚹', '뛓'), ('뛕', '뛯'), ('뛱', '뜋'), ('뜍', '뜧'), ('뜩', '띃'), ('띅', '띟'), ('띡', '띻'), ('락', '랗'), ('랙', '랳'), ('략', '럏'), ('럑', '럫'), ('럭', '렇'), ('렉', '렣'), ('력', '렿'), ('롁', '롛'), ('록', '롷'), ('롹', '뢓'), ('뢕', '뢯'), ('뢱', '룋'), ('룍', '룧'), ('룩', '뤃'), ('뤅', '뤟'), ('뤡', '뤻'), ('뤽', '륗'), ('륙', '륳'), ('륵', '릏'), ('릑', '릫'), ('릭', '맇'), ('막', '맣'), ('맥', '맿'), ('먁', '먛'), ('먝', '먷'), ('먹', '멓'), ('멕', '멯'), ('멱', '몋'), ('몍', '몧'), ('목', '뫃'), ('뫅', '뫟'), ('뫡', '뫻'), ('뫽', '묗'), ('묙', '묳'), ('묵', '뭏'), ('뭑', '뭫'), ('뭭', '뮇'), ('뮉', '뮣'), ('뮥', '뮿'), ('믁', '믛'), ('믝', '믷'), ('믹', '밓'), ('박', '밯'), ('백', '뱋'), ('뱍', '뱧'), ('뱩', '벃'), ('벅', '벟'), ('벡', '벻'), ('벽', '볗'), ('볙', '볳'), ('복', '봏'), ('봑', '봫'), ('봭', '뵇'), ('뵉', '뵣'), ('뵥', '뵿'), ('북', '붛'), ('붝', '붷'), ('붹', '뷓'), ('뷕', '뷯'), ('뷱', '븋'), ('븍', '븧'), ('븩', '빃'), ('빅', '빟'), ('빡', '빻'), ('빽', '뺗'), ('뺙', '뺳'), ('뺵', '뻏'), ('뻑', '뻫'), ('뻭', '뼇'), ('뼉', '뼣'), ('뼥', '뼿'), ('뽁', '뽛'), ('뽝', '뽷'), ('뽹', '뾓'), ('뾕', '뾯'), ('뾱', '뿋'), ('뿍', '뿧'), ('뿩', '쀃'), ('쀅', '쀟'), ('쀡', '쀻'), ('쀽', '쁗'), ('쁙', '쁳'), ('쁵', '삏'), ('삑', '삫'), ('삭', '샇'), ('색', '샣'), ('샥', '샿'), ('섁', '섛'), ('석', '섷'), ('섹', '셓'), ('셕', '셯'), ('셱', '솋'), ('속', '솧'), ('솩', '쇃'), ('쇅', '쇟'), ('쇡', '쇻'), ('쇽', '숗'), ('숙', '숳'), ('숵', '쉏'), ('쉑', '쉫'), ('쉭', '슇'), ('슉', '슣'), ('슥', '슿'), 
('싁', '싛'), ('식', '싷'), ('싹', '쌓'), ('쌕', '쌯'), ('쌱', '썋'), ('썍', '썧'), ('썩', '쎃'), ('쎅', '쎟'), ('쎡', '쎻'), ('쎽', '쏗'), ('쏙', '쏳'), ('쏵', '쐏'), ('쐑', '쐫'), ('쐭', '쑇'), ('쑉', '쑣'), ('쑥', '쑿'), ('쒁', '쒛'), ('쒝', '쒷'), ('쒹', '쓓'), ('쓕', '쓯'), ('쓱', '씋'), ('씍', '씧'), ('씩', '앃'), ('악', '앟'), ('액', '앻'), ('약', '얗'), ('얙', '얳'), ('억', '엏'), ('엑', '엫'), ('역', '옇'), ('옉', '옣'), ('옥', '옿'), ('왁', '왛'), ('왝', '왷'), ('왹', '욓'), ('욕', '욯'), ('욱', '웋'), ('웍', '웧'), ('웩', '윃'), ('윅', '윟'), ('육', '윻'), ('윽', '읗'), ('읙', '읳'), ('익', '잏'), ('작', '잫'), ('잭', '쟇'), ('쟉', '쟣'), ('쟥', '쟿'), ('적', '젛'), ('젝', '젷'), ('젹', '졓'), ('졕', '졯'), ('족', '좋'), ('좍', '좧'), ('좩', '죃'), ('죅', '죟'), ('죡', '죻'), ('죽', '줗'), ('줙', '줳'), ('줵', '쥏'), ('쥑', '쥫'), ('쥭', '즇'), ('즉', '즣'), ('즥', '즿'), ('직', '짛'), ('짝', '짷'), ('짹', '쨓'), ('쨕', '쨯'), ('쨱', '쩋'), ('쩍', '쩧'), ('쩩', '쪃'), ('쪅', '쪟'), ('쪡', '쪻'), ('쪽', '쫗'), ('쫙', '쫳'), ('쫵', '쬏'), ('쬑', '쬫'), ('쬭', '쭇'), ('쭉', '쭣'), ('쭥', '쭿'), ('쮁', '쮛'), ('쮝', '쮷'), ('쮹', '쯓'), ('쯕', '쯯'), ('쯱', '찋'), ('찍', '찧'), ('착', '챃'), ('책', '챟'), ('챡', '챻'), ('챽', '첗'), ('척', '첳'), ('첵', '쳏'), ('쳑', '쳫'), ('쳭', '촇'), ('촉', '촣'), ('촥', '촿'), ('쵁', '쵛'), ('쵝', '쵷'), ('쵹', '춓'), ('축', '춯'), ('춱', '췋'), ('췍', '췧'), ('췩', '츃'), ('츅', '츟'), ('측', '츻'), ('츽', '칗'), ('칙', '칳'), ('칵', '캏'), ('캑', '캫'), ('캭', '컇'), ('컉', '컣'), ('컥', '컿'), ('켁', '켛'), ('켝', '켷'), ('켹', '콓'), ('콕', '콯'), ('콱', '쾋'), ('쾍', '쾧'), ('쾩', '쿃'), ('쿅', '쿟'), ('쿡', '쿻'), ('쿽', '퀗'), ('퀙', '퀳'), ('퀵', '큏'), ('큑', '큫'), ('큭', '킇'), ('킉', '킣'), ('킥', '킿'), ('탁', '탛'), ('택', '탷'), ('탹', '턓'), ('턕', '턯'), ('턱', '텋'), ('텍', '텧'), ('텩', '톃'), ('톅', '톟'), ('톡', '톻'), ('톽', '퇗'), ('퇙', '퇳'), ('퇵', '툏'), ('툑', '툫'), ('툭', '퉇'), ('퉉', '퉣'), ('퉥', '퉿'), ('튁', '튛'), ('튝', '튷'), ('특', '틓'), ('틕', '틯'), ('틱', '팋'), ('팍', '팧'), ('팩', '퍃'), ('퍅', '퍟'), ('퍡', '퍻'), ('퍽', '펗'), ('펙', '펳'), ('펵', '폏'), ('폑', '폫'), ('폭', '퐇'), ('퐉', '퐣'), ('퐥', '퐿'), ('푁', '푛'), ('푝', '푷'), ('푹', '풓'), ('풕', '풯'), ('풱', '퓋'), ('퓍', '퓧'), ('퓩', 
'픃'), ('픅', '픟'), ('픡', '픻'), ('픽', '핗'), ('학', '핳'), ('핵', '햏'), ('햑', '햫'), ('햭', '헇'), ('헉', '헣'), ('헥', '헿'), ('혁', '혛'), ('혝', '혷'), ('혹', '홓'), ('확', '홯'), ('홱', '횋'), ('획', '횧'), ('횩', '훃'), ('훅', '훟'), ('훡', '훻'), ('훽', '휗'), ('휙', '휳'), ('휵', '흏'), ('흑', '흫'), ('흭', '힇'), ('힉', '힣'), ]; pub const PREPEND: &'static [(char, char)] = &[ ('\u{600}', '\u{605}'), ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), ('\u{890}', '\u{891}'), ('\u{8e2}', '\u{8e2}'), ('ൎ', 'ൎ'), ('\u{110bd}', '\u{110bd}'), ('\u{110cd}', '\u{110cd}'), ('𑇂', '𑇃'), ('𑤿', '𑤿'), ('𑥁', '𑥁'), ('𑨺', '𑨺'), ('𑪄', '𑪉'), ('𑵆', '𑵆'), ('𑼂', '𑼂'), ]; pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')]; pub const SPACINGMARK: &'static [(char, char)] = &[ ('ः', 'ः'), ('ऻ', 'ऻ'), ('ा', 'ी'), ('ॉ', 'ौ'), ('ॎ', 'ॏ'), ('ং', 'ঃ'), ('ি', 'ী'), ('ে', 'ৈ'), ('ো', 'ৌ'), ('ਃ', 'ਃ'), ('ਾ', 'ੀ'), ('ઃ', 'ઃ'), ('ા', 'ી'), ('ૉ', 'ૉ'), ('ો', 'ૌ'), ('ଂ', 'ଃ'), ('ୀ', 'ୀ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), ('ி', 'ி'), ('ு', 'ூ'), ('ெ', 'ை'), ('ொ', 'ௌ'), ('ఁ', 'ః'), ('ు', 'ౄ'), ('ಂ', 'ಃ'), ('ಾ', 'ಾ'), ('ೀ', 'ು'), ('ೃ', 'ೄ'), ('ೇ', 'ೈ'), ('ೊ', 'ೋ'), ('ೳ', 'ೳ'), ('ം', 'ഃ'), ('ി', 'ീ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ං', 'ඃ'), ('ැ', 'ෑ'), ('ෘ', 'ෞ'), ('ෲ', 'ෳ'), ('ำ', 'ำ'), ('ຳ', 'ຳ'), ('༾', '༿'), ('ཿ', 'ཿ'), ('ေ', 'ေ'), ('ျ', 'ြ'), ('ၖ', 'ၗ'), ('ႄ', 'ႄ'), ('᜕', '᜕'), ('᜴', '᜴'), ('ា', 'ា'), ('ើ', 'ៅ'), ('ះ', 'ៈ'), ('ᤣ', 'ᤦ'), ('ᤩ', 'ᤫ'), ('ᤰ', 'ᤱ'), ('ᤳ', 'ᤸ'), ('ᨙ', 'ᨚ'), ('ᩕ', 'ᩕ'), ('ᩗ', 'ᩗ'), ('ᩭ', 'ᩲ'), ('ᬄ', 'ᬄ'), ('ᬻ', 'ᬻ'), ('ᬽ', 'ᭁ'), ('ᭃ', '᭄'), ('ᮂ', 'ᮂ'), ('ᮡ', 'ᮡ'), ('ᮦ', 'ᮧ'), ('᮪', '᮪'), ('ᯧ', 'ᯧ'), ('ᯪ', 'ᯬ'), ('ᯮ', 'ᯮ'), ('᯲', '᯳'), ('ᰤ', 'ᰫ'), ('ᰴ', 'ᰵ'), ('᳡', '᳡'), ('᳷', '᳷'), ('ꠣ', 'ꠤ'), ('ꠧ', 'ꠧ'), ('ꢀ', 'ꢁ'), ('ꢴ', 'ꣃ'), ('ꥒ', '꥓'), ('ꦃ', 'ꦃ'), ('ꦴ', 'ꦵ'), ('ꦺ', 'ꦻ'), ('ꦾ', '꧀'), ('ꨯ', 'ꨰ'), ('ꨳ', 'ꨴ'), ('ꩍ', 'ꩍ'), ('ꫫ', 'ꫫ'), ('ꫮ', 'ꫯ'), ('ꫵ', 'ꫵ'), ('ꯣ', 'ꯤ'), ('ꯦ', 'ꯧ'), ('ꯩ', 'ꯪ'), ('꯬', '꯬'), ('𑀀', '𑀀'), ('𑀂', '𑀂'), ('𑂂', '𑂂'), ('𑂰', '𑂲'), ('𑂷', '𑂸'), ('𑄬', '𑄬'), ('𑅅', '𑅆'), ('𑆂', '𑆂'), ('𑆳', 
'𑆵'), ('𑆿', '𑇀'), ('𑇎', '𑇎'), ('𑈬', '𑈮'), ('𑈲', '𑈳'), ('𑈵', '𑈵'), ('𑋠', '𑋢'), ('𑌂', '𑌃'), ('𑌿', '𑌿'), ('𑍁', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍢', '𑍣'), ('𑐵', '𑐷'), ('𑑀', '𑑁'), ('𑑅', '𑑅'), ('𑒱', '𑒲'), ('𑒹', '𑒹'), ('𑒻', '𑒼'), ('𑒾', '𑒾'), ('𑓁', '𑓁'), ('𑖰', '𑖱'), ('𑖸', '𑖻'), ('𑖾', '𑖾'), ('𑘰', '𑘲'), ('𑘻', '𑘼'), ('𑘾', '𑘾'), ('𑚬', '𑚬'), ('𑚮', '𑚯'), ('𑚶', '𑚶'), ('𑜦', '𑜦'), ('𑠬', '𑠮'), ('𑠸', '𑠸'), ('𑤱', '𑤵'), ('𑤷', '𑤸'), ('𑤽', '𑤽'), ('𑥀', '𑥀'), ('𑥂', '𑥂'), ('𑧑', '𑧓'), ('𑧜', '𑧟'), ('𑧤', '𑧤'), ('𑨹', '𑨹'), ('𑩗', '𑩘'), ('𑪗', '𑪗'), ('𑰯', '𑰯'), ('𑰾', '𑰾'), ('𑲩', '𑲩'), ('𑲱', '𑲱'), ('𑲴', '𑲴'), ('𑶊', '𑶎'), ('𑶓', '𑶔'), ('𑶖', '𑶖'), ('𑻵', '𑻶'), ('𑼃', '𑼃'), ('𑼴', '𑼵'), ('𑼾', '𑼿'), ('𑽁', '𑽁'), ('𖽑', '𖾇'), ('𖿰', '𖿱'), ('𝅦', '𝅦'), ('𝅭', '𝅭'), ]; pub const T: &'static [(char, char)] = &[('ᆨ', 'ᇿ'), ('ퟋ', 'ퟻ')]; pub const V: &'static [(char, char)] = &[('ᅠ', 'ᆧ'), ('ힰ', 'ퟆ')]; pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')]; regex-syntax-0.8.2/src/unicode_tables/mod.rs000064400000000000000000000023521046102023000171670ustar 00000000000000#[cfg(feature = "unicode-age")] pub mod age; #[cfg(feature = "unicode-case")] pub mod case_folding_simple; #[cfg(feature = "unicode-gencat")] pub mod general_category; #[cfg(feature = "unicode-segment")] pub mod grapheme_cluster_break; #[cfg(all(feature = "unicode-perl", not(feature = "unicode-gencat")))] #[allow(dead_code)] pub mod perl_decimal; #[cfg(all(feature = "unicode-perl", not(feature = "unicode-bool")))] #[allow(dead_code)] pub mod perl_space; #[cfg(feature = "unicode-perl")] pub mod perl_word; #[cfg(feature = "unicode-bool")] pub mod property_bool; #[cfg(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", ))] pub mod property_names; #[cfg(any( feature = "unicode-age", feature = "unicode-bool", feature = "unicode-gencat", feature = "unicode-perl", feature = "unicode-script", feature = "unicode-segment", ))] pub mod 
property_values; #[cfg(feature = "unicode-script")] pub mod script; #[cfg(feature = "unicode-script")] pub mod script_extension; #[cfg(feature = "unicode-segment")] pub mod sentence_break; #[cfg(feature = "unicode-segment")] pub mod word_break; regex-syntax-0.8.2/src/unicode_tables/perl_decimal.rs000064400000000000000000000032751046102023000210350ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate general-category ucd-15.0.0 --chars --include decimalnumber // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[("Decimal_Number", DECIMAL_NUMBER)]; pub const DECIMAL_NUMBER: &'static [(char, char)] = &[ ('0', '9'), ('٠', '٩'), ('۰', '۹'), ('߀', '߉'), ('०', '९'), ('০', '৯'), ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), ('௦', '௯'), ('౦', '౯'), ('೦', '೯'), ('൦', '൯'), ('෦', '෯'), ('๐', '๙'), ('໐', '໙'), ('༠', '༩'), ('၀', '၉'), ('႐', '႙'), ('០', '៩'), ('᠐', '᠙'), ('᥆', '᥏'), ('᧐', '᧙'), ('᪀', '᪉'), ('᪐', '᪙'), ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), ('᱐', '᱙'), ('꘠', '꘩'), ('꣐', '꣙'), ('꤀', '꤉'), ('꧐', '꧙'), ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), ('0', '9'), ('𐒠', '𐒩'), ('𐴰', '𐴹'), ('𑁦', '𑁯'), ('𑃰', '𑃹'), ('𑄶', '𑄿'), ('𑇐', '𑇙'), ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), ('𑙐', '𑙙'), ('𑛀', '𑛉'), ('𑜰', '𑜹'), ('𑣠', '𑣩'), ('𑥐', '𑥙'), ('𑱐', '𑱙'), ('𑵐', '𑵙'), ('𑶠', '𑶩'), ('𑽐', '𑽙'), ('𖩠', '𖩩'), ('𖫀', '𖫉'), ('𖭐', '𖭙'), ('𝟎', '𝟿'), ('𞅀', '𞅉'), ('𞋰', '𞋹'), ('𞓰', '𞓹'), ('𞥐', '𞥙'), ('🯰', '🯹'), ]; regex-syntax-0.8.2/src/unicode_tables/perl_space.rs000064400000000000000000000012141046102023000205210ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate property-bool ucd-15.0.0 --chars --include whitespace // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. 
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[("White_Space", WHITE_SPACE)]; pub const WHITE_SPACE: &'static [(char, char)] = &[ ('\t', '\r'), (' ', ' '), ('\u{85}', '\u{85}'), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), ]; regex-syntax-0.8.2/src/unicode_tables/perl_word.rs000064400000000000000000000423601046102023000204100ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate perl-word ucd-15.0.0 --chars // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. pub const PERL_WORD: &'static [(char, char)] = &[ ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('\u{300}', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('\u{483}', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('א', 'ת'), ('ׯ', 'ײ'), ('\u{610}', '\u{61a}'), ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', '\u{6dc}'), ('\u{6df}', '\u{6e8}'), ('\u{6ea}', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '\u{74a}'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), ('ߺ', 'ߺ'), ('\u{7fd}', '\u{7fd}'), ('ࠀ', '\u{82d}'), ('ࡀ', '\u{85b}'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('\u{898}', '\u{8e1}'), ('\u{8e3}', '\u{963}'), ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('\u{9bc}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('০', 'ৱ'), ('ৼ', 'ৼ'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', 
'\u{a4d}'), ('\u{a51}', '\u{a51}'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', '\u{a75}'), ('\u{a81}', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('ૐ', 'ૐ'), ('ૠ', '\u{ae3}'), ('૦', '૯'), ('ૹ', '\u{aff}'), ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('\u{b3c}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', '\u{b63}'), ('୦', '୯'), ('ୱ', 'ୱ'), ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('ௐ', 'ௐ'), ('\u{bd7}', '\u{bd7}'), ('௦', '௯'), ('\u{c00}', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('\u{c3c}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', '\u{c63}'), ('౦', '౯'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('\u{cbc}', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('ೝ', 'ೞ'), ('ೠ', '\u{ce3}'), ('೦', '೯'), ('ೱ', 'ೳ'), ('\u{d00}', 'ഌ'), ('എ', 'ഐ'), ('ഒ', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', 'ൎ'), ('ൔ', '\u{d57}'), ('ൟ', '\u{d63}'), ('൦', '൯'), ('ൺ', 'ൿ'), ('\u{d81}', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('෦', '෯'), ('ෲ', 'ෳ'), ('ก', '\u{e3a}'), ('เ', '\u{e4e}'), ('๐', '๙'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ece}'), ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('\u{f18}', '\u{f19}'), ('༠', '༩'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('༾', 'ཇ'), ('ཉ', 'ཬ'), ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('က', '၉'), ('ၐ', '\u{109d}'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), 
('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('\u{135d}', '\u{135f}'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', '᜕'), ('ᜟ', '᜴'), ('ᝀ', '\u{1753}'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}'), ('ក', '\u{17d3}'), ('ៗ', 'ៗ'), ('ៜ', '\u{17dd}'), ('០', '៩'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('᥆', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧙'), ('ᨀ', '\u{1a1b}'), ('ᨠ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'), ('\u{1ab0}', '\u{1ace}'), ('\u{1b00}', 'ᭌ'), ('᭐', '᭙'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '᯳'), ('ᰀ', '\u{1c37}'), ('᱀', '᱉'), ('ᱍ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', 'ᳺ'), ('ᴀ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('\u{200c}', '\u{200d}'), ('‿', '⁀'), ('⁔', '⁔'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('\u{20d0}', '\u{20f0}'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('\u{2d7f}', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('\u{2de0}', '\u{2dff}'), ('ⸯ', 'ⸯ'), ('々', '〇'), ('〡', '\u{302f}'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('\u{3099}', '\u{309a}'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), ('Ꙁ', '\u{a672}'), ('\u{a674}', 
'\u{a67d}'), ('ꙿ', '\u{a6f1}'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠧ'), ('\u{a82c}', '\u{a82c}'), ('ꡀ', 'ꡳ'), ('ꢀ', '\u{a8c5}'), ('꣐', '꣙'), ('\u{a8e0}', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', '\u{a92d}'), ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('\u{a980}', '꧀'), ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'), ('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), ('ꫲ', '\u{aaf6}'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꯪ'), ('꯬', '\u{abed}'), ('꯰', '꯹'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('︳', '︴'), ('﹍', '﹏'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('\u{101fd}', '\u{101fd}'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('\u{102e0}', '\u{102e0}'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '\u{1037a}'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '\u{10ae6}'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '\u{10d27}'), ('𐴰', '𐴹'), ('𐺀', '𐺩'), ('\u{10eab}', 
'\u{10eac}'), ('𐺰', '𐺱'), ('\u{10efd}', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '\u{10f50}'), ('𐽰', '\u{10f85}'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀀', '\u{11046}'), ('𑁦', '𑁵'), ('\u{1107f}', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑄿'), ('𑅄', '𑅇'), ('𑅐', '\u{11173}'), ('𑅶', '𑅶'), ('\u{11180}', '𑇄'), ('\u{111c9}', '\u{111cc}'), ('𑇎', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '\u{11237}'), ('\u{1123e}', '\u{11241}'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹'), ('\u{11300}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('\u{1133b}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('\u{11357}', '\u{11357}'), ('𑍝', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑐀', '𑑊'), ('𑑐', '𑑙'), ('\u{1145e}', '𑑡'), ('𑒀', '𑓅'), ('𑓇', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '\u{115b5}'), ('𑖸', '\u{115c0}'), ('𑗘', '\u{115dd}'), ('𑘀', '\u{11640}'), ('𑙄', '𑙄'), ('𑙐', '𑙙'), ('𑚀', '𑚸'), ('𑛀', '𑛉'), ('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑜹'), ('𑝀', '𑝆'), ('𑠀', '\u{1183a}'), ('𑢠', '𑣩'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{11943}'), ('𑥐', '𑥙'), ('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧡'), ('𑧣', '𑧤'), ('𑨀', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('𑩐', '\u{11a99}'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱀'), ('𑱐', '𑱙'), ('𑱲', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d47}'), ('𑵐', '𑵙'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶘'), ('𑶠', '𑶩'), ('𑻠', '𑻶'), ('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '\u{11f42}'), ('𑽐', '𑽙'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('\u{13440}', '\u{13455}'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩰', '𖪾'), ('𖫀', '𖫉'), ('𖫐', '𖫭'), ('\u{16af0}', 
'\u{16af4}'), ('𖬀', '\u{16b36}'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '\u{16fe4}'), ('𖿰', '𖿱'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d169}'), ('𝅭', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅎'), ('𞊐', '\u{1e2ae}'), ('𞋀', '𞋹'), ('𞓐', '𞓹'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('\u{1e8d0}', '\u{1e8d6}'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', 
'🅩'), ('🅰', '🆉'), ('🯰', '🯹'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ('\u{e0100}', '\u{e01ef}'), ]; regex-syntax-0.8.2/src/unicode_tables/property_bool.rs000064400000000000000000007402071046102023000213170ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate property-bool ucd-15.0.0 --chars // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ ("ASCII_Hex_Digit", ASCII_HEX_DIGIT), ("Alphabetic", ALPHABETIC), ("Bidi_Control", BIDI_CONTROL), ("Bidi_Mirrored", BIDI_MIRRORED), ("Case_Ignorable", CASE_IGNORABLE), ("Cased", CASED), ("Changes_When_Casefolded", CHANGES_WHEN_CASEFOLDED), ("Changes_When_Casemapped", CHANGES_WHEN_CASEMAPPED), ("Changes_When_Lowercased", CHANGES_WHEN_LOWERCASED), ("Changes_When_Titlecased", CHANGES_WHEN_TITLECASED), ("Changes_When_Uppercased", CHANGES_WHEN_UPPERCASED), ("Dash", DASH), ("Default_Ignorable_Code_Point", DEFAULT_IGNORABLE_CODE_POINT), ("Deprecated", DEPRECATED), ("Diacritic", DIACRITIC), ("Emoji", EMOJI), ("Emoji_Component", EMOJI_COMPONENT), ("Emoji_Modifier", EMOJI_MODIFIER), ("Emoji_Modifier_Base", EMOJI_MODIFIER_BASE), ("Emoji_Presentation", EMOJI_PRESENTATION), ("Extended_Pictographic", EXTENDED_PICTOGRAPHIC), ("Extender", EXTENDER), ("Grapheme_Base", GRAPHEME_BASE), ("Grapheme_Extend", GRAPHEME_EXTEND), ("Grapheme_Link", GRAPHEME_LINK), ("Hex_Digit", HEX_DIGIT), ("Hyphen", HYPHEN), ("IDS_Binary_Operator", IDS_BINARY_OPERATOR), ("IDS_Trinary_Operator", IDS_TRINARY_OPERATOR), ("ID_Continue", ID_CONTINUE), ("ID_Start", ID_START), ("Ideographic", IDEOGRAPHIC), ("Join_Control", JOIN_CONTROL), ("Logical_Order_Exception", LOGICAL_ORDER_EXCEPTION), ("Lowercase", LOWERCASE), ("Math", MATH), ("Noncharacter_Code_Point", NONCHARACTER_CODE_POINT), ("Other_Alphabetic", OTHER_ALPHABETIC), ("Other_Default_Ignorable_Code_Point", 
OTHER_DEFAULT_IGNORABLE_CODE_POINT), ("Other_Grapheme_Extend", OTHER_GRAPHEME_EXTEND), ("Other_ID_Continue", OTHER_ID_CONTINUE), ("Other_ID_Start", OTHER_ID_START), ("Other_Lowercase", OTHER_LOWERCASE), ("Other_Math", OTHER_MATH), ("Other_Uppercase", OTHER_UPPERCASE), ("Pattern_Syntax", PATTERN_SYNTAX), ("Pattern_White_Space", PATTERN_WHITE_SPACE), ("Prepended_Concatenation_Mark", PREPENDED_CONCATENATION_MARK), ("Quotation_Mark", QUOTATION_MARK), ("Radical", RADICAL), ("Regional_Indicator", REGIONAL_INDICATOR), ("Sentence_Terminal", SENTENCE_TERMINAL), ("Soft_Dotted", SOFT_DOTTED), ("Terminal_Punctuation", TERMINAL_PUNCTUATION), ("Unified_Ideograph", UNIFIED_IDEOGRAPH), ("Uppercase", UPPERCASE), ("Variation_Selector", VARIATION_SELECTOR), ("White_Space", WHITE_SPACE), ("XID_Continue", XID_CONTINUE), ("XID_Start", XID_START), ]; pub const ASCII_HEX_DIGIT: &'static [(char, char)] = &[('0', '9'), ('A', 'F'), ('a', 'f')]; pub const ALPHABETIC: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('\u{345}', '\u{345}'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('\u{5b0}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('א', 'ת'), ('ׯ', 'ײ'), ('\u{610}', '\u{61a}'), ('ؠ', '\u{657}'), ('\u{659}', '\u{65f}'), ('ٮ', 'ۓ'), ('ە', '\u{6dc}'), ('\u{6e1}', '\u{6e8}'), ('\u{6ed}', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '\u{73f}'), ('ݍ', 'ޱ'), ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', '\u{817}'), ('ࠚ', '\u{82c}'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('ࢠ', 'ࣉ'), ('\u{8d4}', '\u{8df}'), ('\u{8e3}', '\u{8e9}'), ('\u{8f0}', 'ऻ'), ('ऽ', 'ौ'), ('ॎ', 'ॐ'), ('\u{955}', '\u{963}'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('ঽ', 
'\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৌ'), ('ৎ', 'ৎ'), ('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('\u{a01}', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4c}'), ('\u{a51}', '\u{a51}'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('\u{a70}', '\u{a75}'), ('\u{a81}', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', 'ૌ'), ('ૐ', 'ૐ'), ('ૠ', '\u{ae3}'), ('ૹ', '\u{afc}'), ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), ('\u{b56}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', '\u{b63}'), ('ୱ', 'ୱ'), ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', 'ௌ'), ('ௐ', 'ௐ'), ('\u{bd7}', '\u{bd7}'), ('\u{c00}', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4c}'), ('\u{c55}', '\u{c56}'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', '\u{c63}'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccc}'), ('\u{cd5}', '\u{cd6}'), ('ೝ', 'ೞ'), ('ೠ', '\u{ce3}'), ('ೱ', 'ೳ'), ('\u{d00}', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ൎ', 'ൎ'), ('ൔ', '\u{d57}'), ('ൟ', '\u{d63}'), ('ൺ', 'ൿ'), ('\u{d81}', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('ෲ', 'ෳ'), ('ก', '\u{e3a}'), ('เ', 'ๆ'), ('\u{e4d}', '\u{e4d}'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', '\u{eb9}'), ('\u{ebb}', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ecd}', '\u{ecd}'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('\u{f71}', '\u{f83}'), ('ྈ', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('က', '\u{1036}'), ('း', 'း'), ('ျ', 'ဿ'), ('ၐ', 'ႏ'), 
('ႚ', '\u{109d}'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', '\u{1713}'), ('ᜟ', '\u{1733}'), ('ᝀ', '\u{1753}'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}'), ('ក', 'ឳ'), ('ា', 'ៈ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('\u{1920}', 'ᤫ'), ('ᤰ', 'ᤸ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', '\u{1a1b}'), ('ᨠ', '\u{1a5e}'), ('ᩡ', '\u{1a74}'), ('ᪧ', 'ᪧ'), ('\u{1abf}', '\u{1ac0}'), ('\u{1acc}', '\u{1ace}'), ('\u{1b00}', 'ᬳ'), ('\u{1b35}', 'ᭃ'), ('ᭅ', 'ᭌ'), ('\u{1b80}', '\u{1ba9}'), ('\u{1bac}', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᯧ', '\u{1bf1}'), ('ᰀ', '\u{1c36}'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᳺ', 'ᳺ'), ('ᴀ', 'ᶿ'), ('\u{1de7}', '\u{1df4}'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('\u{2de0}', '\u{2dff}'), ('ⸯ', 'ⸯ'), ('々', '〇'), ('〡', '〩'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('\u{a674}', '\u{a67b}'), ('ꙿ', 
'ꛯ'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠅ'), ('ꠇ', 'ꠧ'), ('ꡀ', 'ꡳ'), ('ꢀ', 'ꣃ'), ('\u{a8c5}', '\u{a8c5}'), ('ꣲ', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', '\u{a8ff}'), ('ꤊ', '\u{a92a}'), ('ꤰ', 'ꥒ'), ('ꥠ', 'ꥼ'), ('\u{a980}', 'ꦲ'), ('ꦴ', 'ꦿ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('ꩠ', 'ꩶ'), ('ꩺ', '\u{aabe}'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), ('ꫲ', 'ꫵ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꯪ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '\u{1037a}'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '\u{10d27}'), ('𐺀', '𐺩'), ('\u{10eab}', '\u{10eac}'), ('𐺰', '𐺱'), ('𐼀', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '𐽅'), ('𐽰', '𐾁'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀀', '\u{11045}'), ('𑁱', '𑁵'), ('\u{11080}', '𑂸'), ('\u{110c2}', '\u{110c2}'), ('𑃐', '𑃨'), ('\u{11100}', '\u{11132}'), ('𑅄', '𑅇'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), 
('\u{11180}', '𑆿'), ('𑇁', '𑇄'), ('𑇎', '\u{111cf}'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '\u{11234}'), ('\u{11237}', '\u{11237}'), ('\u{1123e}', '\u{11241}'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '\u{112e8}'), ('\u{11300}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍌'), ('𑍐', '𑍐'), ('\u{11357}', '\u{11357}'), ('𑍝', '𑍣'), ('𑐀', '𑑁'), ('\u{11443}', '𑑅'), ('𑑇', '𑑊'), ('𑑟', '𑑡'), ('𑒀', '𑓁'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '\u{115b5}'), ('𑖸', '𑖾'), ('𑗘', '\u{115dd}'), ('𑘀', '𑘾'), ('\u{11640}', '\u{11640}'), ('𑙄', '𑙄'), ('𑚀', '\u{116b5}'), ('𑚸', '𑚸'), ('𑜀', '𑜚'), ('\u{1171d}', '\u{1172a}'), ('𑝀', '𑝆'), ('𑠀', '𑠸'), ('𑢠', '𑣟'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{1193c}'), ('𑤿', '𑥂'), ('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧟'), ('𑧡', '𑧡'), ('𑧣', '𑧤'), ('𑨀', '𑨲'), ('\u{11a35}', '\u{11a3e}'), ('𑩐', '𑪗'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑰾'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d41}'), ('\u{11d43}', '\u{11d43}'), ('𑵆', '\u{11d47}'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶖'), ('𑶘', '𑶘'), ('𑻠', '𑻶'), ('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '\u{11f40}'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('𓑁', '𓑆'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩰', '𖪾'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '𖿣'), ('𖿰', '𖿱'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('\u{1bc9e}', '\u{1bc9e}'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), 
('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ('𞄀', '𞄬'), ('𞄷', '𞄽'), ('𞅎', '𞅎'), ('𞊐', '𞊭'), ('𞋀', '𞋫'), ('𞓐', '𞓫'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), ('\u{1e947}', '\u{1e947}'), ('𞥋', '𞥋'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const BIDI_CONTROL: &'static [(char, char)] = &[ ('\u{61c}', '\u{61c}'), ('\u{200e}', '\u{200f}'), ('\u{202a}', '\u{202e}'), ('\u{2066}', '\u{2069}'), ]; pub const BIDI_MIRRORED: &'static [(char, char)] = &[ ('(', ')'), ('<', '<'), ('>', '>'), ('[', '['), (']', ']'), ('{', '{'), ('}', '}'), ('«', '«'), ('»', '»'), ('༺', '༽'), ('᚛', '᚜'), ('‹', '›'), ('⁅', '⁆'), ('⁽', '⁾'), ('₍', '₎'), ('⅀', '⅀'), ('∁', '∄'), ('∈', '∍'), ('∑', '∑'), ('∕', '∖'), ('√', '∝'), ('∟', '∢'), ('∤', '∤'), ('∦', '∦'), ('∫', '∳'), ('∹', '∹'), ('∻', '≌'), ('≒', '≕'), ('≟', '≠'), ('≢', '≢'), ('≤', '≫'), ('≮', '⊌'), ('⊏', '⊒'), ('⊘', '⊘'), ('⊢', '⊣'), ('⊦', '⊸'), ('⊾', '⊿'), ('⋉', '⋍'), ('⋐', '⋑'), ('⋖', '⋭'), ('⋰', '⋿'), ('⌈', '⌋'), ('⌠', '⌡'), ('〈', '〉'), ('❨', '❵'), 
('⟀', '⟀'), ('⟃', '⟆'), ('⟈', '⟉'), ('⟋', '⟍'), ('⟓', '⟖'), ('⟜', '⟞'), ('⟢', '⟯'), ('⦃', '⦘'), ('⦛', '⦠'), ('⦢', '⦯'), ('⦸', '⦸'), ('⧀', '⧅'), ('⧉', '⧉'), ('⧎', '⧒'), ('⧔', '⧕'), ('⧘', '⧜'), ('⧡', '⧡'), ('⧣', '⧥'), ('⧨', '⧩'), ('⧴', '⧹'), ('⧼', '⧽'), ('⨊', '⨜'), ('⨞', '⨡'), ('⨤', '⨤'), ('⨦', '⨦'), ('⨩', '⨩'), ('⨫', '⨮'), ('⨴', '⨵'), ('⨼', '⨾'), ('⩗', '⩘'), ('⩤', '⩥'), ('⩪', '⩭'), ('⩯', '⩰'), ('⩳', '⩴'), ('⩹', '⪣'), ('⪦', '⪭'), ('⪯', '⫖'), ('⫝̸', '⫝̸'), ('⫞', '⫞'), ('⫢', '⫦'), ('⫬', '⫮'), ('⫳', '⫳'), ('⫷', '⫻'), ('⫽', '⫽'), ('⯾', '⯾'), ('⸂', '⸅'), ('⸉', '⸊'), ('⸌', '⸍'), ('⸜', '⸝'), ('⸠', '⸩'), ('⹕', '⹜'), ('〈', '】'), ('〔', '〛'), ('﹙', '﹞'), ('﹤', '﹥'), ('(', ')'), ('<', '<'), ('>', '>'), ('[', '['), (']', ']'), ('{', '{'), ('}', '}'), ('⦅', '⦆'), ('「', '」'), ('𝛛', '𝛛'), ('𝜕', '𝜕'), ('𝝏', '𝝏'), ('𝞉', '𝞉'), ('𝟃', '𝟃'), ]; pub const CASE_IGNORABLE: &'static [(char, char)] = &[ ('\'', '\''), ('.', '.'), (':', ':'), ('^', '^'), ('`', '`'), ('¨', '¨'), ('\u{ad}', '\u{ad}'), ('¯', '¯'), ('´', '´'), ('·', '¸'), ('ʰ', '\u{36f}'), ('ʹ', '͵'), ('ͺ', 'ͺ'), ('΄', '΅'), ('·', '·'), ('\u{483}', '\u{489}'), ('ՙ', 'ՙ'), ('՟', '՟'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('״', '״'), ('\u{600}', '\u{605}'), ('\u{610}', '\u{61a}'), ('\u{61c}', '\u{61c}'), ('ـ', 'ـ'), ('\u{64b}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dd}'), ('\u{6df}', '\u{6e8}'), ('\u{6ea}', '\u{6ed}'), ('\u{70f}', '\u{70f}'), ('\u{711}', '\u{711}'), ('\u{730}', '\u{74a}'), ('\u{7a6}', '\u{7b0}'), ('\u{7eb}', 'ߵ'), ('ߺ', 'ߺ'), ('\u{7fd}', '\u{7fd}'), ('\u{816}', '\u{82d}'), ('\u{859}', '\u{85b}'), ('࢈', '࢈'), ('\u{890}', '\u{891}'), ('\u{898}', '\u{89f}'), ('ࣉ', '\u{902}'), ('\u{93a}', '\u{93a}'), ('\u{93c}', '\u{93c}'), ('\u{941}', '\u{948}'), ('\u{94d}', '\u{94d}'), ('\u{951}', '\u{957}'), ('\u{962}', '\u{963}'), ('ॱ', 'ॱ'), ('\u{981}', '\u{981}'), ('\u{9bc}', '\u{9bc}'), ('\u{9c1}', '\u{9c4}'), ('\u{9cd}', '\u{9cd}'), 
('\u{9e2}', '\u{9e3}'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', '\u{a02}'), ('\u{a3c}', '\u{a3c}'), ('\u{a41}', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('\u{a70}', '\u{a71}'), ('\u{a75}', '\u{a75}'), ('\u{a81}', '\u{a82}'), ('\u{abc}', '\u{abc}'), ('\u{ac1}', '\u{ac5}'), ('\u{ac7}', '\u{ac8}'), ('\u{acd}', '\u{acd}'), ('\u{ae2}', '\u{ae3}'), ('\u{afa}', '\u{aff}'), ('\u{b01}', '\u{b01}'), ('\u{b3c}', '\u{b3c}'), ('\u{b3f}', '\u{b3f}'), ('\u{b41}', '\u{b44}'), ('\u{b4d}', '\u{b4d}'), ('\u{b55}', '\u{b56}'), ('\u{b62}', '\u{b63}'), ('\u{b82}', '\u{b82}'), ('\u{bc0}', '\u{bc0}'), ('\u{bcd}', '\u{bcd}'), ('\u{c00}', '\u{c00}'), ('\u{c04}', '\u{c04}'), ('\u{c3c}', '\u{c3c}'), ('\u{c3e}', '\u{c40}'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('\u{c62}', '\u{c63}'), ('\u{c81}', '\u{c81}'), ('\u{cbc}', '\u{cbc}'), ('\u{cbf}', '\u{cbf}'), ('\u{cc6}', '\u{cc6}'), ('\u{ccc}', '\u{ccd}'), ('\u{ce2}', '\u{ce3}'), ('\u{d00}', '\u{d01}'), ('\u{d3b}', '\u{d3c}'), ('\u{d41}', '\u{d44}'), ('\u{d4d}', '\u{d4d}'), ('\u{d62}', '\u{d63}'), ('\u{d81}', '\u{d81}'), ('\u{dca}', '\u{dca}'), ('\u{dd2}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('\u{e31}', '\u{e31}'), ('\u{e34}', '\u{e3a}'), ('ๆ', '\u{e4e}'), ('\u{eb1}', '\u{eb1}'), ('\u{eb4}', '\u{ebc}'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ece}'), ('\u{f18}', '\u{f19}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('\u{f71}', '\u{f7e}'), ('\u{f80}', '\u{f84}'), ('\u{f86}', '\u{f87}'), ('\u{f8d}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('\u{102d}', '\u{1030}'), ('\u{1032}', '\u{1037}'), ('\u{1039}', '\u{103a}'), ('\u{103d}', '\u{103e}'), ('\u{1058}', '\u{1059}'), ('\u{105e}', '\u{1060}'), ('\u{1071}', '\u{1074}'), ('\u{1082}', '\u{1082}'), ('\u{1085}', '\u{1086}'), ('\u{108d}', '\u{108d}'), ('\u{109d}', '\u{109d}'), ('ჼ', 'ჼ'), ('\u{135d}', '\u{135f}'), ('\u{1712}', '\u{1714}'), ('\u{1732}', '\u{1733}'), ('\u{1752}', '\u{1753}'), 
('\u{1772}', '\u{1773}'), ('\u{17b4}', '\u{17b5}'), ('\u{17b7}', '\u{17bd}'), ('\u{17c6}', '\u{17c6}'), ('\u{17c9}', '\u{17d3}'), ('ៗ', 'ៗ'), ('\u{17dd}', '\u{17dd}'), ('\u{180b}', '\u{180f}'), ('ᡃ', 'ᡃ'), ('\u{1885}', '\u{1886}'), ('\u{18a9}', '\u{18a9}'), ('\u{1920}', '\u{1922}'), ('\u{1927}', '\u{1928}'), ('\u{1932}', '\u{1932}'), ('\u{1939}', '\u{193b}'), ('\u{1a17}', '\u{1a18}'), ('\u{1a1b}', '\u{1a1b}'), ('\u{1a56}', '\u{1a56}'), ('\u{1a58}', '\u{1a5e}'), ('\u{1a60}', '\u{1a60}'), ('\u{1a62}', '\u{1a62}'), ('\u{1a65}', '\u{1a6c}'), ('\u{1a73}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a7f}'), ('ᪧ', 'ᪧ'), ('\u{1ab0}', '\u{1ace}'), ('\u{1b00}', '\u{1b03}'), ('\u{1b34}', '\u{1b34}'), ('\u{1b36}', '\u{1b3a}'), ('\u{1b3c}', '\u{1b3c}'), ('\u{1b42}', '\u{1b42}'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '\u{1b81}'), ('\u{1ba2}', '\u{1ba5}'), ('\u{1ba8}', '\u{1ba9}'), ('\u{1bab}', '\u{1bad}'), ('\u{1be6}', '\u{1be6}'), ('\u{1be8}', '\u{1be9}'), ('\u{1bed}', '\u{1bed}'), ('\u{1bef}', '\u{1bf1}'), ('\u{1c2c}', '\u{1c33}'), ('\u{1c36}', '\u{1c37}'), ('ᱸ', 'ᱽ'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1ce0}'), ('\u{1ce2}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('\u{1cf8}', '\u{1cf9}'), ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', '\u{1dff}'), ('᾽', '᾽'), ('᾿', '῁'), ('῍', '῏'), ('῝', '῟'), ('῭', '`'), ('´', '῾'), ('\u{200b}', '\u{200f}'), ('‘', '’'), ('․', '․'), ('‧', '‧'), ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{2064}'), ('\u{2066}', '\u{206f}'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('\u{20d0}', '\u{20f0}'), ('ⱼ', 'ⱽ'), ('\u{2cef}', '\u{2cf1}'), ('ⵯ', 'ⵯ'), ('\u{2d7f}', '\u{2d7f}'), ('\u{2de0}', '\u{2dff}'), ('ⸯ', 'ⸯ'), ('々', '々'), ('\u{302a}', '\u{302d}'), ('〱', '〵'), ('〻', '〻'), ('\u{3099}', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), ('ꓸ', 'ꓽ'), ('ꘌ', 'ꘌ'), ('\u{a66f}', '\u{a672}'), ('\u{a674}', '\u{a67d}'), ('ꙿ', 'ꙿ'), ('ꚜ', '\u{a69f}'), ('\u{a6f0}', '\u{a6f1}'), ('꜀', '꜡'), ('ꝰ', 'ꝰ'), ('ꞈ', '꞊'), ('ꟲ', 'ꟴ'), ('ꟸ', 'ꟹ'), ('\u{a802}', '\u{a802}'), ('\u{a806}', '\u{a806}'), 
('\u{a80b}', '\u{a80b}'), ('\u{a825}', '\u{a826}'), ('\u{a82c}', '\u{a82c}'), ('\u{a8c4}', '\u{a8c5}'), ('\u{a8e0}', '\u{a8f1}'), ('\u{a8ff}', '\u{a8ff}'), ('\u{a926}', '\u{a92d}'), ('\u{a947}', '\u{a951}'), ('\u{a980}', '\u{a982}'), ('\u{a9b3}', '\u{a9b3}'), ('\u{a9b6}', '\u{a9b9}'), ('\u{a9bc}', '\u{a9bd}'), ('ꧏ', 'ꧏ'), ('\u{a9e5}', 'ꧦ'), ('\u{aa29}', '\u{aa2e}'), ('\u{aa31}', '\u{aa32}'), ('\u{aa35}', '\u{aa36}'), ('\u{aa43}', '\u{aa43}'), ('\u{aa4c}', '\u{aa4c}'), ('ꩰ', 'ꩰ'), ('\u{aa7c}', '\u{aa7c}'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabf}'), ('\u{aac1}', '\u{aac1}'), ('ꫝ', 'ꫝ'), ('\u{aaec}', '\u{aaed}'), ('ꫳ', 'ꫴ'), ('\u{aaf6}', '\u{aaf6}'), ('꭛', 'ꭟ'), ('ꭩ', '꭫'), ('\u{abe5}', '\u{abe5}'), ('\u{abe8}', '\u{abe8}'), ('\u{abed}', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('﮲', '﯂'), ('\u{fe00}', '\u{fe0f}'), ('︓', '︓'), ('\u{fe20}', '\u{fe2f}'), ('﹒', '﹒'), ('﹕', '﹕'), ('\u{feff}', '\u{feff}'), (''', '''), ('.', '.'), (':', ':'), ('^', '^'), ('`', '`'), ('ー', 'ー'), ('\u{ff9e}', '\u{ff9f}'), (' ̄', ' ̄'), ('\u{fff9}', '\u{fffb}'), ('\u{101fd}', '\u{101fd}'), ('\u{102e0}', '\u{102e0}'), ('\u{10376}', '\u{1037a}'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('\u{10a01}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '\u{10a0f}'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10ae5}', '\u{10ae6}'), ('\u{10d24}', '\u{10d27}'), ('\u{10eab}', '\u{10eac}'), ('\u{10efd}', '\u{10eff}'), ('\u{10f46}', '\u{10f50}'), ('\u{10f82}', '\u{10f85}'), ('\u{11001}', '\u{11001}'), ('\u{11038}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{11073}', '\u{11074}'), ('\u{1107f}', '\u{11081}'), ('\u{110b3}', '\u{110b6}'), ('\u{110b9}', '\u{110ba}'), ('\u{110bd}', '\u{110bd}'), ('\u{110c2}', '\u{110c2}'), ('\u{110cd}', '\u{110cd}'), ('\u{11100}', '\u{11102}'), ('\u{11127}', '\u{1112b}'), ('\u{1112d}', '\u{11134}'), ('\u{11173}', '\u{11173}'), ('\u{11180}', '\u{11181}'), ('\u{111b6}', '\u{111be}'), ('\u{111c9}', 
'\u{111cc}'), ('\u{111cf}', '\u{111cf}'), ('\u{1122f}', '\u{11231}'), ('\u{11234}', '\u{11234}'), ('\u{11236}', '\u{11237}'), ('\u{1123e}', '\u{1123e}'), ('\u{11241}', '\u{11241}'), ('\u{112df}', '\u{112df}'), ('\u{112e3}', '\u{112ea}'), ('\u{11300}', '\u{11301}'), ('\u{1133b}', '\u{1133c}'), ('\u{11340}', '\u{11340}'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('\u{11438}', '\u{1143f}'), ('\u{11442}', '\u{11444}'), ('\u{11446}', '\u{11446}'), ('\u{1145e}', '\u{1145e}'), ('\u{114b3}', '\u{114b8}'), ('\u{114ba}', '\u{114ba}'), ('\u{114bf}', '\u{114c0}'), ('\u{114c2}', '\u{114c3}'), ('\u{115b2}', '\u{115b5}'), ('\u{115bc}', '\u{115bd}'), ('\u{115bf}', '\u{115c0}'), ('\u{115dc}', '\u{115dd}'), ('\u{11633}', '\u{1163a}'), ('\u{1163d}', '\u{1163d}'), ('\u{1163f}', '\u{11640}'), ('\u{116ab}', '\u{116ab}'), ('\u{116ad}', '\u{116ad}'), ('\u{116b0}', '\u{116b5}'), ('\u{116b7}', '\u{116b7}'), ('\u{1171d}', '\u{1171f}'), ('\u{11722}', '\u{11725}'), ('\u{11727}', '\u{1172b}'), ('\u{1182f}', '\u{11837}'), ('\u{11839}', '\u{1183a}'), ('\u{1193b}', '\u{1193c}'), ('\u{1193e}', '\u{1193e}'), ('\u{11943}', '\u{11943}'), ('\u{119d4}', '\u{119d7}'), ('\u{119da}', '\u{119db}'), ('\u{119e0}', '\u{119e0}'), ('\u{11a01}', '\u{11a0a}'), ('\u{11a33}', '\u{11a38}'), ('\u{11a3b}', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a51}', '\u{11a56}'), ('\u{11a59}', '\u{11a5b}'), ('\u{11a8a}', '\u{11a96}'), ('\u{11a98}', '\u{11a99}'), ('\u{11c30}', '\u{11c36}'), ('\u{11c38}', '\u{11c3d}'), ('\u{11c3f}', '\u{11c3f}'), ('\u{11c92}', '\u{11ca7}'), ('\u{11caa}', '\u{11cb0}'), ('\u{11cb2}', '\u{11cb3}'), ('\u{11cb5}', '\u{11cb6}'), ('\u{11d31}', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d45}'), ('\u{11d47}', '\u{11d47}'), ('\u{11d90}', '\u{11d91}'), ('\u{11d95}', '\u{11d95}'), ('\u{11d97}', '\u{11d97}'), ('\u{11ef3}', '\u{11ef4}'), ('\u{11f00}', '\u{11f01}'), ('\u{11f36}', '\u{11f3a}'), ('\u{11f40}', '\u{11f40}'), ('\u{11f42}', 
'\u{11f42}'), ('\u{13430}', '\u{13440}'), ('\u{13447}', '\u{13455}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b30}', '\u{16b36}'), ('𖭀', '𖭃'), ('\u{16f4f}', '\u{16f4f}'), ('\u{16f8f}', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '\u{16fe4}'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1bca0}', '\u{1bca3}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d167}', '\u{1d169}'), ('\u{1d173}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ('\u{1e130}', '𞄽'), ('\u{1e2ae}', '\u{1e2ae}'), ('\u{1e2ec}', '\u{1e2ef}'), ('𞓫', '\u{1e4ef}'), ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1e944}', '𞥋'), ('🏻', '🏿'), ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const CASED: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ƺ'), ('Ƽ', 'ƿ'), ('DŽ', 'ʓ'), ('ʕ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), ('\u{345}', '\u{345}'), ('Ͱ', 'ͳ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՠ', 'ֈ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ჿ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℴ'), 
('ℹ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ⅿ'), ('Ↄ', 'ↄ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('Ꙁ', 'ꙭ'), ('Ꚁ', 'ꚝ'), ('Ꜣ', 'ꞇ'), ('Ꞌ', 'ꞎ'), ('Ꞑ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꟶ'), ('ꟸ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('A', 'Z'), ('a', 'z'), ('𐐀', '𐑏'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐞀', '𐞀'), ('𐞃', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑢠', '𑣟'), ('𖹀', '𖹿'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝼀', '𝼉'), ('𝼋', '𝼞'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('𞤀', '𞥃'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), ]; pub const CHANGES_WHEN_CASEFOLDED: &'static [(char, char)] = &[ ('A', 'Z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ß'), ('Ā', 'Ā'), ('Ă', 'Ă'), ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('ʼn', 'Ŋ'), ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), ('ſ', 'ſ'), ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), ('Ɠ', 'Ɣ'), ('Ɩ', 
'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'Dž'), ('LJ', 'Lj'), ('NJ', 'Nj'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'Dz'), ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('\u{345}', '\u{345}'), ('Ͱ', 'Ͱ'), ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('ς', 'ς'), ('Ϗ', 'ϑ'), ('ϕ', 'ϖ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϰ', 'ϱ'), ('ϴ', 'ϵ'), ('Ϸ', 'Ϸ'), ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 
'Ӥ'), ('Ӧ', 'Ӧ'), ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('և', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), ('ẚ', 'ẛ'), ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), 
('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('ᾀ', 'ᾯ'), ('ᾲ', 'ᾴ'), ('ᾷ', 'ᾼ'), ('ῂ', 'ῄ'), ('ῇ', 'ῌ'), ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῷ', 'ῼ'), ('Ω', 'Ω'), ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), ('Ⓐ', 'Ⓩ'), ('Ⰰ', 'Ⱟ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), ('Ꝯ', 
'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('Ꞹ', 'Ꞹ'), ('Ꞻ', 'Ꞻ'), ('Ꞽ', 'Ꞽ'), ('Ꞿ', 'Ꞿ'), ('Ꟁ', 'Ꟁ'), ('Ꟃ', 'Ꟃ'), ('Ꞔ', 'Ꟈ'), ('Ꟊ', 'Ꟊ'), ('Ꟑ', 'Ꟑ'), ('Ꟗ', 'Ꟗ'), ('Ꟙ', 'Ꟙ'), ('Ꟶ', 'Ꟶ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𖹀', '𖹟'), ('𞤀', '𞤡'), ]; pub const CHANGES_WHEN_CASEMAPPED: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('µ', 'µ'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ķ'), ('Ĺ', 'ƌ'), ('Ǝ', 'ƚ'), ('Ɯ', 'Ʃ'), ('Ƭ', 'ƹ'), ('Ƽ', 'ƽ'), ('ƿ', 'ƿ'), ('DŽ', 'Ƞ'), ('Ȣ', 'ȳ'), ('Ⱥ', 'ɔ'), ('ɖ', 'ɗ'), ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), ('ɥ', 'ɦ'), ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), ('ɽ', 'ɽ'), ('ʀ', 'ʀ'), ('ʂ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), ('ʝ', 'ʞ'), ('\u{345}', '\u{345}'), ('Ͱ', 'ͳ'), ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϑ'), ('ϕ', 'ϵ'), ('Ϸ', 'ϻ'), ('Ͻ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ա', 'և'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჽ', 'ჿ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᵹ', 'ᵹ'), ('ᵽ', 'ᵽ'), ('ᶎ', 'ᶎ'), ('Ḁ', 'ẛ'), ('ẞ', 'ẞ'), ('Ạ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('Ω', 'Ω'), ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ⅿ'), ('Ↄ', 'ↄ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'Ɒ'), ('Ⱳ', 'ⱳ'), ('Ⱶ', 'ⱶ'), ('Ȿ', 'ⳣ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('Ꙁ', 'ꙭ'), ('Ꚁ', 'ꚛ'), ('Ꜣ', 'ꜯ'), ('Ꜳ', 'ꝯ'), ('Ꝺ', 'ꞇ'), ('Ꞌ', 'Ɥ'), ('Ꞑ', 'ꞔ'), ('Ꞗ', 'Ɪ'), ('Ʞ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('Ꟗ', 'ꟙ'), ('Ꟶ', 'ꟶ'), 
('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('A', 'Z'), ('a', 'z'), ('𐐀', '𐑏'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𑢠', '𑣟'), ('𖹀', '𖹿'), ('𞤀', '𞥃'), ]; pub const CHANGES_WHEN_LOWERCASED: &'static [(char, char)] = &[ ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'Dž'), ('LJ', 'Lj'), ('NJ', 'Nj'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'Dz'), ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), 
('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('Ꭰ', 'Ᏽ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), ('Ḷ', 
'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('ᾈ', 'ᾏ'), ('ᾘ', 'ᾟ'), ('ᾨ', 'ᾯ'), ('Ᾰ', 'ᾼ'), ('Ὲ', 'ῌ'), ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'ῼ'), ('Ω', 'Ω'), ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), ('Ⓐ', 'Ⓩ'), ('Ⰰ', 'Ⱟ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), 
('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('Ꞹ', 'Ꞹ'), ('Ꞻ', 'Ꞻ'), ('Ꞽ', 'Ꞽ'), ('Ꞿ', 'Ꞿ'), ('Ꟁ', 'Ꟁ'), ('Ꟃ', 'Ꟃ'), ('Ꞔ', 'Ꟈ'), ('Ꟊ', 'Ꟊ'), ('Ꟑ', 'Ꟑ'), ('Ꟗ', 'Ꟗ'), ('Ꟙ', 'Ꟙ'), ('Ꟶ', 'Ꟶ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𖹀', '𖹟'), ('𞤀', '𞤡'), ]; pub const CHANGES_WHEN_TITLECASED: &'static [(char, char)] = &[ ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), 
('ħ', 'ħ'), ('ĩ', 'ĩ'), ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), ('ĵ', 'ĵ'), ('ķ', 'ķ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƌ'), ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƚ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƭ', 'ƭ'), ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƹ'), ('ƽ', 'ƽ'), ('ƿ', 'ƿ'), ('DŽ', 'DŽ'), ('dž', 'LJ'), ('lj', 'NJ'), ('nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'DZ'), ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), ('ȳ', 'ȳ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ɔ'), ('ɖ', 'ɗ'), ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), ('ɥ', 'ɦ'), ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), ('ɽ', 'ɽ'), ('ʀ', 'ʀ'), ('ʂ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), ('ʝ', 'ʞ'), ('\u{345}', '\u{345}'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϻ'), ('а', 'џ'), ('ѡ', 'ѡ'), ('ѣ', 
'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ա', 'և'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᵹ', 'ᵹ'), ('ᵽ', 'ᵽ'), ('ᶎ', 'ᶎ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), 
('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẛ'), ('ạ', 'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾇ'), ('ᾐ', 'ᾗ'), ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), ('ᾶ', 'ᾷ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ⅎ', 'ⅎ'), ('ⅰ', 'ⅿ'), ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), ('ⰰ', 'ⱟ'), ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), ('ⱳ', 'ⱳ'), ('ⱶ', 'ⱶ'), ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳣ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 
'ꙥ'), ('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚛ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜯ'), ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), ('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞔ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), ('ꞹ', 'ꞹ'), ('ꞻ', 'ꞻ'), ('ꞽ', 'ꞽ'), ('ꞿ', 'ꞿ'), ('ꟁ', 'ꟁ'), ('ꟃ', 'ꟃ'), ('ꟈ', 'ꟈ'), ('ꟊ', 'ꟊ'), ('ꟑ', 'ꟑ'), ('ꟗ', 'ꟗ'), ('ꟙ', 'ꟙ'), ('ꟶ', 'ꟶ'), ('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𖹠', '𖹿'), ('𞤢', '𞥃'), ]; pub const CHANGES_WHEN_UPPERCASED: &'static [(char, char)] = &[ ('a', 'z'), ('µ', 'µ'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), ('ĵ', 'ĵ'), ('ķ', 'ķ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), 
('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƌ'), ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƚ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƭ', 'ƭ'), ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƹ'), ('ƽ', 'ƽ'), ('ƿ', 'ƿ'), ('Dž', 'dž'), ('Lj', 'lj'), ('Nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), ('Dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), ('ȳ', 'ȳ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ɔ'), ('ɖ', 'ɗ'), ('ə', 'ə'), ('ɛ', 'ɜ'), ('ɠ', 'ɡ'), ('ɣ', 'ɣ'), ('ɥ', 'ɦ'), ('ɨ', 'ɬ'), ('ɯ', 'ɯ'), ('ɱ', 'ɲ'), ('ɵ', 'ɵ'), ('ɽ', 'ɽ'), ('ʀ', 'ʀ'), ('ʂ', 'ʃ'), ('ʇ', 'ʌ'), ('ʒ', 'ʒ'), ('ʝ', 'ʞ'), ('\u{345}', '\u{345}'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϻ'), ('а', 'џ'), ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), 
('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ա', 'և'), ('ა', 'ჺ'), ('ჽ', 'ჿ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᵹ', 'ᵹ'), ('ᵽ', 'ᵽ'), ('ᶎ', 'ᶎ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẛ'), ('ạ', 'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 
'ệ'), ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾷ'), ('ᾼ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), ('ῌ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ῼ', 'ῼ'), ('ⅎ', 'ⅎ'), ('ⅰ', 'ⅿ'), ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), ('ⰰ', 'ⱟ'), ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), ('ⱳ', 'ⱳ'), ('ⱶ', 'ⱶ'), ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳣ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚛ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜯ'), ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), ('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), 
('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝯ'), ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞔ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), ('ꞹ', 'ꞹ'), ('ꞻ', 'ꞻ'), ('ꞽ', 'ꞽ'), ('ꞿ', 'ꞿ'), ('ꟁ', 'ꟁ'), ('ꟃ', 'ꟃ'), ('ꟈ', 'ꟈ'), ('ꟊ', 'ꟊ'), ('ꟑ', 'ꟑ'), ('ꟗ', 'ꟗ'), ('ꟙ', 'ꟙ'), ('ꟶ', 'ꟶ'), ('ꭓ', 'ꭓ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𖹠', '𖹿'), ('𞤢', '𞥃'), ]; pub const DASH: &'static [(char, char)] = &[ ('-', '-'), ('֊', '֊'), ('־', '־'), ('᐀', '᐀'), ('᠆', '᠆'), ('‐', '―'), ('⁓', '⁓'), ('⁻', '⁻'), ('₋', '₋'), ('−', '−'), ('⸗', '⸗'), ('⸚', '⸚'), ('⸺', '⸻'), ('⹀', '⹀'), ('⹝', '⹝'), ('〜', '〜'), ('〰', '〰'), ('゠', '゠'), ('︱', '︲'), ('﹘', '﹘'), ('﹣', '﹣'), ('-', '-'), ('𐺭', '𐺭'), ]; pub const DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ ('\u{ad}', '\u{ad}'), ('\u{34f}', '\u{34f}'), ('\u{61c}', '\u{61c}'), ('ᅟ', 'ᅠ'), ('\u{17b4}', '\u{17b5}'), ('\u{180b}', '\u{180f}'), ('\u{200b}', '\u{200f}'), ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{206f}'), ('ㅤ', 'ㅤ'), ('\u{fe00}', '\u{fe0f}'), ('\u{feff}', '\u{feff}'), ('ᅠ', 'ᅠ'), ('\u{fff0}', '\u{fff8}'), ('\u{1bca0}', '\u{1bca3}'), ('\u{1d173}', '\u{1d17a}'), ('\u{e0000}', '\u{e0fff}'), ]; pub const DEPRECATED: &'static [(char, char)] = &[ ('ʼn', 'ʼn'), ('ٳ', 'ٳ'), ('\u{f77}', '\u{f77}'), ('\u{f79}', '\u{f79}'), ('ឣ', 'ឤ'), ('\u{206a}', '\u{206f}'), ('〈', '〉'), ('\u{e0001}', '\u{e0001}'), ]; pub const DIACRITIC: &'static [(char, char)] = &[ ('^', '^'), ('`', '`'), ('¨', '¨'), ('¯', '¯'), ('´', 
'´'), ('·', '¸'), ('ʰ', '\u{34e}'), ('\u{350}', '\u{357}'), ('\u{35d}', '\u{362}'), ('ʹ', '͵'), ('ͺ', 'ͺ'), ('΄', '΅'), ('\u{483}', '\u{487}'), ('ՙ', 'ՙ'), ('\u{591}', '\u{5a1}'), ('\u{5a3}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c4}'), ('\u{64b}', '\u{652}'), ('\u{657}', '\u{658}'), ('\u{6df}', '\u{6e0}'), ('ۥ', 'ۦ'), ('\u{6ea}', '\u{6ec}'), ('\u{730}', '\u{74a}'), ('\u{7a6}', '\u{7b0}'), ('\u{7eb}', 'ߵ'), ('\u{818}', '\u{819}'), ('\u{898}', '\u{89f}'), ('ࣉ', '\u{8d2}'), ('\u{8e3}', '\u{8fe}'), ('\u{93c}', '\u{93c}'), ('\u{94d}', '\u{94d}'), ('\u{951}', '\u{954}'), ('ॱ', 'ॱ'), ('\u{9bc}', '\u{9bc}'), ('\u{9cd}', '\u{9cd}'), ('\u{a3c}', '\u{a3c}'), ('\u{a4d}', '\u{a4d}'), ('\u{abc}', '\u{abc}'), ('\u{acd}', '\u{acd}'), ('\u{afd}', '\u{aff}'), ('\u{b3c}', '\u{b3c}'), ('\u{b4d}', '\u{b4d}'), ('\u{b55}', '\u{b55}'), ('\u{bcd}', '\u{bcd}'), ('\u{c3c}', '\u{c3c}'), ('\u{c4d}', '\u{c4d}'), ('\u{cbc}', '\u{cbc}'), ('\u{ccd}', '\u{ccd}'), ('\u{d3b}', '\u{d3c}'), ('\u{d4d}', '\u{d4d}'), ('\u{dca}', '\u{dca}'), ('\u{e47}', '\u{e4c}'), ('\u{e4e}', '\u{e4e}'), ('\u{eba}', '\u{eba}'), ('\u{ec8}', '\u{ecc}'), ('\u{f18}', '\u{f19}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('༾', '༿'), ('\u{f82}', '\u{f84}'), ('\u{f86}', '\u{f87}'), ('\u{fc6}', '\u{fc6}'), ('\u{1037}', '\u{1037}'), ('\u{1039}', '\u{103a}'), ('ၣ', 'ၤ'), ('ၩ', 'ၭ'), ('ႇ', '\u{108d}'), ('ႏ', 'ႏ'), ('ႚ', 'ႛ'), ('\u{135d}', '\u{135f}'), ('\u{1714}', '᜕'), ('\u{17c9}', '\u{17d3}'), ('\u{17dd}', '\u{17dd}'), ('\u{1939}', '\u{193b}'), ('\u{1a75}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a7f}'), ('\u{1ab0}', '\u{1abe}'), ('\u{1ac1}', '\u{1acb}'), ('\u{1b34}', '\u{1b34}'), ('᭄', '᭄'), ('\u{1b6b}', '\u{1b73}'), ('᮪', '\u{1bab}'), ('\u{1c36}', '\u{1c37}'), ('ᱸ', 'ᱽ'), ('\u{1cd0}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('᳷', '\u{1cf9}'), ('ᴬ', 'ᵪ'), ('\u{1dc4}', '\u{1dcf}'), ('\u{1df5}', '\u{1dff}'), ('᾽', '᾽'), ('᾿', '῁'), ('῍', '῏'), 
('῝', '῟'), ('῭', '`'), ('´', '῾'), ('\u{2cef}', '\u{2cf1}'), ('ⸯ', 'ⸯ'), ('\u{302a}', '\u{302f}'), ('\u{3099}', '゜'), ('ー', 'ー'), ('\u{a66f}', '\u{a66f}'), ('\u{a67c}', '\u{a67d}'), ('ꙿ', 'ꙿ'), ('ꚜ', 'ꚝ'), ('\u{a6f0}', '\u{a6f1}'), ('꜀', '꜡'), ('ꞈ', '꞊'), ('ꟸ', 'ꟹ'), ('\u{a8c4}', '\u{a8c4}'), ('\u{a8e0}', '\u{a8f1}'), ('\u{a92b}', '꤮'), ('꥓', '꥓'), ('\u{a9b3}', '\u{a9b3}'), ('꧀', '꧀'), ('\u{a9e5}', '\u{a9e5}'), ('ꩻ', 'ꩽ'), ('\u{aabf}', 'ꫂ'), ('\u{aaf6}', '\u{aaf6}'), ('꭛', 'ꭟ'), ('ꭩ', '꭫'), ('꯬', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('\u{fe20}', '\u{fe2f}'), ('^', '^'), ('`', '`'), ('ー', 'ー'), ('\u{ff9e}', '\u{ff9f}'), (' ̄', ' ̄'), ('\u{102e0}', '\u{102e0}'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('\u{10ae5}', '\u{10ae6}'), ('𐴢', '\u{10d27}'), ('\u{10efd}', '\u{10eff}'), ('\u{10f46}', '\u{10f50}'), ('\u{10f82}', '\u{10f85}'), ('\u{11046}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{110b9}', '\u{110ba}'), ('\u{11133}', '\u{11134}'), ('\u{11173}', '\u{11173}'), ('𑇀', '𑇀'), ('\u{111ca}', '\u{111cc}'), ('𑈵', '\u{11236}'), ('\u{112e9}', '\u{112ea}'), ('\u{1133c}', '\u{1133c}'), ('𑍍', '𑍍'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('\u{11442}', '\u{11442}'), ('\u{11446}', '\u{11446}'), ('\u{114c2}', '\u{114c3}'), ('\u{115bf}', '\u{115c0}'), ('\u{1163f}', '\u{1163f}'), ('𑚶', '\u{116b7}'), ('\u{1172b}', '\u{1172b}'), ('\u{11839}', '\u{1183a}'), ('𑤽', '\u{1193e}'), ('\u{11943}', '\u{11943}'), ('\u{119e0}', '\u{119e0}'), ('\u{11a34}', '\u{11a34}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a99}', '\u{11a99}'), ('\u{11c3f}', '\u{11c3f}'), ('\u{11d42}', '\u{11d42}'), ('\u{11d44}', '\u{11d45}'), ('\u{11d97}', '\u{11d97}'), ('\u{13447}', '\u{13455}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b30}', '\u{16b36}'), ('\u{16f8f}', '𖾟'), ('𖿰', '𖿱'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d167}', '\u{1d169}'), ('𝅭', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('𞀰', 
'𞁭'), ('\u{1e130}', '\u{1e136}'), ('\u{1e2ae}', '\u{1e2ae}'), ('\u{1e2ec}', '\u{1e2ef}'), ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1e944}', '\u{1e946}'), ('\u{1e948}', '\u{1e94a}'), ]; pub const EMOJI: &'static [(char, char)] = &[ ('#', '#'), ('*', '*'), ('0', '9'), ('©', '©'), ('®', '®'), ('‼', '‼'), ('⁉', '⁉'), ('™', '™'), ('ℹ', 'ℹ'), ('↔', '↙'), ('↩', '↪'), ('⌚', '⌛'), ('⌨', '⌨'), ('⏏', '⏏'), ('⏩', '⏳'), ('⏸', '⏺'), ('Ⓜ', 'Ⓜ'), ('▪', '▫'), ('▶', '▶'), ('◀', '◀'), ('◻', '◾'), ('☀', '☄'), ('☎', '☎'), ('☑', '☑'), ('☔', '☕'), ('☘', '☘'), ('☝', '☝'), ('☠', '☠'), ('☢', '☣'), ('☦', '☦'), ('☪', '☪'), ('☮', '☯'), ('☸', '☺'), ('♀', '♀'), ('♂', '♂'), ('♈', '♓'), ('♟', '♠'), ('♣', '♣'), ('♥', '♦'), ('♨', '♨'), ('♻', '♻'), ('♾', '♿'), ('⚒', '⚗'), ('⚙', '⚙'), ('⚛', '⚜'), ('⚠', '⚡'), ('⚧', '⚧'), ('⚪', '⚫'), ('⚰', '⚱'), ('⚽', '⚾'), ('⛄', '⛅'), ('⛈', '⛈'), ('⛎', '⛏'), ('⛑', '⛑'), ('⛓', '⛔'), ('⛩', '⛪'), ('⛰', '⛵'), ('⛷', '⛺'), ('⛽', '⛽'), ('✂', '✂'), ('✅', '✅'), ('✈', '✍'), ('✏', '✏'), ('✒', '✒'), ('✔', '✔'), ('✖', '✖'), ('✝', '✝'), ('✡', '✡'), ('✨', '✨'), ('✳', '✴'), ('❄', '❄'), ('❇', '❇'), ('❌', '❌'), ('❎', '❎'), ('❓', '❕'), ('❗', '❗'), ('❣', '❤'), ('➕', '➗'), ('➡', '➡'), ('➰', '➰'), ('➿', '➿'), ('⤴', '⤵'), ('⬅', '⬇'), ('⬛', '⬜'), ('⭐', '⭐'), ('⭕', '⭕'), ('〰', '〰'), ('〽', '〽'), ('㊗', '㊗'), ('㊙', '㊙'), ('🀄', '🀄'), ('🃏', '🃏'), ('🅰', '🅱'), ('🅾', '🅿'), ('🆎', '🆎'), ('🆑', '🆚'), ('🇦', '🇿'), ('🈁', '🈂'), ('🈚', '🈚'), ('🈯', '🈯'), ('🈲', '🈺'), ('🉐', '🉑'), ('🌀', '🌡'), ('🌤', '🎓'), ('🎖', '🎗'), ('🎙', '🎛'), ('🎞', '🏰'), ('🏳', '🏵'), ('🏷', '📽'), ('📿', '🔽'), ('🕉', '🕎'), ('🕐', '🕧'), ('🕯', '🕰'), ('🕳', '🕺'), ('🖇', '🖇'), ('🖊', '🖍'), ('🖐', '🖐'), ('🖕', '🖖'), ('🖤', '🖥'), ('🖨', '🖨'), ('🖱', '🖲'), ('🖼', '🖼'), ('🗂', '🗄'), ('🗑', '🗓'), ('🗜', '🗞'), ('🗡', '🗡'), ('🗣', '🗣'), ('🗨', '🗨'), ('🗯', '🗯'), ('🗳', '🗳'), ('🗺', '🙏'), ('🚀', '🛅'), ('🛋', '🛒'), ('🛕', '🛗'), ('🛜', '🛥'), ('🛩', '🛩'), ('🛫', '🛬'), ('🛰', '🛰'), ('🛳', '🛼'), ('🟠', '🟫'), ('🟰', '🟰'), ('🤌', '🤺'), ('🤼', '🥅'), ('🥇', '🧿'), ('🩰', '🩼'), ('🪀', '🪈'), ('🪐', '🪽'), ('🪿', '🫅'), 
('🫎', '🫛'), ('🫠', '🫨'), ('🫰', '🫸'), ]; pub const EMOJI_COMPONENT: &'static [(char, char)] = &[ ('#', '#'), ('*', '*'), ('0', '9'), ('\u{200d}', '\u{200d}'), ('\u{20e3}', '\u{20e3}'), ('\u{fe0f}', '\u{fe0f}'), ('🇦', '🇿'), ('🏻', '🏿'), ('🦰', '🦳'), ('\u{e0020}', '\u{e007f}'), ]; pub const EMOJI_MODIFIER: &'static [(char, char)] = &[('🏻', '🏿')]; pub const EMOJI_MODIFIER_BASE: &'static [(char, char)] = &[ ('☝', '☝'), ('⛹', '⛹'), ('✊', '✍'), ('🎅', '🎅'), ('🏂', '🏄'), ('🏇', '🏇'), ('🏊', '🏌'), ('👂', '👃'), ('👆', '👐'), ('👦', '👸'), ('👼', '👼'), ('💁', '💃'), ('💅', '💇'), ('💏', '💏'), ('💑', '💑'), ('💪', '💪'), ('🕴', '🕵'), ('🕺', '🕺'), ('🖐', '🖐'), ('🖕', '🖖'), ('🙅', '🙇'), ('🙋', '🙏'), ('🚣', '🚣'), ('🚴', '🚶'), ('🛀', '🛀'), ('🛌', '🛌'), ('🤌', '🤌'), ('🤏', '🤏'), ('🤘', '🤟'), ('🤦', '🤦'), ('🤰', '🤹'), ('🤼', '🤾'), ('🥷', '🥷'), ('🦵', '🦶'), ('🦸', '🦹'), ('🦻', '🦻'), ('🧍', '🧏'), ('🧑', '🧝'), ('🫃', '🫅'), ('🫰', '🫸'), ]; pub const EMOJI_PRESENTATION: &'static [(char, char)] = &[ ('⌚', '⌛'), ('⏩', '⏬'), ('⏰', '⏰'), ('⏳', '⏳'), ('◽', '◾'), ('☔', '☕'), ('♈', '♓'), ('♿', '♿'), ('⚓', '⚓'), ('⚡', '⚡'), ('⚪', '⚫'), ('⚽', '⚾'), ('⛄', '⛅'), ('⛎', '⛎'), ('⛔', '⛔'), ('⛪', '⛪'), ('⛲', '⛳'), ('⛵', '⛵'), ('⛺', '⛺'), ('⛽', '⛽'), ('✅', '✅'), ('✊', '✋'), ('✨', '✨'), ('❌', '❌'), ('❎', '❎'), ('❓', '❕'), ('❗', '❗'), ('➕', '➗'), ('➰', '➰'), ('➿', '➿'), ('⬛', '⬜'), ('⭐', '⭐'), ('⭕', '⭕'), ('🀄', '🀄'), ('🃏', '🃏'), ('🆎', '🆎'), ('🆑', '🆚'), ('🇦', '🇿'), ('🈁', '🈁'), ('🈚', '🈚'), ('🈯', '🈯'), ('🈲', '🈶'), ('🈸', '🈺'), ('🉐', '🉑'), ('🌀', '🌠'), ('🌭', '🌵'), ('🌷', '🍼'), ('🍾', '🎓'), ('🎠', '🏊'), ('🏏', '🏓'), ('🏠', '🏰'), ('🏴', '🏴'), ('🏸', '🐾'), ('👀', '👀'), ('👂', '📼'), ('📿', '🔽'), ('🕋', '🕎'), ('🕐', '🕧'), ('🕺', '🕺'), ('🖕', '🖖'), ('🖤', '🖤'), ('🗻', '🙏'), ('🚀', '🛅'), ('🛌', '🛌'), ('🛐', '🛒'), ('🛕', '🛗'), ('🛜', '🛟'), ('🛫', '🛬'), ('🛴', '🛼'), ('🟠', '🟫'), ('🟰', '🟰'), ('🤌', '🤺'), ('🤼', '🥅'), ('🥇', '🧿'), ('🩰', '🩼'), ('🪀', '🪈'), ('🪐', '🪽'), ('🪿', '🫅'), ('🫎', '🫛'), ('🫠', '🫨'), ('🫰', '🫸'), ]; pub const EXTENDED_PICTOGRAPHIC: &'static [(char, char)] = &[ ('©', '©'), ('®', 
'®'), ('‼', '‼'), ('⁉', '⁉'), ('™', '™'), ('ℹ', 'ℹ'), ('↔', '↙'), ('↩', '↪'), ('⌚', '⌛'), ('⌨', '⌨'), ('⎈', '⎈'), ('⏏', '⏏'), ('⏩', '⏳'), ('⏸', '⏺'), ('Ⓜ', 'Ⓜ'), ('▪', '▫'), ('▶', '▶'), ('◀', '◀'), ('◻', '◾'), ('☀', '★'), ('☇', '☒'), ('☔', '⚅'), ('⚐', '✅'), ('✈', '✒'), ('✔', '✔'), ('✖', '✖'), ('✝', '✝'), ('✡', '✡'), ('✨', '✨'), ('✳', '✴'), ('❄', '❄'), ('❇', '❇'), ('❌', '❌'), ('❎', '❎'), ('❓', '❕'), ('❗', '❗'), ('❣', '❧'), ('➕', '➗'), ('➡', '➡'), ('➰', '➰'), ('➿', '➿'), ('⤴', '⤵'), ('⬅', '⬇'), ('⬛', '⬜'), ('⭐', '⭐'), ('⭕', '⭕'), ('〰', '〰'), ('〽', '〽'), ('㊗', '㊗'), ('㊙', '㊙'), ('🀀', '\u{1f0ff}'), ('🄍', '🄏'), ('🄯', '🄯'), ('🅬', '🅱'), ('🅾', '🅿'), ('🆎', '🆎'), ('🆑', '🆚'), ('🆭', '\u{1f1e5}'), ('🈁', '\u{1f20f}'), ('🈚', '🈚'), ('🈯', '🈯'), ('🈲', '🈺'), ('\u{1f23c}', '\u{1f23f}'), ('\u{1f249}', '🏺'), ('🐀', '🔽'), ('🕆', '🙏'), ('🚀', '\u{1f6ff}'), ('🝴', '🝿'), ('🟕', '\u{1f7ff}'), ('\u{1f80c}', '\u{1f80f}'), ('\u{1f848}', '\u{1f84f}'), ('\u{1f85a}', '\u{1f85f}'), ('\u{1f888}', '\u{1f88f}'), ('\u{1f8ae}', '\u{1f8ff}'), ('🤌', '🤺'), ('🤼', '🥅'), ('🥇', '\u{1faff}'), ('\u{1fc00}', '\u{1fffd}'), ]; pub const EXTENDER: &'static [(char, char)] = &[ ('·', '·'), ('ː', 'ˑ'), ('ـ', 'ـ'), ('ߺ', 'ߺ'), ('\u{b55}', '\u{b55}'), ('ๆ', 'ๆ'), ('ໆ', 'ໆ'), ('᠊', '᠊'), ('ᡃ', 'ᡃ'), ('ᪧ', 'ᪧ'), ('\u{1c36}', '\u{1c36}'), ('ᱻ', 'ᱻ'), ('々', '々'), ('〱', '〵'), ('ゝ', 'ゞ'), ('ー', 'ヾ'), ('ꀕ', 'ꀕ'), ('ꘌ', 'ꘌ'), ('ꧏ', 'ꧏ'), ('ꧦ', 'ꧦ'), ('ꩰ', 'ꩰ'), ('ꫝ', 'ꫝ'), ('ꫳ', 'ꫴ'), ('ー', 'ー'), ('𐞁', '𐞂'), ('𑍝', '𑍝'), ('𑗆', '𑗈'), ('\u{11a98}', '\u{11a98}'), ('𖭂', '𖭃'), ('𖿠', '𖿡'), ('𖿣', '𖿣'), ('𞄼', '𞄽'), ('\u{1e944}', '\u{1e946}'), ]; pub const GRAPHEME_BASE: &'static [(char, char)] = &[ (' ', '~'), ('\u{a0}', '¬'), ('®', '˿'), ('Ͱ', 'ͷ'), ('ͺ', 'Ϳ'), ('΄', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', '҂'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('־', '־'), ('׀', '׀'), ('׃', '׃'), ('׆', '׆'), ('א', 'ת'), ('ׯ', '״'), ('؆', '؏'), ('؛', '؛'), ('؝', 'ي'), ('٠', 'ٯ'), ('ٱ', 'ە'), ('۞', '۞'), ('ۥ', 'ۦ'), ('۩', '۩'), ('ۮ', '܍'), ('ܐ', 
'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('߀', 'ߪ'), ('ߴ', 'ߺ'), ('߾', 'ࠕ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('࠰', '࠾'), ('ࡀ', 'ࡘ'), ('࡞', '࡞'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢎ'), ('ࢠ', 'ࣉ'), ('ः', 'ह'), ('ऻ', 'ऻ'), ('ऽ', 'ी'), ('ॉ', 'ौ'), ('ॎ', 'ॐ'), ('क़', 'ॡ'), ('।', 'ঀ'), ('ং', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('ঽ', 'ঽ'), ('ি', 'ী'), ('ে', 'ৈ'), ('ো', 'ৌ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), ('০', '৽'), ('ਃ', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਾ', 'ੀ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', '੯'), ('ੲ', 'ੴ'), ('੶', '੶'), ('ઃ', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ી'), ('ૉ', 'ૉ'), ('ો', 'ૌ'), ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('૦', '૱'), ('ૹ', 'ૹ'), ('ଂ', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ୀ', 'ୀ'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('୦', '୷'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ி', 'ி'), ('ு', 'ூ'), ('ெ', 'ை'), ('ொ', 'ௌ'), ('ௐ', 'ௐ'), ('௦', '௺'), ('ఁ', 'ః'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), ('ు', 'ౄ'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', 'ౡ'), ('౦', '౯'), ('౷', 'ಀ'), ('ಂ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಾ'), ('ೀ', 'ು'), ('ೃ', 'ೄ'), ('ೇ', 'ೈ'), ('ೊ', 'ೋ'), ('ೝ', 'ೞ'), ('ೠ', 'ೡ'), ('೦', '೯'), ('ೱ', 'ೳ'), ('ം', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), ('ി', 'ീ'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('ൎ', '൏'), ('ൔ', 'ൖ'), ('൘', 'ൡ'), ('൦', 'ൿ'), ('ං', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('ැ', 'ෑ'), ('ෘ', 'ෞ'), ('෦', '෯'), ('ෲ', '෴'), ('ก', 'ะ'), ('า', 'ำ'), ('฿', 'ๆ'), ('๏', '๛'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', '༗'), ('༚', '༴'), ('༶', '༶'), ('༸', '༸'), ('༺', 'ཇ'), ('ཉ', 'ཬ'), ('ཿ', 'ཿ'), 
('྅', '྅'), ('ྈ', 'ྌ'), ('྾', '࿅'), ('࿇', '࿌'), ('࿎', '࿚'), ('က', 'ာ'), ('ေ', 'ေ'), ('း', 'း'), ('ျ', 'ြ'), ('ဿ', 'ၗ'), ('ၚ', 'ၝ'), ('ၡ', 'ၰ'), ('ၵ', 'ႁ'), ('ႃ', 'ႄ'), ('ႇ', 'ႌ'), ('ႎ', 'ႜ'), ('႞', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('፠', '፼'), ('ᎀ', '᎙'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('᐀', '᚜'), ('ᚠ', 'ᛸ'), ('ᜀ', 'ᜑ'), ('᜕', '᜕'), ('ᜟ', 'ᜱ'), ('᜴', '᜶'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ា', 'ា'), ('ើ', 'ៅ'), ('ះ', 'ៈ'), ('។', 'ៜ'), ('០', '៩'), ('៰', '៹'), ('᠀', '᠊'), ('᠐', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᤣ', 'ᤦ'), ('ᤩ', 'ᤫ'), ('ᤰ', 'ᤱ'), ('ᤳ', 'ᤸ'), ('᥀', '᥀'), ('᥄', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', 'ᨖ'), ('ᨙ', 'ᨚ'), ('᨞', 'ᩕ'), ('ᩗ', 'ᩗ'), ('ᩡ', 'ᩡ'), ('ᩣ', 'ᩤ'), ('ᩭ', 'ᩲ'), ('᪀', '᪉'), ('᪐', '᪙'), ('᪠', '᪭'), ('ᬄ', 'ᬳ'), ('ᬻ', 'ᬻ'), ('ᬽ', 'ᭁ'), ('ᭃ', 'ᭌ'), ('᭐', '᭪'), ('᭴', '᭾'), ('ᮂ', 'ᮡ'), ('ᮦ', 'ᮧ'), ('᮪', '᮪'), ('ᮮ', 'ᯥ'), ('ᯧ', 'ᯧ'), ('ᯪ', 'ᯬ'), ('ᯮ', 'ᯮ'), ('᯲', '᯳'), ('᯼', 'ᰫ'), ('ᰴ', 'ᰵ'), ('᰻', '᱉'), ('ᱍ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', '᳇'), ('᳓', '᳓'), ('᳡', '᳡'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', '᳷'), ('ᳺ', 'ᳺ'), ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), ('ῲ', 'ῴ'), ('ῶ', '῾'), ('\u{2000}', '\u{200a}'), ('‐', '‧'), ('\u{202f}', '\u{205f}'), ('⁰', 'ⁱ'), ('⁴', '₎'), ('ₐ', 'ₜ'), ('₠', '⃀'), ('℀', '↋'), ('←', '␦'), ('⑀', '⑊'), ('①', '⭳'), ('⭶', '⮕'), ('⮗', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('⳹', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('⸀', '⹝'), ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('⿰', '⿻'), ('\u{3000}', '〩'), ('〰', '〿'), ('ぁ', 'ゖ'), ('゛', 'ヿ'), ('ㄅ', 
'ㄯ'), ('ㄱ', 'ㆎ'), ('㆐', '㇣'), ('ㇰ', '㈞'), ('㈠', 'ꒌ'), ('꒐', '꓆'), ('ꓐ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('꙳', '꙳'), ('꙾', 'ꚝ'), ('ꚠ', 'ꛯ'), ('꛲', '꛷'), ('꜀', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠤ'), ('ꠧ', '꠫'), ('꠰', '꠹'), ('ꡀ', '꡷'), ('ꢀ', 'ꣃ'), ('꣎', '꣙'), ('ꣲ', 'ꣾ'), ('꤀', 'ꤥ'), ('꤮', 'ꥆ'), ('ꥒ', '꥓'), ('꥟', 'ꥼ'), ('ꦃ', 'ꦲ'), ('ꦴ', 'ꦵ'), ('ꦺ', 'ꦻ'), ('ꦾ', '꧍'), ('ꧏ', '꧙'), ('꧞', 'ꧤ'), ('ꧦ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꨯ', 'ꨰ'), ('ꨳ', 'ꨴ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꩍ', 'ꩍ'), ('꩐', '꩙'), ('꩜', 'ꩻ'), ('ꩽ', 'ꪯ'), ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫫ'), ('ꫮ', 'ꫵ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', '꭫'), ('ꭰ', 'ꯤ'), ('ꯦ', 'ꯧ'), ('ꯩ', '꯬'), ('꯰', '꯹'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), ('ײַ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', '﯂'), ('ﯓ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('﷏', '﷏'), ('ﷰ', '﷿'), ('︐', '︙'), ('︰', '﹒'), ('﹔', '﹦'), ('﹨', '﹫'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('!', 'ン'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('¢', '₩'), ('│', '○'), ('', '�'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', '𐆎'), ('𐆐', '𐆜'), ('𐆠', '𐆠'), ('𐇐', '𐇼'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐋡', '𐋻'), ('𐌀', '𐌣'), ('𐌭', '𐍊'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎟', '𐏃'), ('𐏈', '𐏕'), ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕯', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡗', '𐢞'), ('𐢧', '𐢯'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐤛'), ('𐤟', '𐤹'), ('𐤿', '𐤿'), ('𐦀', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('𐩀', '𐩈'), ('𐩐', '𐩘'), ('𐩠', '𐪟'), ('𐫀', '𐫤'), ('𐫫', '𐫶'), ('𐬀', '𐬵'), ('𐬹', '𐭕'), ('𐭘', '𐭲'), ('𐭸', '𐮑'), 
('𐮙', '𐮜'), ('𐮩', '𐮯'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐴣'), ('𐴰', '𐴹'), ('𐹠', '𐹾'), ('𐺀', '𐺩'), ('𐺭', '𐺭'), ('𐺰', '𐺱'), ('𐼀', '𐼧'), ('𐼰', '𐽅'), ('𐽑', '𐽙'), ('𐽰', '𐾁'), ('𐾆', '𐾉'), ('𐾰', '𐿋'), ('𐿠', '𐿶'), ('𑀀', '𑀀'), ('𑀂', '𑀷'), ('𑁇', '𑁍'), ('𑁒', '𑁯'), ('𑁱', '𑁲'), ('𑁵', '𑁵'), ('𑂂', '𑂲'), ('𑂷', '𑂸'), ('𑂻', '𑂼'), ('𑂾', '𑃁'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('𑄃', '𑄦'), ('𑄬', '𑄬'), ('𑄶', '𑅇'), ('𑅐', '𑅲'), ('𑅴', '𑅶'), ('𑆂', '𑆵'), ('𑆿', '𑇈'), ('𑇍', '𑇎'), ('𑇐', '𑇟'), ('𑇡', '𑇴'), ('𑈀', '𑈑'), ('𑈓', '𑈮'), ('𑈲', '𑈳'), ('𑈵', '𑈵'), ('𑈸', '𑈽'), ('𑈿', '𑉀'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩'), ('𑊰', '𑋞'), ('𑋠', '𑋢'), ('𑋰', '𑋹'), ('𑌂', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑌿', '𑌿'), ('𑍁', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('𑍝', '𑍣'), ('𑐀', '𑐷'), ('𑑀', '𑑁'), ('𑑅', '𑑅'), ('𑑇', '𑑛'), ('𑑝', '𑑝'), ('𑑟', '𑑡'), ('𑒀', '𑒯'), ('𑒱', '𑒲'), ('𑒹', '𑒹'), ('𑒻', '𑒼'), ('𑒾', '𑒾'), ('𑓁', '𑓁'), ('𑓄', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '𑖮'), ('𑖰', '𑖱'), ('𑖸', '𑖻'), ('𑖾', '𑖾'), ('𑗁', '𑗛'), ('𑘀', '𑘲'), ('𑘻', '𑘼'), ('𑘾', '𑘾'), ('𑙁', '𑙄'), ('𑙐', '𑙙'), ('𑙠', '𑙬'), ('𑚀', '𑚪'), ('𑚬', '𑚬'), ('𑚮', '𑚯'), ('𑚶', '𑚶'), ('𑚸', '𑚹'), ('𑛀', '𑛉'), ('𑜀', '𑜚'), ('𑜠', '𑜡'), ('𑜦', '𑜦'), ('𑜰', '𑝆'), ('𑠀', '𑠮'), ('𑠸', '𑠸'), ('𑠻', '𑠻'), ('𑢠', '𑣲'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤯'), ('𑤱', '𑤵'), ('𑤷', '𑤸'), ('𑤽', '𑤽'), ('𑤿', '𑥂'), ('𑥄', '𑥆'), ('𑥐', '𑥙'), ('𑦠', '𑦧'), ('𑦪', '𑧓'), ('𑧜', '𑧟'), ('𑧡', '𑧤'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨹', '𑨺'), ('𑨿', '𑩆'), ('𑩐', '𑩐'), ('𑩗', '𑩘'), ('𑩜', '𑪉'), ('𑪗', '𑪗'), ('𑪚', '𑪢'), ('𑪰', '𑫸'), ('𑬀', '𑬉'), ('𑰀', '𑰈'), ('𑰊', '𑰯'), ('𑰾', '𑰾'), ('𑱀', '𑱅'), ('𑱐', '𑱬'), ('𑱰', '𑲏'), ('𑲩', '𑲩'), ('𑲱', '𑲱'), ('𑲴', '𑲴'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵐', '𑵙'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('𑶓', '𑶔'), ('𑶖', '𑶖'), ('𑶘', '𑶘'), ('𑶠', '𑶩'), ('𑻠', '𑻲'), ('𑻵', '𑻸'), ('𑼂', '𑼐'), ('𑼒', '𑼵'), ('𑼾', '𑼿'), ('𑽁', '𑽁'), ('𑽃', '𑽙'), ('𑾰', '𑾰'), ('𑿀', '𑿱'), ('𑿿', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', 
'𒕃'), ('𒾐', '𒿲'), ('𓀀', '𓐯'), ('𓑁', '𓑆'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖪾'), ('𖫀', '𖫉'), ('𖫐', '𖫭'), ('𖫵', '𖫵'), ('𖬀', '𖬯'), ('𖬷', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖺚'), ('𖼀', '𖽊'), ('𖽐', '𖾇'), ('𖾓', '𖾟'), ('𖿠', '𖿣'), ('𖿰', '𖿱'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '𛲜'), ('𛲟', '𛲟'), ('𜽐', '𜿃'), ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅘𝅥𝅲'), ('𝅦', '𝅦'), ('𝅪', '𝅭'), ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇪'), ('𝈀', '𝉁'), ('𝉅', '𝉅'), ('𝋀', '𝋓'), ('𝋠', '𝋳'), ('𝌀', '𝍖'), ('𝍠', '𝍸'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝧿'), ('𝨷', '𝨺'), ('𝩭', '𝩴'), ('𝩶', '𝪃'), ('𝪅', '𝪋'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('𞄀', '𞄬'), ('𞄷', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏'), ('𞊐', '𞊭'), ('𞋀', '𞋫'), ('𞋰', '𞋹'), ('𞋿', '𞋿'), ('𞓐', '𞓫'), ('𞓰', '𞓹'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('𞣇', '𞣏'), ('𞤀', '𞥃'), ('𞥋', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟'), ('𞱱', '𞲴'), ('𞴁', '𞴽'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), ('🃑', '🃵'), ('🄀', '🆭'), ('🇦', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), ('🉠', '🉥'), ('🌀', '🛗'), ('🛜', '🛬'), ('🛰', '🛼'), ('🜀', '🝶'), ('🝻', '🟙'), ('🟠', '🟫'), ('🟰', '🟰'), ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🢰', '🢱'), ('🤀', '🩓'), 
('🩠', '🩭'), ('🩰', '🩼'), ('🪀', '🪈'), ('🪐', '🪽'), ('🪿', '🫅'), ('🫎', '🫛'), ('🫠', '🫨'), ('🫰', '🫸'), ('🬀', '🮒'), ('🮔', '🯊'), ('🯰', '🯹'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const GRAPHEME_EXTEND: &'static [(char, char)] = &[ ('\u{300}', '\u{36f}'), ('\u{483}', '\u{489}'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('\u{610}', '\u{61a}'), ('\u{64b}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dc}'), ('\u{6df}', '\u{6e4}'), ('\u{6e7}', '\u{6e8}'), ('\u{6ea}', '\u{6ed}'), ('\u{711}', '\u{711}'), ('\u{730}', '\u{74a}'), ('\u{7a6}', '\u{7b0}'), ('\u{7eb}', '\u{7f3}'), ('\u{7fd}', '\u{7fd}'), ('\u{816}', '\u{819}'), ('\u{81b}', '\u{823}'), ('\u{825}', '\u{827}'), ('\u{829}', '\u{82d}'), ('\u{859}', '\u{85b}'), ('\u{898}', '\u{89f}'), ('\u{8ca}', '\u{8e1}'), ('\u{8e3}', '\u{902}'), ('\u{93a}', '\u{93a}'), ('\u{93c}', '\u{93c}'), ('\u{941}', '\u{948}'), ('\u{94d}', '\u{94d}'), ('\u{951}', '\u{957}'), ('\u{962}', '\u{963}'), ('\u{981}', '\u{981}'), ('\u{9bc}', '\u{9bc}'), ('\u{9be}', '\u{9be}'), ('\u{9c1}', '\u{9c4}'), ('\u{9cd}', '\u{9cd}'), ('\u{9d7}', '\u{9d7}'), ('\u{9e2}', '\u{9e3}'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', '\u{a02}'), ('\u{a3c}', '\u{a3c}'), ('\u{a41}', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('\u{a70}', '\u{a71}'), ('\u{a75}', '\u{a75}'), ('\u{a81}', '\u{a82}'), ('\u{abc}', '\u{abc}'), ('\u{ac1}', '\u{ac5}'), ('\u{ac7}', '\u{ac8}'), ('\u{acd}', '\u{acd}'), ('\u{ae2}', '\u{ae3}'), ('\u{afa}', '\u{aff}'), ('\u{b01}', '\u{b01}'), ('\u{b3c}', '\u{b3c}'), ('\u{b3e}', '\u{b3f}'), ('\u{b41}', '\u{b44}'), ('\u{b4d}', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('\u{b62}', '\u{b63}'), ('\u{b82}', '\u{b82}'), ('\u{bbe}', '\u{bbe}'), ('\u{bc0}', '\u{bc0}'), ('\u{bcd}', '\u{bcd}'), ('\u{bd7}', '\u{bd7}'), ('\u{c00}', '\u{c00}'), ('\u{c04}', '\u{c04}'), ('\u{c3c}', '\u{c3c}'), 
('\u{c3e}', '\u{c40}'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('\u{c62}', '\u{c63}'), ('\u{c81}', '\u{c81}'), ('\u{cbc}', '\u{cbc}'), ('\u{cbf}', '\u{cbf}'), ('\u{cc2}', '\u{cc2}'), ('\u{cc6}', '\u{cc6}'), ('\u{ccc}', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('\u{ce2}', '\u{ce3}'), ('\u{d00}', '\u{d01}'), ('\u{d3b}', '\u{d3c}'), ('\u{d3e}', '\u{d3e}'), ('\u{d41}', '\u{d44}'), ('\u{d4d}', '\u{d4d}'), ('\u{d57}', '\u{d57}'), ('\u{d62}', '\u{d63}'), ('\u{d81}', '\u{d81}'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dcf}'), ('\u{dd2}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('\u{ddf}', '\u{ddf}'), ('\u{e31}', '\u{e31}'), ('\u{e34}', '\u{e3a}'), ('\u{e47}', '\u{e4e}'), ('\u{eb1}', '\u{eb1}'), ('\u{eb4}', '\u{ebc}'), ('\u{ec8}', '\u{ece}'), ('\u{f18}', '\u{f19}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('\u{f71}', '\u{f7e}'), ('\u{f80}', '\u{f84}'), ('\u{f86}', '\u{f87}'), ('\u{f8d}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('\u{102d}', '\u{1030}'), ('\u{1032}', '\u{1037}'), ('\u{1039}', '\u{103a}'), ('\u{103d}', '\u{103e}'), ('\u{1058}', '\u{1059}'), ('\u{105e}', '\u{1060}'), ('\u{1071}', '\u{1074}'), ('\u{1082}', '\u{1082}'), ('\u{1085}', '\u{1086}'), ('\u{108d}', '\u{108d}'), ('\u{109d}', '\u{109d}'), ('\u{135d}', '\u{135f}'), ('\u{1712}', '\u{1714}'), ('\u{1732}', '\u{1733}'), ('\u{1752}', '\u{1753}'), ('\u{1772}', '\u{1773}'), ('\u{17b4}', '\u{17b5}'), ('\u{17b7}', '\u{17bd}'), ('\u{17c6}', '\u{17c6}'), ('\u{17c9}', '\u{17d3}'), ('\u{17dd}', '\u{17dd}'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '\u{180f}'), ('\u{1885}', '\u{1886}'), ('\u{18a9}', '\u{18a9}'), ('\u{1920}', '\u{1922}'), ('\u{1927}', '\u{1928}'), ('\u{1932}', '\u{1932}'), ('\u{1939}', '\u{193b}'), ('\u{1a17}', '\u{1a18}'), ('\u{1a1b}', '\u{1a1b}'), ('\u{1a56}', '\u{1a56}'), ('\u{1a58}', '\u{1a5e}'), ('\u{1a60}', '\u{1a60}'), ('\u{1a62}', '\u{1a62}'), ('\u{1a65}', '\u{1a6c}'), ('\u{1a73}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a7f}'), 
('\u{1ab0}', '\u{1ace}'), ('\u{1b00}', '\u{1b03}'), ('\u{1b34}', '\u{1b3a}'), ('\u{1b3c}', '\u{1b3c}'), ('\u{1b42}', '\u{1b42}'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '\u{1b81}'), ('\u{1ba2}', '\u{1ba5}'), ('\u{1ba8}', '\u{1ba9}'), ('\u{1bab}', '\u{1bad}'), ('\u{1be6}', '\u{1be6}'), ('\u{1be8}', '\u{1be9}'), ('\u{1bed}', '\u{1bed}'), ('\u{1bef}', '\u{1bf1}'), ('\u{1c2c}', '\u{1c33}'), ('\u{1c36}', '\u{1c37}'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1ce0}'), ('\u{1ce2}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('\u{1cf8}', '\u{1cf9}'), ('\u{1dc0}', '\u{1dff}'), ('\u{200c}', '\u{200c}'), ('\u{20d0}', '\u{20f0}'), ('\u{2cef}', '\u{2cf1}'), ('\u{2d7f}', '\u{2d7f}'), ('\u{2de0}', '\u{2dff}'), ('\u{302a}', '\u{302f}'), ('\u{3099}', '\u{309a}'), ('\u{a66f}', '\u{a672}'), ('\u{a674}', '\u{a67d}'), ('\u{a69e}', '\u{a69f}'), ('\u{a6f0}', '\u{a6f1}'), ('\u{a802}', '\u{a802}'), ('\u{a806}', '\u{a806}'), ('\u{a80b}', '\u{a80b}'), ('\u{a825}', '\u{a826}'), ('\u{a82c}', '\u{a82c}'), ('\u{a8c4}', '\u{a8c5}'), ('\u{a8e0}', '\u{a8f1}'), ('\u{a8ff}', '\u{a8ff}'), ('\u{a926}', '\u{a92d}'), ('\u{a947}', '\u{a951}'), ('\u{a980}', '\u{a982}'), ('\u{a9b3}', '\u{a9b3}'), ('\u{a9b6}', '\u{a9b9}'), ('\u{a9bc}', '\u{a9bd}'), ('\u{a9e5}', '\u{a9e5}'), ('\u{aa29}', '\u{aa2e}'), ('\u{aa31}', '\u{aa32}'), ('\u{aa35}', '\u{aa36}'), ('\u{aa43}', '\u{aa43}'), ('\u{aa4c}', '\u{aa4c}'), ('\u{aa7c}', '\u{aa7c}'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabf}'), ('\u{aac1}', '\u{aac1}'), ('\u{aaec}', '\u{aaed}'), ('\u{aaf6}', '\u{aaf6}'), ('\u{abe5}', '\u{abe5}'), ('\u{abe8}', '\u{abe8}'), ('\u{abed}', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('\u{ff9e}', '\u{ff9f}'), ('\u{101fd}', '\u{101fd}'), ('\u{102e0}', '\u{102e0}'), ('\u{10376}', '\u{1037a}'), ('\u{10a01}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '\u{10a0f}'), ('\u{10a38}', '\u{10a3a}'), 
('\u{10a3f}', '\u{10a3f}'), ('\u{10ae5}', '\u{10ae6}'), ('\u{10d24}', '\u{10d27}'), ('\u{10eab}', '\u{10eac}'), ('\u{10efd}', '\u{10eff}'), ('\u{10f46}', '\u{10f50}'), ('\u{10f82}', '\u{10f85}'), ('\u{11001}', '\u{11001}'), ('\u{11038}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{11073}', '\u{11074}'), ('\u{1107f}', '\u{11081}'), ('\u{110b3}', '\u{110b6}'), ('\u{110b9}', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('\u{11100}', '\u{11102}'), ('\u{11127}', '\u{1112b}'), ('\u{1112d}', '\u{11134}'), ('\u{11173}', '\u{11173}'), ('\u{11180}', '\u{11181}'), ('\u{111b6}', '\u{111be}'), ('\u{111c9}', '\u{111cc}'), ('\u{111cf}', '\u{111cf}'), ('\u{1122f}', '\u{11231}'), ('\u{11234}', '\u{11234}'), ('\u{11236}', '\u{11237}'), ('\u{1123e}', '\u{1123e}'), ('\u{11241}', '\u{11241}'), ('\u{112df}', '\u{112df}'), ('\u{112e3}', '\u{112ea}'), ('\u{11300}', '\u{11301}'), ('\u{1133b}', '\u{1133c}'), ('\u{1133e}', '\u{1133e}'), ('\u{11340}', '\u{11340}'), ('\u{11357}', '\u{11357}'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('\u{11438}', '\u{1143f}'), ('\u{11442}', '\u{11444}'), ('\u{11446}', '\u{11446}'), ('\u{1145e}', '\u{1145e}'), ('\u{114b0}', '\u{114b0}'), ('\u{114b3}', '\u{114b8}'), ('\u{114ba}', '\u{114ba}'), ('\u{114bd}', '\u{114bd}'), ('\u{114bf}', '\u{114c0}'), ('\u{114c2}', '\u{114c3}'), ('\u{115af}', '\u{115af}'), ('\u{115b2}', '\u{115b5}'), ('\u{115bc}', '\u{115bd}'), ('\u{115bf}', '\u{115c0}'), ('\u{115dc}', '\u{115dd}'), ('\u{11633}', '\u{1163a}'), ('\u{1163d}', '\u{1163d}'), ('\u{1163f}', '\u{11640}'), ('\u{116ab}', '\u{116ab}'), ('\u{116ad}', '\u{116ad}'), ('\u{116b0}', '\u{116b5}'), ('\u{116b7}', '\u{116b7}'), ('\u{1171d}', '\u{1171f}'), ('\u{11722}', '\u{11725}'), ('\u{11727}', '\u{1172b}'), ('\u{1182f}', '\u{11837}'), ('\u{11839}', '\u{1183a}'), ('\u{11930}', '\u{11930}'), ('\u{1193b}', '\u{1193c}'), ('\u{1193e}', '\u{1193e}'), ('\u{11943}', '\u{11943}'), ('\u{119d4}', '\u{119d7}'), ('\u{119da}', '\u{119db}'), ('\u{119e0}', '\u{119e0}'), 
('\u{11a01}', '\u{11a0a}'), ('\u{11a33}', '\u{11a38}'), ('\u{11a3b}', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a51}', '\u{11a56}'), ('\u{11a59}', '\u{11a5b}'), ('\u{11a8a}', '\u{11a96}'), ('\u{11a98}', '\u{11a99}'), ('\u{11c30}', '\u{11c36}'), ('\u{11c38}', '\u{11c3d}'), ('\u{11c3f}', '\u{11c3f}'), ('\u{11c92}', '\u{11ca7}'), ('\u{11caa}', '\u{11cb0}'), ('\u{11cb2}', '\u{11cb3}'), ('\u{11cb5}', '\u{11cb6}'), ('\u{11d31}', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d45}'), ('\u{11d47}', '\u{11d47}'), ('\u{11d90}', '\u{11d91}'), ('\u{11d95}', '\u{11d95}'), ('\u{11d97}', '\u{11d97}'), ('\u{11ef3}', '\u{11ef4}'), ('\u{11f00}', '\u{11f01}'), ('\u{11f36}', '\u{11f3a}'), ('\u{11f40}', '\u{11f40}'), ('\u{11f42}', '\u{11f42}'), ('\u{13440}', '\u{13440}'), ('\u{13447}', '\u{13455}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b30}', '\u{16b36}'), ('\u{16f4f}', '\u{16f4f}'), ('\u{16f8f}', '\u{16f92}'), ('\u{16fe4}', '\u{16fe4}'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d165}'), ('\u{1d167}', '\u{1d169}'), ('\u{1d16e}', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('\u{1e08f}', '\u{1e08f}'), ('\u{1e130}', '\u{1e136}'), ('\u{1e2ae}', '\u{1e2ae}'), ('\u{1e2ec}', '\u{1e2ef}'), ('\u{1e4ec}', '\u{1e4ef}'), ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1e944}', '\u{1e94a}'), ('\u{e0020}', '\u{e007f}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const GRAPHEME_LINK: &'static [(char, char)] = &[ ('\u{94d}', '\u{94d}'), ('\u{9cd}', '\u{9cd}'), ('\u{a4d}', '\u{a4d}'), ('\u{acd}', '\u{acd}'), ('\u{b4d}', '\u{b4d}'), 
('\u{bcd}', '\u{bcd}'), ('\u{c4d}', '\u{c4d}'), ('\u{ccd}', '\u{ccd}'), ('\u{d3b}', '\u{d3c}'), ('\u{d4d}', '\u{d4d}'), ('\u{dca}', '\u{dca}'), ('\u{e3a}', '\u{e3a}'), ('\u{eba}', '\u{eba}'), ('\u{f84}', '\u{f84}'), ('\u{1039}', '\u{103a}'), ('\u{1714}', '᜕'), ('᜴', '᜴'), ('\u{17d2}', '\u{17d2}'), ('\u{1a60}', '\u{1a60}'), ('᭄', '᭄'), ('᮪', '\u{1bab}'), ('᯲', '᯳'), ('\u{2d7f}', '\u{2d7f}'), ('\u{a806}', '\u{a806}'), ('\u{a82c}', '\u{a82c}'), ('\u{a8c4}', '\u{a8c4}'), ('꥓', '꥓'), ('꧀', '꧀'), ('\u{aaf6}', '\u{aaf6}'), ('\u{abed}', '\u{abed}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{11046}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{1107f}', '\u{1107f}'), ('\u{110b9}', '\u{110b9}'), ('\u{11133}', '\u{11134}'), ('𑇀', '𑇀'), ('𑈵', '𑈵'), ('\u{112ea}', '\u{112ea}'), ('𑍍', '𑍍'), ('\u{11442}', '\u{11442}'), ('\u{114c2}', '\u{114c2}'), ('\u{115bf}', '\u{115bf}'), ('\u{1163f}', '\u{1163f}'), ('𑚶', '𑚶'), ('\u{1172b}', '\u{1172b}'), ('\u{11839}', '\u{11839}'), ('𑤽', '\u{1193e}'), ('\u{119e0}', '\u{119e0}'), ('\u{11a34}', '\u{11a34}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a99}', '\u{11a99}'), ('\u{11c3f}', '\u{11c3f}'), ('\u{11d44}', '\u{11d45}'), ('\u{11d97}', '\u{11d97}'), ('𑽁', '\u{11f42}'), ]; pub const HEX_DIGIT: &'static [(char, char)] = &[ ('0', '9'), ('A', 'F'), ('a', 'f'), ('0', '9'), ('A', 'F'), ('a', 'f'), ]; pub const HYPHEN: &'static [(char, char)] = &[ ('-', '-'), ('\u{ad}', '\u{ad}'), ('֊', '֊'), ('᠆', '᠆'), ('‐', '‑'), ('⸗', '⸗'), ('・', '・'), ('﹣', '﹣'), ('-', '-'), ('・', '・'), ]; pub const IDS_BINARY_OPERATOR: &'static [(char, char)] = &[('⿰', '⿱'), ('⿴', '⿻')]; pub const IDS_TRINARY_OPERATOR: &'static [(char, char)] = &[('⿲', '⿳')]; pub const ID_CONTINUE: &'static [(char, char)] = &[ ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('·', '·'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('\u{300}', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), 
('Ϸ', 'ҁ'), ('\u{483}', '\u{487}'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('א', 'ת'), ('ׯ', 'ײ'), ('\u{610}', '\u{61a}'), ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', '\u{6dc}'), ('\u{6df}', '\u{6e8}'), ('\u{6ea}', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '\u{74a}'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), ('ߺ', 'ߺ'), ('\u{7fd}', '\u{7fd}'), ('ࠀ', '\u{82d}'), ('ࡀ', '\u{85b}'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('\u{898}', '\u{8e1}'), ('\u{8e3}', '\u{963}'), ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('\u{9bc}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('০', 'ৱ'), ('ৼ', 'ৼ'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', '\u{a75}'), ('\u{a81}', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('ૐ', 'ૐ'), ('ૠ', '\u{ae3}'), ('૦', '૯'), ('ૹ', '\u{aff}'), ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('\u{b3c}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', '\u{b63}'), ('୦', '୯'), ('ୱ', 'ୱ'), ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('ௐ', 'ௐ'), ('\u{bd7}', '\u{bd7}'), ('௦', '௯'), ('\u{c00}', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('\u{c3c}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', '\u{c63}'), ('౦', '౯'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('\u{cbc}', 
'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('ೝ', 'ೞ'), ('ೠ', '\u{ce3}'), ('೦', '೯'), ('ೱ', 'ೳ'), ('\u{d00}', 'ഌ'), ('എ', 'ഐ'), ('ഒ', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', 'ൎ'), ('ൔ', '\u{d57}'), ('ൟ', '\u{d63}'), ('൦', '൯'), ('ൺ', 'ൿ'), ('\u{d81}', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('෦', '෯'), ('ෲ', 'ෳ'), ('ก', '\u{e3a}'), ('เ', '\u{e4e}'), ('๐', '๙'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ece}'), ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('\u{f18}', '\u{f19}'), ('༠', '༩'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('༾', 'ཇ'), ('ཉ', 'ཬ'), ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('က', '၉'), ('ၐ', '\u{109d}'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('\u{135d}', '\u{135f}'), ('፩', '፱'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', '᜕'), ('ᜟ', '᜴'), ('ᝀ', '\u{1753}'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}'), ('ក', '\u{17d3}'), ('ៗ', 'ៗ'), ('ៜ', '\u{17dd}'), ('០', '៩'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('᥆', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('ᨀ', '\u{1a1b}'), ('ᨠ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'), ('\u{1ab0}', '\u{1abd}'), ('\u{1abf}', '\u{1ace}'), ('\u{1b00}', 'ᭌ'), ('᭐', '᭙'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '᯳'), ('ᰀ', '\u{1c37}'), ('᱀', '᱉'), ('ᱍ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', 'ᳺ'), ('ᴀ', 'ἕ'), 
('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('‿', '⁀'), ('⁔', '⁔'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('\u{20d0}', '\u{20dc}'), ('\u{20e1}', '\u{20e1}'), ('\u{20e5}', '\u{20f0}'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('\u{2d7f}', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('\u{2de0}', '\u{2dff}'), ('々', '〇'), ('〡', '\u{302f}'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('\u{3099}', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), ('Ꙁ', '\u{a66f}'), ('\u{a674}', '\u{a67d}'), ('ꙿ', '\u{a6f1}'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠧ'), ('\u{a82c}', '\u{a82c}'), ('ꡀ', 'ꡳ'), ('ꢀ', '\u{a8c5}'), ('꣐', '꣙'), ('\u{a8e0}', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', '\u{a92d}'), ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('\u{a980}', '꧀'), ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'), ('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), ('ꫲ', '\u{aaf6}'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꯪ'), ('꯬', '\u{abed}'), ('꯰', '꯹'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('︳', '︴'), ('﹍', '﹏'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), 
('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('\u{101fd}', '\u{101fd}'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('\u{102e0}', '\u{102e0}'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '\u{1037a}'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '\u{10ae6}'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '\u{10d27}'), ('𐴰', '𐴹'), ('𐺀', '𐺩'), ('\u{10eab}', '\u{10eac}'), ('𐺰', '𐺱'), ('\u{10efd}', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '\u{10f50}'), ('𐽰', '\u{10f85}'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀀', '\u{11046}'), ('𑁦', '𑁵'), ('\u{1107f}', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑄿'), ('𑅄', '𑅇'), ('𑅐', '\u{11173}'), ('𑅶', '𑅶'), ('\u{11180}', '𑇄'), ('\u{111c9}', '\u{111cc}'), ('𑇎', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '\u{11237}'), ('\u{1123e}', '\u{11241}'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹'), ('\u{11300}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('\u{1133b}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('\u{11357}', '\u{11357}'), ('𑍝', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑐀', '𑑊'), ('𑑐', '𑑙'), ('\u{1145e}', '𑑡'), ('𑒀', '𑓅'), ('𑓇', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '\u{115b5}'), ('𑖸', '\u{115c0}'), ('𑗘', '\u{115dd}'), ('𑘀', '\u{11640}'), ('𑙄', '𑙄'), ('𑙐', '𑙙'), ('𑚀', '𑚸'), ('𑛀', '𑛉'), ('𑜀', '𑜚'), 
('\u{1171d}', '\u{1172b}'), ('𑜰', '𑜹'), ('𑝀', '𑝆'), ('𑠀', '\u{1183a}'), ('𑢠', '𑣩'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{11943}'), ('𑥐', '𑥙'), ('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧡'), ('𑧣', '𑧤'), ('𑨀', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('𑩐', '\u{11a99}'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱀'), ('𑱐', '𑱙'), ('𑱲', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d47}'), ('𑵐', '𑵙'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶘'), ('𑶠', '𑶩'), ('𑻠', '𑻶'), ('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '\u{11f42}'), ('𑽐', '𑽙'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('\u{13440}', '\u{13455}'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩰', '𖪾'), ('𖫀', '𖫉'), ('𖫐', '𖫭'), ('\u{16af0}', '\u{16af4}'), ('𖬀', '\u{16b36}'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '\u{16fe4}'), ('𖿰', '𖿱'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d169}'), ('𝅭', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), ('\u{1da00}', 
'\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅎'), ('𞊐', '\u{1e2ae}'), ('𞋀', '𞋹'), ('𞓐', '𞓹'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('\u{1e8d0}', '\u{1e8d6}'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('🯰', '🯹'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ('\u{e0100}', '\u{e01ef}'), ]; pub const ID_START: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('א', 'ת'), ('ׯ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('ࢠ', 'ࣉ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), 
('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೝ', 'ೞ'), ('ೠ', 'ೡ'), ('ೱ', 'ೲ'), ('ഄ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜑ'), ('ᜟ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᪧ', 'ᪧ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭌ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᳺ', 'ᳺ'), ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), 
('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('々', '〇'), ('〡', '〩'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('゛', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), ('ꚠ', 'ꛯ'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣾ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕰', '𐕺'), 
('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '𐴣'), ('𐺀', '𐺩'), ('𐺰', '𐺱'), ('𐼀', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '𐽅'), ('𐽰', '𐾁'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀃', '𑀷'), ('𑁱', '𑁲'), ('𑁵', '𑁵'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅄', '𑅄'), ('𑅇', '𑅇'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑈿', '𑉀'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑑟', '𑑡'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑚸', '𑚸'), ('𑜀', '𑜚'), ('𑝀', '𑝆'), ('𑠀', '𑠫'), ('𑢠', '𑣟'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤯'), ('𑤿', '𑤿'), ('𑥁', '𑥁'), ('𑦠', '𑦧'), ('𑦪', '𑧐'), ('𑧡', '𑧡'), ('𑧣', '𑧣'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪉'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶉'), ('𑶘', '𑶘'), ('𑻠', '𑻲'), ('𑼂', '𑼂'), ('𑼄', '𑼐'), ('𑼒', '𑼳'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('𓑁', '𓑆'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩰', '𖪾'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '𖿣'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', 
'𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('𞄀', '𞄬'), ('𞄷', '𞄽'), ('𞅎', '𞅎'), ('𞊐', '𞊭'), ('𞋀', '𞋫'), ('𞓐', '𞓫'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), ('𞥋', '𞥋'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const IDEOGRAPHIC: &'static [(char, char)] = &[ ('〆', '〇'), ('〡', '〩'), ('〸', '〺'), ('㐀', '䶿'), ('一', '鿿'), ('豈', '舘'), ('並', '龎'), ('\u{16fe4}', '\u{16fe4}'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𛅰', '𛋻'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const JOIN_CONTROL: &'static [(char, char)] = &[('\u{200c}', '\u{200d}')]; pub const LOGICAL_ORDER_EXCEPTION: &'static [(char, char)] = &[ ('เ', 'ไ'), ('ເ', 'ໄ'), ('ᦵ', 'ᦷ'), ('ᦺ', 'ᦺ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪹ'), ('ꪻ', 'ꪼ'), ]; pub const LOWERCASE: &'static [(char, char)] = &[ ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 
'ħ'), ('ĩ', 'ĩ'), ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), ('ĵ', 'ĵ'), ('ķ', 'ĸ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƍ'), ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), ('ƙ', 'ƛ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƪ', 'ƫ'), ('ƭ', 'ƭ'), ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƺ'), ('ƽ', 'ƿ'), ('dž', 'dž'), ('lj', 'lj'), ('nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȡ', 'ȡ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), ('ȳ', 'ȹ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ʓ'), ('ʕ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), ('\u{345}', '\u{345}'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϼ'), ('а', 'џ'), ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), 
('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), ('ӭ', 'ӭ'), ('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ՠ', 'ֈ'), ('ა', 'ჺ'), ('ჼ', 'ჿ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᶿ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẝ'), ('ẟ', 'ẟ'), ('ạ', 
'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾇ'), ('ᾐ', 'ᾗ'), ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), ('ᾶ', 'ᾷ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℊ', 'ℊ'), ('ℎ', 'ℏ'), ('ℓ', 'ℓ'), ('ℯ', 'ℯ'), ('ℴ', 'ℴ'), ('ℹ', 'ℹ'), ('ℼ', 'ℽ'), ('ⅆ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('ⅰ', 'ⅿ'), ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), ('ⰰ', 'ⱟ'), ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), ('ⱱ', 'ⱱ'), ('ⱳ', 'ⱴ'), ('ⱶ', 'ⱽ'), ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳤ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), 
('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚝ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜱ'), ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), ('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), ('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝸ'), ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞎ', 'ꞎ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞕ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞯ', 'ꞯ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), ('ꞹ', 'ꞹ'), ('ꞻ', 'ꞻ'), ('ꞽ', 'ꞽ'), ('ꞿ', 'ꞿ'), ('ꟁ', 'ꟁ'), ('ꟃ', 'ꟃ'), ('ꟈ', 'ꟈ'), ('ꟊ', 'ꟊ'), ('ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟕ'), ('ꟗ', 'ꟗ'), ('ꟙ', 'ꟙ'), ('ꟲ', 'ꟴ'), ('ꟶ', 'ꟶ'), ('ꟸ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐞀', '𐞀'), ('𐞃', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𖹠', '𖹿'), ('𝐚', '𝐳'), ('𝑎', '𝑔'), ('𝑖', '𝑧'), ('𝒂', '𝒛'), ('𝒶', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝓏'), ('𝓪', '𝔃'), ('𝔞', '𝔷'), ('𝕒', '𝕫'), ('𝖆', '𝖟'), ('𝖺', '𝗓'), ('𝗮', '𝘇'), ('𝘢', '𝘻'), ('𝙖', '𝙯'), ('𝚊', '𝚥'), ('𝛂', '𝛚'), ('𝛜', '𝛡'), ('𝛼', '𝜔'), ('𝜖', '𝜛'), ('𝜶', '𝝎'), ('𝝐', '𝝕'), ('𝝰', '𝞈'), ('𝞊', '𝞏'), ('𝞪', '𝟂'), ('𝟄', '𝟉'), ('𝟋', '𝟋'), ('𝼀', '𝼉'), ('𝼋', '𝼞'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('𞤢', '𞥃'), ]; pub const MATH: &'static [(char, char)] = &[ ('+', '+'), ('<', '>'), ('^', '^'), ('|', '|'), ('~', '~'), ('¬', '¬'), ('±', '±'), ('×', '×'), ('÷', '÷'), ('ϐ', 'ϒ'), ('ϕ', 'ϕ'), ('ϰ', 'ϱ'), ('ϴ', '϶'), ('؆', '؈'), ('‖', 
'‖'), ('′', '‴'), ('⁀', '⁀'), ('⁄', '⁄'), ('⁒', '⁒'), ('\u{2061}', '\u{2064}'), ('⁺', '⁾'), ('₊', '₎'), ('\u{20d0}', '\u{20dc}'), ('\u{20e1}', '\u{20e1}'), ('\u{20e5}', '\u{20e6}'), ('\u{20eb}', '\u{20ef}'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), ('ℤ', 'ℤ'), ('ℨ', '℩'), ('ℬ', 'ℭ'), ('ℯ', 'ℱ'), ('ℳ', 'ℸ'), ('ℼ', 'ⅉ'), ('⅋', '⅋'), ('←', '↧'), ('↩', '↮'), ('↰', '↱'), ('↶', '↷'), ('↼', '⇛'), ('⇝', '⇝'), ('⇤', '⇥'), ('⇴', '⋿'), ('⌈', '⌋'), ('⌠', '⌡'), ('⍼', '⍼'), ('⎛', '⎵'), ('⎷', '⎷'), ('⏐', '⏐'), ('⏜', '⏢'), ('■', '□'), ('▮', '▷'), ('▼', '◁'), ('◆', '◇'), ('◊', '○'), ('●', '◓'), ('◢', '◢'), ('◤', '◤'), ('◧', '◬'), ('◸', '◿'), ('★', '☆'), ('♀', '♀'), ('♂', '♂'), ('♠', '♣'), ('♭', '♯'), ('⟀', '⟿'), ('⤀', '⫿'), ('⬰', '⭄'), ('⭇', '⭌'), ('﬩', '﬩'), ('﹡', '﹦'), ('﹨', '﹨'), ('+', '+'), ('<', '>'), ('\', '\'), ('^', '^'), ('|', '|'), ('~', '~'), ('¬', '¬'), ('←', '↓'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝟿'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), ]; pub const NONCHARACTER_CODE_POINT: &'static [(char, char)] = &[ ('\u{fdd0}', '\u{fdef}'), ('\u{fffe}', '\u{ffff}'), ('\u{1fffe}', '\u{1ffff}'), ('\u{2fffe}', '\u{2ffff}'), ('\u{3fffe}', '\u{3ffff}'), ('\u{4fffe}', '\u{4ffff}'), ('\u{5fffe}', '\u{5ffff}'), ('\u{6fffe}', '\u{6ffff}'), ('\u{7fffe}', '\u{7ffff}'), ('\u{8fffe}', '\u{8ffff}'), ('\u{9fffe}', '\u{9ffff}'), ('\u{afffe}', '\u{affff}'), ('\u{bfffe}', '\u{bffff}'), ('\u{cfffe}', 
'\u{cffff}'), ('\u{dfffe}', '\u{dffff}'), ('\u{efffe}', '\u{effff}'), ('\u{ffffe}', '\u{fffff}'), ('\u{10fffe}', '\u{10ffff}'), ]; pub const OTHER_ALPHABETIC: &'static [(char, char)] = &[ ('\u{345}', '\u{345}'), ('\u{5b0}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('\u{610}', '\u{61a}'), ('\u{64b}', '\u{657}'), ('\u{659}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dc}'), ('\u{6e1}', '\u{6e4}'), ('\u{6e7}', '\u{6e8}'), ('\u{6ed}', '\u{6ed}'), ('\u{711}', '\u{711}'), ('\u{730}', '\u{73f}'), ('\u{7a6}', '\u{7b0}'), ('\u{816}', '\u{817}'), ('\u{81b}', '\u{823}'), ('\u{825}', '\u{827}'), ('\u{829}', '\u{82c}'), ('\u{8d4}', '\u{8df}'), ('\u{8e3}', '\u{8e9}'), ('\u{8f0}', 'ः'), ('\u{93a}', 'ऻ'), ('ा', 'ौ'), ('ॎ', 'ॏ'), ('\u{955}', '\u{957}'), ('\u{962}', '\u{963}'), ('\u{981}', 'ঃ'), ('\u{9be}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৌ'), ('\u{9d7}', '\u{9d7}'), ('\u{9e2}', '\u{9e3}'), ('\u{a01}', 'ਃ'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4c}'), ('\u{a51}', '\u{a51}'), ('\u{a70}', '\u{a71}'), ('\u{a75}', '\u{a75}'), ('\u{a81}', 'ઃ'), ('ા', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', 'ૌ'), ('\u{ae2}', '\u{ae3}'), ('\u{afa}', '\u{afc}'), ('\u{b01}', 'ଃ'), ('\u{b3e}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', 'ୌ'), ('\u{b56}', '\u{b57}'), ('\u{b62}', '\u{b63}'), ('\u{b82}', '\u{b82}'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', 'ௌ'), ('\u{bd7}', '\u{bd7}'), ('\u{c00}', '\u{c04}'), ('\u{c3e}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4c}'), ('\u{c55}', '\u{c56}'), ('\u{c62}', '\u{c63}'), ('\u{c81}', 'ಃ'), ('ಾ', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccc}'), ('\u{cd5}', '\u{cd6}'), ('\u{ce2}', '\u{ce3}'), ('ೳ', 'ೳ'), ('\u{d00}', 'ഃ'), ('\u{d3e}', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', 'ൌ'), ('\u{d57}', '\u{d57}'), ('\u{d62}', '\u{d63}'), ('\u{d81}', 'ඃ'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('ෲ', 'ෳ'), ('\u{e31}', '\u{e31}'), ('\u{e34}', '\u{e3a}'), ('\u{e4d}', '\u{e4d}'), ('\u{eb1}', 
'\u{eb1}'), ('\u{eb4}', '\u{eb9}'), ('\u{ebb}', '\u{ebc}'), ('\u{ecd}', '\u{ecd}'), ('\u{f71}', '\u{f83}'), ('\u{f8d}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('ါ', '\u{1036}'), ('း', 'း'), ('ျ', '\u{103e}'), ('ၖ', '\u{1059}'), ('\u{105e}', '\u{1060}'), ('ၢ', 'ၤ'), ('ၧ', 'ၭ'), ('\u{1071}', '\u{1074}'), ('\u{1082}', '\u{108d}'), ('ႏ', 'ႏ'), ('ႚ', '\u{109d}'), ('\u{1712}', '\u{1713}'), ('\u{1732}', '\u{1733}'), ('\u{1752}', '\u{1753}'), ('\u{1772}', '\u{1773}'), ('ា', 'ៈ'), ('\u{1885}', '\u{1886}'), ('\u{18a9}', '\u{18a9}'), ('\u{1920}', 'ᤫ'), ('ᤰ', 'ᤸ'), ('\u{1a17}', '\u{1a1b}'), ('ᩕ', '\u{1a5e}'), ('ᩡ', '\u{1a74}'), ('\u{1abf}', '\u{1ac0}'), ('\u{1acc}', '\u{1ace}'), ('\u{1b00}', 'ᬄ'), ('\u{1b35}', 'ᭃ'), ('\u{1b80}', 'ᮂ'), ('ᮡ', '\u{1ba9}'), ('\u{1bac}', '\u{1bad}'), ('ᯧ', '\u{1bf1}'), ('ᰤ', '\u{1c36}'), ('\u{1de7}', '\u{1df4}'), ('Ⓐ', 'ⓩ'), ('\u{2de0}', '\u{2dff}'), ('\u{a674}', '\u{a67b}'), ('\u{a69e}', '\u{a69f}'), ('\u{a802}', '\u{a802}'), ('\u{a80b}', '\u{a80b}'), ('ꠣ', 'ꠧ'), ('ꢀ', 'ꢁ'), ('ꢴ', 'ꣃ'), ('\u{a8c5}', '\u{a8c5}'), ('\u{a8ff}', '\u{a8ff}'), ('\u{a926}', '\u{a92a}'), ('\u{a947}', 'ꥒ'), ('\u{a980}', 'ꦃ'), ('ꦴ', 'ꦿ'), ('\u{a9e5}', '\u{a9e5}'), ('\u{aa29}', '\u{aa36}'), ('\u{aa43}', '\u{aa43}'), ('\u{aa4c}', 'ꩍ'), ('ꩻ', 'ꩽ'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabe}'), ('ꫫ', 'ꫯ'), ('ꫵ', 'ꫵ'), ('ꯣ', 'ꯪ'), ('\u{fb1e}', '\u{fb1e}'), ('\u{10376}', '\u{1037a}'), ('\u{10a01}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '\u{10a0f}'), ('\u{10d24}', '\u{10d27}'), ('\u{10eab}', '\u{10eac}'), ('𑀀', '𑀂'), ('\u{11038}', '\u{11045}'), ('\u{11073}', '\u{11074}'), ('\u{11080}', '𑂂'), ('𑂰', '𑂸'), ('\u{110c2}', '\u{110c2}'), ('\u{11100}', '\u{11102}'), ('\u{11127}', '\u{11132}'), ('𑅅', '𑅆'), ('\u{11180}', '𑆂'), ('𑆳', '𑆿'), ('𑇎', '\u{111cf}'), ('𑈬', '\u{11234}'), ('\u{11237}', '\u{11237}'), ('\u{1123e}', '\u{1123e}'), ('\u{11241}', '\u{11241}'), ('\u{112df}', '\u{112e8}'), ('\u{11300}', '𑌃'), 
('\u{1133e}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍌'), ('\u{11357}', '\u{11357}'), ('𑍢', '𑍣'), ('𑐵', '𑑁'), ('\u{11443}', '𑑅'), ('\u{114b0}', '𑓁'), ('\u{115af}', '\u{115b5}'), ('𑖸', '𑖾'), ('\u{115dc}', '\u{115dd}'), ('𑘰', '𑘾'), ('\u{11640}', '\u{11640}'), ('\u{116ab}', '\u{116b5}'), ('\u{1171d}', '\u{1172a}'), ('𑠬', '𑠸'), ('\u{11930}', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{1193c}'), ('𑥀', '𑥀'), ('𑥂', '𑥂'), ('𑧑', '\u{119d7}'), ('\u{119da}', '𑧟'), ('𑧤', '𑧤'), ('\u{11a01}', '\u{11a0a}'), ('\u{11a35}', '𑨹'), ('\u{11a3b}', '\u{11a3e}'), ('\u{11a51}', '\u{11a5b}'), ('\u{11a8a}', '𑪗'), ('𑰯', '\u{11c36}'), ('\u{11c38}', '𑰾'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('\u{11d31}', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d41}'), ('\u{11d43}', '\u{11d43}'), ('\u{11d47}', '\u{11d47}'), ('𑶊', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶖'), ('\u{11ef3}', '𑻶'), ('\u{11f00}', '\u{11f01}'), ('𑼃', '𑼃'), ('𑼴', '\u{11f3a}'), ('𑼾', '\u{11f40}'), ('\u{16f4f}', '\u{16f4f}'), ('𖽑', '𖾇'), ('\u{16f8f}', '\u{16f92}'), ('𖿰', '𖿱'), ('\u{1bc9e}', '\u{1bc9e}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('\u{1e08f}', '\u{1e08f}'), ('\u{1e947}', '\u{1e947}'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), ]; pub const OTHER_DEFAULT_IGNORABLE_CODE_POINT: &'static [(char, char)] = &[ ('\u{34f}', '\u{34f}'), ('ᅟ', 'ᅠ'), ('\u{17b4}', '\u{17b5}'), ('\u{2065}', '\u{2065}'), ('ㅤ', 'ㅤ'), ('ᅠ', 'ᅠ'), ('\u{fff0}', '\u{fff8}'), ('\u{e0000}', '\u{e0000}'), ('\u{e0002}', '\u{e001f}'), ('\u{e0080}', '\u{e00ff}'), ('\u{e01f0}', '\u{e0fff}'), ]; pub const OTHER_GRAPHEME_EXTEND: &'static [(char, char)] = &[ ('\u{9be}', '\u{9be}'), ('\u{9d7}', '\u{9d7}'), ('\u{b3e}', '\u{b3e}'), ('\u{b57}', '\u{b57}'), ('\u{bbe}', '\u{bbe}'), ('\u{bd7}', '\u{bd7}'), ('\u{cc2}', '\u{cc2}'), ('\u{cd5}', '\u{cd6}'), ('\u{d3e}', '\u{d3e}'), ('\u{d57}', '\u{d57}'), ('\u{dcf}', '\u{dcf}'), ('\u{ddf}', 
'\u{ddf}'), ('\u{1b35}', '\u{1b35}'), ('\u{200c}', '\u{200c}'), ('\u{302e}', '\u{302f}'), ('\u{ff9e}', '\u{ff9f}'), ('\u{1133e}', '\u{1133e}'), ('\u{11357}', '\u{11357}'), ('\u{114b0}', '\u{114b0}'), ('\u{114bd}', '\u{114bd}'), ('\u{115af}', '\u{115af}'), ('\u{11930}', '\u{11930}'), ('\u{1d165}', '\u{1d165}'), ('\u{1d16e}', '\u{1d172}'), ('\u{e0020}', '\u{e007f}'), ]; pub const OTHER_ID_CONTINUE: &'static [(char, char)] = &[('·', '·'), ('·', '·'), ('፩', '፱'), ('᧚', '᧚')]; pub const OTHER_ID_START: &'static [(char, char)] = &[('\u{1885}', '\u{1886}'), ('℘', '℘'), ('℮', '℮'), ('゛', '゜')]; pub const OTHER_LOWERCASE: &'static [(char, char)] = &[ ('ª', 'ª'), ('º', 'º'), ('ʰ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), ('\u{345}', '\u{345}'), ('ͺ', 'ͺ'), ('ჼ', 'ჼ'), ('ᴬ', 'ᵪ'), ('ᵸ', 'ᵸ'), ('ᶛ', 'ᶿ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ⅰ', 'ⅿ'), ('ⓐ', 'ⓩ'), ('ⱼ', 'ⱽ'), ('ꚜ', 'ꚝ'), ('ꝰ', 'ꝰ'), ('ꟲ', 'ꟴ'), ('ꟸ', 'ꟹ'), ('ꭜ', 'ꭟ'), ('ꭩ', 'ꭩ'), ('𐞀', '𐞀'), ('𐞃', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𞀰', '𞁭'), ]; pub const OTHER_MATH: &'static [(char, char)] = &[ ('^', '^'), ('ϐ', 'ϒ'), ('ϕ', 'ϕ'), ('ϰ', 'ϱ'), ('ϴ', 'ϵ'), ('‖', '‖'), ('′', '‴'), ('⁀', '⁀'), ('\u{2061}', '\u{2064}'), ('⁽', '⁾'), ('₍', '₎'), ('\u{20d0}', '\u{20dc}'), ('\u{20e1}', '\u{20e1}'), ('\u{20e5}', '\u{20e6}'), ('\u{20eb}', '\u{20ef}'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('ℨ', '℩'), ('ℬ', 'ℭ'), ('ℯ', 'ℱ'), ('ℳ', 'ℸ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('↕', '↙'), ('↜', '↟'), ('↡', '↢'), ('↤', '↥'), ('↧', '↧'), ('↩', '↭'), ('↰', '↱'), ('↶', '↷'), ('↼', '⇍'), ('⇐', '⇑'), ('⇓', '⇓'), ('⇕', '⇛'), ('⇝', '⇝'), ('⇤', '⇥'), ('⌈', '⌋'), ('⎴', '⎵'), ('⎷', '⎷'), ('⏐', '⏐'), ('⏢', '⏢'), ('■', '□'), ('▮', '▶'), ('▼', '◀'), ('◆', '◇'), ('◊', '○'), ('●', '◓'), ('◢', '◢'), ('◤', '◤'), ('◧', '◬'), ('★', '☆'), ('♀', '♀'), ('♂', '♂'), ('♠', '♣'), ('♭', '♮'), ('⟅', '⟆'), ('⟦', '⟯'), ('⦃', '⦘'), ('⧘', '⧛'), ('⧼', '⧽'), ('﹡', '﹡'), ('﹣', '﹣'), ('﹨', '﹨'), ('\', '\'), ('^', '^'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), 
('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ]; pub const OTHER_UPPERCASE: &'static [(char, char)] = &[('Ⅰ', 'Ⅿ'), ('Ⓐ', 'Ⓩ'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉')]; pub const PATTERN_SYNTAX: &'static [(char, char)] = &[ ('!', '/'), (':', '@'), ('[', '^'), ('`', '`'), ('{', '~'), ('¡', '§'), ('©', '©'), ('«', '¬'), ('®', '®'), ('°', '±'), ('¶', '¶'), ('»', '»'), ('¿', '¿'), ('×', '×'), ('÷', '÷'), ('‐', '‧'), ('‰', '‾'), ('⁁', '⁓'), ('⁕', '⁞'), ('←', '\u{245f}'), ('─', '❵'), ('➔', '⯿'), ('⸀', '\u{2e7f}'), ('、', '〃'), ('〈', '〠'), ('〰', '〰'), ('﴾', '﴿'), ('﹅', '﹆'), ]; pub const PATTERN_WHITE_SPACE: &'static [(char, char)] = &[ ('\t', '\r'), (' ', ' '), ('\u{85}', '\u{85}'), ('\u{200e}', '\u{200f}'), ('\u{2028}', '\u{2029}'), ]; pub const PREPENDED_CONCATENATION_MARK: &'static [(char, char)] = &[ ('\u{600}', '\u{605}'), ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), ('\u{890}', '\u{891}'), ('\u{8e2}', '\u{8e2}'), ('\u{110bd}', '\u{110bd}'), ('\u{110cd}', '\u{110cd}'), ]; pub const QUOTATION_MARK: &'static [(char, char)] = &[ ('"', '"'), ('\'', '\''), ('«', '«'), ('»', '»'), ('‘', '‟'), ('‹', '›'), ('⹂', '⹂'), ('「', '』'), ('〝', '〟'), ('﹁', '﹄'), ('"', '"'), (''', '''), ('「', '」'), ]; pub const RADICAL: &'static [(char, char)] = &[('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕')]; pub const REGIONAL_INDICATOR: 
&'static [(char, char)] = &[('🇦', '🇿')]; pub const SENTENCE_TERMINAL: &'static [(char, char)] = &[ ('!', '!'), ('.', '.'), ('?', '?'), ('։', '։'), ('؝', '؟'), ('۔', '۔'), ('܀', '܂'), ('߹', '߹'), ('࠷', '࠷'), ('࠹', '࠹'), ('࠽', '࠾'), ('।', '॥'), ('၊', '။'), ('።', '።'), ('፧', '፨'), ('᙮', '᙮'), ('᜵', '᜶'), ('᠃', '᠃'), ('᠉', '᠉'), ('᥄', '᥅'), ('᪨', '᪫'), ('᭚', '᭛'), ('᭞', '᭟'), ('᭽', '᭾'), ('᰻', '᰼'), ('᱾', '᱿'), ('‼', '‽'), ('⁇', '⁉'), ('⸮', '⸮'), ('⸼', '⸼'), ('⹓', '⹔'), ('。', '。'), ('꓿', '꓿'), ('꘎', '꘏'), ('꛳', '꛳'), ('꛷', '꛷'), ('꡶', '꡷'), ('꣎', '꣏'), ('꤯', '꤯'), ('꧈', '꧉'), ('꩝', '꩟'), ('꫰', '꫱'), ('꯫', '꯫'), ('﹒', '﹒'), ('﹖', '﹗'), ('!', '!'), ('.', '.'), ('?', '?'), ('。', '。'), ('𐩖', '𐩗'), ('𐽕', '𐽙'), ('𐾆', '𐾉'), ('𑁇', '𑁈'), ('𑂾', '𑃁'), ('𑅁', '𑅃'), ('𑇅', '𑇆'), ('𑇍', '𑇍'), ('𑇞', '𑇟'), ('𑈸', '𑈹'), ('𑈻', '𑈼'), ('𑊩', '𑊩'), ('𑑋', '𑑌'), ('𑗂', '𑗃'), ('𑗉', '𑗗'), ('𑙁', '𑙂'), ('𑜼', '𑜾'), ('𑥄', '𑥄'), ('𑥆', '𑥆'), ('𑩂', '𑩃'), ('𑪛', '𑪜'), ('𑱁', '𑱂'), ('𑻷', '𑻸'), ('𑽃', '𑽄'), ('𖩮', '𖩯'), ('𖫵', '𖫵'), ('𖬷', '𖬸'), ('𖭄', '𖭄'), ('𖺘', '𖺘'), ('𛲟', '𛲟'), ('𝪈', '𝪈'), ]; pub const SOFT_DOTTED: &'static [(char, char)] = &[ ('i', 'j'), ('į', 'į'), ('ɉ', 'ɉ'), ('ɨ', 'ɨ'), ('ʝ', 'ʝ'), ('ʲ', 'ʲ'), ('ϳ', 'ϳ'), ('і', 'і'), ('ј', 'ј'), ('ᵢ', 'ᵢ'), ('ᶖ', 'ᶖ'), ('ᶤ', 'ᶤ'), ('ᶨ', 'ᶨ'), ('ḭ', 'ḭ'), ('ị', 'ị'), ('ⁱ', 'ⁱ'), ('ⅈ', 'ⅉ'), ('ⱼ', 'ⱼ'), ('𝐢', '𝐣'), ('𝑖', '𝑗'), ('𝒊', '𝒋'), ('𝒾', '𝒿'), ('𝓲', '𝓳'), ('𝔦', '𝔧'), ('𝕚', '𝕛'), ('𝖎', '𝖏'), ('𝗂', '𝗃'), ('𝗶', '𝗷'), ('𝘪', '𝘫'), ('𝙞', '𝙟'), ('𝚒', '𝚓'), ('𝼚', '𝼚'), ('𞁌', '𞁍'), ('𞁨', '𞁨'), ]; pub const TERMINAL_PUNCTUATION: &'static [(char, char)] = &[ ('!', '!'), (',', ','), ('.', '.'), (':', ';'), ('?', '?'), (';', ';'), ('·', '·'), ('։', '։'), ('׃', '׃'), ('،', '،'), ('؛', '؛'), ('؝', '؟'), ('۔', '۔'), ('܀', '܊'), ('܌', '܌'), ('߸', '߹'), ('࠰', '࠾'), ('࡞', '࡞'), ('।', '॥'), ('๚', '๛'), ('༈', '༈'), ('།', '༒'), ('၊', '။'), ('፡', '፨'), ('᙮', '᙮'), ('᛫', '᛭'), ('᜵', '᜶'), ('។', '៖'), ('៚', '៚'), ('᠂', '᠅'), ('᠈', '᠉'), ('᥄', '᥅'), ('᪨', '᪫'), ('᭚', '᭛'), ('᭝', 
'᭟'), ('᭽', '᭾'), ('᰻', '᰿'), ('᱾', '᱿'), ('‼', '‽'), ('⁇', '⁉'), ('⸮', '⸮'), ('⸼', '⸼'), ('⹁', '⹁'), ('⹌', '⹌'), ('⹎', '⹏'), ('⹓', '⹔'), ('、', '。'), ('꓾', '꓿'), ('꘍', '꘏'), ('꛳', '꛷'), ('꡶', '꡷'), ('꣎', '꣏'), ('꤯', '꤯'), ('꧇', '꧉'), ('꩝', '꩟'), ('꫟', '꫟'), ('꫰', '꫱'), ('꯫', '꯫'), ('﹐', '﹒'), ('﹔', '﹗'), ('!', '!'), (',', ','), ('.', '.'), (':', ';'), ('?', '?'), ('。', '。'), ('、', '、'), ('𐎟', '𐎟'), ('𐏐', '𐏐'), ('𐡗', '𐡗'), ('𐤟', '𐤟'), ('𐩖', '𐩗'), ('𐫰', '𐫵'), ('𐬺', '𐬿'), ('𐮙', '𐮜'), ('𐽕', '𐽙'), ('𐾆', '𐾉'), ('𑁇', '𑁍'), ('𑂾', '𑃁'), ('𑅁', '𑅃'), ('𑇅', '𑇆'), ('𑇍', '𑇍'), ('𑇞', '𑇟'), ('𑈸', '𑈼'), ('𑊩', '𑊩'), ('𑑋', '𑑍'), ('𑑚', '𑑛'), ('𑗂', '𑗅'), ('𑗉', '𑗗'), ('𑙁', '𑙂'), ('𑜼', '𑜾'), ('𑥄', '𑥄'), ('𑥆', '𑥆'), ('𑩂', '𑩃'), ('𑪛', '𑪜'), ('𑪡', '𑪢'), ('𑱁', '𑱃'), ('𑱱', '𑱱'), ('𑻷', '𑻸'), ('𑽃', '𑽄'), ('𒑰', '𒑴'), ('𖩮', '𖩯'), ('𖫵', '𖫵'), ('𖬷', '𖬹'), ('𖭄', '𖭄'), ('𖺗', '𖺘'), ('𛲟', '𛲟'), ('𝪇', '𝪊'), ]; pub const UNIFIED_IDEOGRAPH: &'static [(char, char)] = &[ ('㐀', '䶿'), ('一', '鿿'), ('﨎', '﨏'), ('﨑', '﨑'), ('﨓', '﨔'), ('﨟', '﨟'), ('﨡', '﨡'), ('﨣', '﨤'), ('﨧', '﨩'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const UPPERCASE: &'static [(char, char)] = &[ ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 
'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'DŽ'), ('LJ', 'LJ'), ('NJ', 'NJ'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'DZ'), ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), ('ϒ', 'ϔ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 
'Ӥ'), ('Ӧ', 'Ӧ'), ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('Ꭰ', 'Ᏽ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 'Ṋ'), ('Ṍ', 'Ṍ'), ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), 
('Ỽ', 'Ỽ'), ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('Ᾰ', 'Ά'), ('Ὲ', 'Ή'), ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'Ώ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℋ', 'ℍ'), ('ℐ', 'ℒ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℰ', 'ℳ'), ('ℾ', 'ℿ'), ('ⅅ', 'ⅅ'), ('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), ('Ⓐ', 'Ⓩ'), ('Ⰰ', 'Ⱟ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), ('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 
'Ꝩ'), ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('Ꞹ', 'Ꞹ'), ('Ꞻ', 'Ꞻ'), ('Ꞽ', 'Ꞽ'), ('Ꞿ', 'Ꞿ'), ('Ꟁ', 'Ꟁ'), ('Ꟃ', 'Ꟃ'), ('Ꞔ', 'Ꟈ'), ('Ꟊ', 'Ꟊ'), ('Ꟑ', 'Ꟑ'), ('Ꟗ', 'Ꟗ'), ('Ꟙ', 'Ꟙ'), ('Ꟶ', 'Ꟶ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𖹀', '𖹟'), ('𝐀', '𝐙'), ('𝐴', '𝑍'), ('𝑨', '𝒁'), ('𝒜', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒵'), ('𝓐', '𝓩'), ('𝔄', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔸', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕬', '𝖅'), ('𝖠', '𝖹'), ('𝗔', '𝗭'), ('𝘈', '𝘡'), ('𝘼', '𝙕'), ('𝙰', '𝚉'), ('𝚨', '𝛀'), ('𝛢', '𝛺'), ('𝜜', '𝜴'), ('𝝖', '𝝮'), ('𝞐', '𝞨'), ('𝟊', '𝟊'), ('𞤀', '𞤡'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), ]; pub const VARIATION_SELECTOR: &'static [(char, char)] = &[ ('\u{180b}', '\u{180d}'), ('\u{180f}', '\u{180f}'), ('\u{fe00}', '\u{fe0f}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const WHITE_SPACE: &'static [(char, char)] = &[ ('\t', '\r'), (' ', ' '), ('\u{85}', '\u{85}'), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'), ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), ]; pub const XID_CONTINUE: &'static [(char, char)] = &[ ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('·', '·'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('\u{300}', 'ʹ'), ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('\u{483}', '\u{487}'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', 
'\u{5c7}'), ('א', 'ת'), ('ׯ', 'ײ'), ('\u{610}', '\u{61a}'), ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', '\u{6dc}'), ('\u{6df}', '\u{6e8}'), ('\u{6ea}', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '\u{74a}'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), ('ߺ', 'ߺ'), ('\u{7fd}', '\u{7fd}'), ('ࠀ', '\u{82d}'), ('ࡀ', '\u{85b}'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('\u{898}', '\u{8e1}'), ('\u{8e3}', '\u{963}'), ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('\u{9bc}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('০', 'ৱ'), ('ৼ', 'ৼ'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', '\u{a75}'), ('\u{a81}', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('ૐ', 'ૐ'), ('ૠ', '\u{ae3}'), ('૦', '૯'), ('ૹ', '\u{aff}'), ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('\u{b3c}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', '\u{b63}'), ('୦', '୯'), ('ୱ', 'ୱ'), ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('ௐ', 'ௐ'), ('\u{bd7}', '\u{bd7}'), ('௦', '௯'), ('\u{c00}', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('\u{c3c}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', '\u{c63}'), ('౦', '౯'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('\u{cbc}', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('ೝ', 'ೞ'), ('ೠ', '\u{ce3}'), ('೦', '೯'), ('ೱ', 'ೳ'), ('\u{d00}', 'ഌ'), ('എ', 'ഐ'), ('ഒ', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', 'ൎ'), 
('ൔ', '\u{d57}'), ('ൟ', '\u{d63}'), ('൦', '൯'), ('ൺ', 'ൿ'), ('\u{d81}', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('෦', '෯'), ('ෲ', 'ෳ'), ('ก', '\u{e3a}'), ('เ', '\u{e4e}'), ('๐', '๙'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ece}'), ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('\u{f18}', '\u{f19}'), ('༠', '༩'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('༾', 'ཇ'), ('ཉ', 'ཬ'), ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('က', '၉'), ('ၐ', '\u{109d}'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('\u{135d}', '\u{135f}'), ('፩', '፱'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', '᜕'), ('ᜟ', '᜴'), ('ᝀ', '\u{1753}'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}'), ('ក', '\u{17d3}'), ('ៗ', 'ៗ'), ('ៜ', '\u{17dd}'), ('០', '៩'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('᥆', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('ᨀ', '\u{1a1b}'), ('ᨠ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'), ('\u{1ab0}', '\u{1abd}'), ('\u{1abf}', '\u{1ace}'), ('\u{1b00}', 'ᭌ'), ('᭐', '᭙'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '᯳'), ('ᰀ', '\u{1c37}'), ('᱀', '᱉'), ('ᱍ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', 'ᳺ'), ('ᴀ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), 
('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('‿', '⁀'), ('⁔', '⁔'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('\u{20d0}', '\u{20dc}'), ('\u{20e1}', '\u{20e1}'), ('\u{20e5}', '\u{20f0}'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('\u{2d7f}', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('\u{2de0}', '\u{2dff}'), ('々', '〇'), ('〡', '\u{302f}'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('\u{3099}', '\u{309a}'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), ('Ꙁ', '\u{a66f}'), ('\u{a674}', '\u{a67d}'), ('ꙿ', '\u{a6f1}'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠧ'), ('\u{a82c}', '\u{a82c}'), ('ꡀ', 'ꡳ'), ('ꢀ', '\u{a8c5}'), ('꣐', '꣙'), ('\u{a8e0}', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', '\u{a92d}'), ('ꤰ', '꥓'), ('ꥠ', 'ꥼ'), ('\u{a980}', '꧀'), ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'), ('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), ('ꫲ', '\u{aaf6}'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꯪ'), ('꯬', '\u{abed}'), ('꯰', '꯹'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﱝ'), ('ﱤ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷹ'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('︳', '︴'), ('﹍', '﹏'), ('ﹱ', 'ﹱ'), ('ﹳ', 'ﹳ'), ('ﹷ', 'ﹷ'), ('ﹹ', 'ﹹ'), ('ﹻ', 'ﹻ'), ('ﹽ', 'ﹽ'), ('ﹿ', 'ﻼ'), ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('\u{101fd}', '\u{101fd}'), ('𐊀', '𐊜'), 
('𐊠', '𐋐'), ('\u{102e0}', '\u{102e0}'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '\u{1037a}'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '\u{10ae6}'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '\u{10d27}'), ('𐴰', '𐴹'), ('𐺀', '𐺩'), ('\u{10eab}', '\u{10eac}'), ('𐺰', '𐺱'), ('\u{10efd}', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '\u{10f50}'), ('𐽰', '\u{10f85}'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀀', '\u{11046}'), ('𑁦', '𑁵'), ('\u{1107f}', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑄿'), ('𑅄', '𑅇'), ('𑅐', '\u{11173}'), ('𑅶', '𑅶'), ('\u{11180}', '𑇄'), ('\u{111c9}', '\u{111cc}'), ('𑇎', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '\u{11237}'), ('\u{1123e}', '\u{11241}'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹'), ('\u{11300}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('\u{1133b}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('\u{11357}', '\u{11357}'), ('𑍝', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑐀', '𑑊'), ('𑑐', '𑑙'), ('\u{1145e}', '𑑡'), ('𑒀', '𑓅'), ('𑓇', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '\u{115b5}'), ('𑖸', '\u{115c0}'), ('𑗘', '\u{115dd}'), ('𑘀', '\u{11640}'), ('𑙄', '𑙄'), ('𑙐', '𑙙'), ('𑚀', '𑚸'), ('𑛀', '𑛉'), ('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑜹'), ('𑝀', '𑝆'), ('𑠀', '\u{1183a}'), ('𑢠', '𑣩'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), 
('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{11943}'), ('𑥐', '𑥙'), ('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧡'), ('𑧣', '𑧤'), ('𑨀', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('𑩐', '\u{11a99}'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱀'), ('𑱐', '𑱙'), ('𑱲', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d47}'), ('𑵐', '𑵙'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶘'), ('𑶠', '𑶩'), ('𑻠', '𑻶'), ('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '\u{11f42}'), ('𑽐', '𑽙'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('\u{13440}', '\u{13455}'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩰', '𖪾'), ('𖫀', '𖫉'), ('𖫐', '𖫭'), ('\u{16af0}', '\u{16af4}'), ('𖬀', '\u{16b36}'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '\u{16fe4}'), ('𖿰', '𖿱'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d169}'), ('𝅭', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), 
('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅎'), ('𞊐', '\u{1e2ae}'), ('𞋀', '𞋹'), ('𞓐', '𞓹'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('\u{1e8d0}', '\u{1e8d6}'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('🯰', '🯹'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ('\u{e0100}', '\u{e01ef}'), ]; pub const XID_START: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͻ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('א', 'ת'), ('ׯ', 'ײ'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('ࢠ', 'ࣉ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 
'ਫ਼'), ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೝ', 'ೞ'), ('ೠ', 'ೡ'), ('ೱ', 'ೲ'), ('ഄ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'า'), ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ະ'), ('າ', 'າ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜑ'), ('ᜟ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᪧ', 'ᪧ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭌ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᳺ', 'ᳺ'), ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 
'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('℘', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('々', '〇'), ('〡', '〩'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), ('ꚠ', 'ꛯ'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣾ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'יִ'), ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﱝ'), ('ﱤ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷹ'), ('ﹱ', 'ﹱ'), ('ﹳ', 'ﹳ'), ('ﹷ', 'ﹷ'), ('ﹹ', 'ﹹ'), ('ﹻ', 'ﹻ'), ('ﹽ', 'ﹽ'), ('ﹿ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ヲ', 'ン'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕰', '𐕺'), ('𐕼', 
'𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '𐴣'), ('𐺀', '𐺩'), ('𐺰', '𐺱'), ('𐼀', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '𐽅'), ('𐽰', '𐾁'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀃', '𑀷'), ('𑁱', '𑁲'), ('𑁵', '𑁵'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅄', '𑅄'), ('𑅇', '𑅇'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑈿', '𑉀'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑑟', '𑑡'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑚸', '𑚸'), ('𑜀', '𑜚'), ('𑝀', '𑝆'), ('𑠀', '𑠫'), ('𑢠', '𑣟'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤯'), ('𑤿', '𑤿'), ('𑥁', '𑥁'), ('𑦠', '𑦧'), ('𑦪', '𑧐'), ('𑧡', '𑧡'), ('𑧣', '𑧣'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪉'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶉'), ('𑶘', '𑶘'), ('𑻠', '𑻲'), ('𑼂', '𑼂'), ('𑼄', '𑼐'), ('𑼒', '𑼳'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('𓑁', '𓑆'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩰', '𖪾'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '𖿣'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), 
('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('𞄀', '𞄬'), ('𞄷', '𞄽'), ('𞅎', '𞅎'), ('𞊐', '𞊭'), ('𞋀', '𞋫'), ('𞓐', '𞓫'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), ('𞥋', '𞥋'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; regex-syntax-0.8.2/src/unicode_tables/property_names.rs000064400000000000000000000240031046102023000214540ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate property-names ucd-15.0.0 // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. 
pub const PROPERTY_NAMES: &'static [(&'static str, &'static str)] = &[ ("age", "Age"), ("ahex", "ASCII_Hex_Digit"), ("alpha", "Alphabetic"), ("alphabetic", "Alphabetic"), ("asciihexdigit", "ASCII_Hex_Digit"), ("bc", "Bidi_Class"), ("bidic", "Bidi_Control"), ("bidiclass", "Bidi_Class"), ("bidicontrol", "Bidi_Control"), ("bidim", "Bidi_Mirrored"), ("bidimirrored", "Bidi_Mirrored"), ("bidimirroringglyph", "Bidi_Mirroring_Glyph"), ("bidipairedbracket", "Bidi_Paired_Bracket"), ("bidipairedbrackettype", "Bidi_Paired_Bracket_Type"), ("blk", "Block"), ("block", "Block"), ("bmg", "Bidi_Mirroring_Glyph"), ("bpb", "Bidi_Paired_Bracket"), ("bpt", "Bidi_Paired_Bracket_Type"), ("canonicalcombiningclass", "Canonical_Combining_Class"), ("cased", "Cased"), ("casefolding", "Case_Folding"), ("caseignorable", "Case_Ignorable"), ("ccc", "Canonical_Combining_Class"), ("ce", "Composition_Exclusion"), ("cf", "Case_Folding"), ("changeswhencasefolded", "Changes_When_Casefolded"), ("changeswhencasemapped", "Changes_When_Casemapped"), ("changeswhenlowercased", "Changes_When_Lowercased"), ("changeswhennfkccasefolded", "Changes_When_NFKC_Casefolded"), ("changeswhentitlecased", "Changes_When_Titlecased"), ("changeswhenuppercased", "Changes_When_Uppercased"), ("ci", "Case_Ignorable"), ("cjkaccountingnumeric", "kAccountingNumeric"), ("cjkcompatibilityvariant", "kCompatibilityVariant"), ("cjkiicore", "kIICore"), ("cjkirggsource", "kIRG_GSource"), ("cjkirghsource", "kIRG_HSource"), ("cjkirgjsource", "kIRG_JSource"), ("cjkirgkpsource", "kIRG_KPSource"), ("cjkirgksource", "kIRG_KSource"), ("cjkirgmsource", "kIRG_MSource"), ("cjkirgssource", "kIRG_SSource"), ("cjkirgtsource", "kIRG_TSource"), ("cjkirguksource", "kIRG_UKSource"), ("cjkirgusource", "kIRG_USource"), ("cjkirgvsource", "kIRG_VSource"), ("cjkothernumeric", "kOtherNumeric"), ("cjkprimarynumeric", "kPrimaryNumeric"), ("cjkrsunicode", "kRSUnicode"), ("compex", "Full_Composition_Exclusion"), ("compositionexclusion", "Composition_Exclusion"), 
("cwcf", "Changes_When_Casefolded"), ("cwcm", "Changes_When_Casemapped"), ("cwkcf", "Changes_When_NFKC_Casefolded"), ("cwl", "Changes_When_Lowercased"), ("cwt", "Changes_When_Titlecased"), ("cwu", "Changes_When_Uppercased"), ("dash", "Dash"), ("decompositionmapping", "Decomposition_Mapping"), ("decompositiontype", "Decomposition_Type"), ("defaultignorablecodepoint", "Default_Ignorable_Code_Point"), ("dep", "Deprecated"), ("deprecated", "Deprecated"), ("di", "Default_Ignorable_Code_Point"), ("dia", "Diacritic"), ("diacritic", "Diacritic"), ("dm", "Decomposition_Mapping"), ("dt", "Decomposition_Type"), ("ea", "East_Asian_Width"), ("eastasianwidth", "East_Asian_Width"), ("ebase", "Emoji_Modifier_Base"), ("ecomp", "Emoji_Component"), ("emod", "Emoji_Modifier"), ("emoji", "Emoji"), ("emojicomponent", "Emoji_Component"), ("emojimodifier", "Emoji_Modifier"), ("emojimodifierbase", "Emoji_Modifier_Base"), ("emojipresentation", "Emoji_Presentation"), ("epres", "Emoji_Presentation"), ("equideo", "Equivalent_Unified_Ideograph"), ("equivalentunifiedideograph", "Equivalent_Unified_Ideograph"), ("expandsonnfc", "Expands_On_NFC"), ("expandsonnfd", "Expands_On_NFD"), ("expandsonnfkc", "Expands_On_NFKC"), ("expandsonnfkd", "Expands_On_NFKD"), ("ext", "Extender"), ("extendedpictographic", "Extended_Pictographic"), ("extender", "Extender"), ("extpict", "Extended_Pictographic"), ("fcnfkc", "FC_NFKC_Closure"), ("fcnfkcclosure", "FC_NFKC_Closure"), ("fullcompositionexclusion", "Full_Composition_Exclusion"), ("gc", "General_Category"), ("gcb", "Grapheme_Cluster_Break"), ("generalcategory", "General_Category"), ("graphemebase", "Grapheme_Base"), ("graphemeclusterbreak", "Grapheme_Cluster_Break"), ("graphemeextend", "Grapheme_Extend"), ("graphemelink", "Grapheme_Link"), ("grbase", "Grapheme_Base"), ("grext", "Grapheme_Extend"), ("grlink", "Grapheme_Link"), ("hangulsyllabletype", "Hangul_Syllable_Type"), ("hex", "Hex_Digit"), ("hexdigit", "Hex_Digit"), ("hst", "Hangul_Syllable_Type"), 
("hyphen", "Hyphen"), ("idc", "ID_Continue"), ("idcontinue", "ID_Continue"), ("ideo", "Ideographic"), ("ideographic", "Ideographic"), ("ids", "ID_Start"), ("idsb", "IDS_Binary_Operator"), ("idsbinaryoperator", "IDS_Binary_Operator"), ("idst", "IDS_Trinary_Operator"), ("idstart", "ID_Start"), ("idstrinaryoperator", "IDS_Trinary_Operator"), ("indicpositionalcategory", "Indic_Positional_Category"), ("indicsyllabiccategory", "Indic_Syllabic_Category"), ("inpc", "Indic_Positional_Category"), ("insc", "Indic_Syllabic_Category"), ("isc", "ISO_Comment"), ("jamoshortname", "Jamo_Short_Name"), ("jg", "Joining_Group"), ("joinc", "Join_Control"), ("joincontrol", "Join_Control"), ("joininggroup", "Joining_Group"), ("joiningtype", "Joining_Type"), ("jsn", "Jamo_Short_Name"), ("jt", "Joining_Type"), ("kaccountingnumeric", "kAccountingNumeric"), ("kcompatibilityvariant", "kCompatibilityVariant"), ("kiicore", "kIICore"), ("kirggsource", "kIRG_GSource"), ("kirghsource", "kIRG_HSource"), ("kirgjsource", "kIRG_JSource"), ("kirgkpsource", "kIRG_KPSource"), ("kirgksource", "kIRG_KSource"), ("kirgmsource", "kIRG_MSource"), ("kirgssource", "kIRG_SSource"), ("kirgtsource", "kIRG_TSource"), ("kirguksource", "kIRG_UKSource"), ("kirgusource", "kIRG_USource"), ("kirgvsource", "kIRG_VSource"), ("kothernumeric", "kOtherNumeric"), ("kprimarynumeric", "kPrimaryNumeric"), ("krsunicode", "kRSUnicode"), ("lb", "Line_Break"), ("lc", "Lowercase_Mapping"), ("linebreak", "Line_Break"), ("loe", "Logical_Order_Exception"), ("logicalorderexception", "Logical_Order_Exception"), ("lower", "Lowercase"), ("lowercase", "Lowercase"), ("lowercasemapping", "Lowercase_Mapping"), ("math", "Math"), ("na", "Name"), ("na1", "Unicode_1_Name"), ("name", "Name"), ("namealias", "Name_Alias"), ("nchar", "Noncharacter_Code_Point"), ("nfcqc", "NFC_Quick_Check"), ("nfcquickcheck", "NFC_Quick_Check"), ("nfdqc", "NFD_Quick_Check"), ("nfdquickcheck", "NFD_Quick_Check"), ("nfkccasefold", "NFKC_Casefold"), ("nfkccf", 
"NFKC_Casefold"), ("nfkcqc", "NFKC_Quick_Check"), ("nfkcquickcheck", "NFKC_Quick_Check"), ("nfkdqc", "NFKD_Quick_Check"), ("nfkdquickcheck", "NFKD_Quick_Check"), ("noncharactercodepoint", "Noncharacter_Code_Point"), ("nt", "Numeric_Type"), ("numerictype", "Numeric_Type"), ("numericvalue", "Numeric_Value"), ("nv", "Numeric_Value"), ("oalpha", "Other_Alphabetic"), ("ocomment", "ISO_Comment"), ("odi", "Other_Default_Ignorable_Code_Point"), ("ogrext", "Other_Grapheme_Extend"), ("oidc", "Other_ID_Continue"), ("oids", "Other_ID_Start"), ("olower", "Other_Lowercase"), ("omath", "Other_Math"), ("otheralphabetic", "Other_Alphabetic"), ("otherdefaultignorablecodepoint", "Other_Default_Ignorable_Code_Point"), ("othergraphemeextend", "Other_Grapheme_Extend"), ("otheridcontinue", "Other_ID_Continue"), ("otheridstart", "Other_ID_Start"), ("otherlowercase", "Other_Lowercase"), ("othermath", "Other_Math"), ("otheruppercase", "Other_Uppercase"), ("oupper", "Other_Uppercase"), ("patsyn", "Pattern_Syntax"), ("patternsyntax", "Pattern_Syntax"), ("patternwhitespace", "Pattern_White_Space"), ("patws", "Pattern_White_Space"), ("pcm", "Prepended_Concatenation_Mark"), ("prependedconcatenationmark", "Prepended_Concatenation_Mark"), ("qmark", "Quotation_Mark"), ("quotationmark", "Quotation_Mark"), ("radical", "Radical"), ("regionalindicator", "Regional_Indicator"), ("ri", "Regional_Indicator"), ("sb", "Sentence_Break"), ("sc", "Script"), ("scf", "Simple_Case_Folding"), ("script", "Script"), ("scriptextensions", "Script_Extensions"), ("scx", "Script_Extensions"), ("sd", "Soft_Dotted"), ("sentencebreak", "Sentence_Break"), ("sentenceterminal", "Sentence_Terminal"), ("sfc", "Simple_Case_Folding"), ("simplecasefolding", "Simple_Case_Folding"), ("simplelowercasemapping", "Simple_Lowercase_Mapping"), ("simpletitlecasemapping", "Simple_Titlecase_Mapping"), ("simpleuppercasemapping", "Simple_Uppercase_Mapping"), ("slc", "Simple_Lowercase_Mapping"), ("softdotted", "Soft_Dotted"), ("space", 
"White_Space"), ("stc", "Simple_Titlecase_Mapping"), ("sterm", "Sentence_Terminal"), ("suc", "Simple_Uppercase_Mapping"), ("tc", "Titlecase_Mapping"), ("term", "Terminal_Punctuation"), ("terminalpunctuation", "Terminal_Punctuation"), ("titlecasemapping", "Titlecase_Mapping"), ("uc", "Uppercase_Mapping"), ("uideo", "Unified_Ideograph"), ("unicode1name", "Unicode_1_Name"), ("unicoderadicalstroke", "kRSUnicode"), ("unifiedideograph", "Unified_Ideograph"), ("upper", "Uppercase"), ("uppercase", "Uppercase"), ("uppercasemapping", "Uppercase_Mapping"), ("urs", "kRSUnicode"), ("variationselector", "Variation_Selector"), ("verticalorientation", "Vertical_Orientation"), ("vo", "Vertical_Orientation"), ("vs", "Variation_Selector"), ("wb", "Word_Break"), ("whitespace", "White_Space"), ("wordbreak", "Word_Break"), ("wspace", "White_Space"), ("xidc", "XID_Continue"), ("xidcontinue", "XID_Continue"), ("xids", "XID_Start"), ("xidstart", "XID_Start"), ("xonfc", "Expands_On_NFC"), ("xonfd", "Expands_On_NFD"), ("xonfkc", "Expands_On_NFKC"), ("xonfkd", "Expands_On_NFKD"), ]; regex-syntax-0.8.2/src/unicode_tables/property_values.rs000064400000000000000000000777551046102023000216760ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate property-values ucd-15.0.0 --include gc,script,scx,age,gcb,wb,sb // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. 
pub const PROPERTY_VALUES: &'static [( &'static str, &'static [(&'static str, &'static str)], )] = &[ ( "Age", &[ ("1.1", "V1_1"), ("10.0", "V10_0"), ("11.0", "V11_0"), ("12.0", "V12_0"), ("12.1", "V12_1"), ("13.0", "V13_0"), ("14.0", "V14_0"), ("15.0", "V15_0"), ("2.0", "V2_0"), ("2.1", "V2_1"), ("3.0", "V3_0"), ("3.1", "V3_1"), ("3.2", "V3_2"), ("4.0", "V4_0"), ("4.1", "V4_1"), ("5.0", "V5_0"), ("5.1", "V5_1"), ("5.2", "V5_2"), ("6.0", "V6_0"), ("6.1", "V6_1"), ("6.2", "V6_2"), ("6.3", "V6_3"), ("7.0", "V7_0"), ("8.0", "V8_0"), ("9.0", "V9_0"), ("na", "Unassigned"), ("unassigned", "Unassigned"), ("v100", "V10_0"), ("v11", "V1_1"), ("v110", "V11_0"), ("v120", "V12_0"), ("v121", "V12_1"), ("v130", "V13_0"), ("v140", "V14_0"), ("v150", "V15_0"), ("v20", "V2_0"), ("v21", "V2_1"), ("v30", "V3_0"), ("v31", "V3_1"), ("v32", "V3_2"), ("v40", "V4_0"), ("v41", "V4_1"), ("v50", "V5_0"), ("v51", "V5_1"), ("v52", "V5_2"), ("v60", "V6_0"), ("v61", "V6_1"), ("v62", "V6_2"), ("v63", "V6_3"), ("v70", "V7_0"), ("v80", "V8_0"), ("v90", "V9_0"), ], ), ( "General_Category", &[ ("c", "Other"), ("casedletter", "Cased_Letter"), ("cc", "Control"), ("cf", "Format"), ("closepunctuation", "Close_Punctuation"), ("cn", "Unassigned"), ("cntrl", "Control"), ("co", "Private_Use"), ("combiningmark", "Mark"), ("connectorpunctuation", "Connector_Punctuation"), ("control", "Control"), ("cs", "Surrogate"), ("currencysymbol", "Currency_Symbol"), ("dashpunctuation", "Dash_Punctuation"), ("decimalnumber", "Decimal_Number"), ("digit", "Decimal_Number"), ("enclosingmark", "Enclosing_Mark"), ("finalpunctuation", "Final_Punctuation"), ("format", "Format"), ("initialpunctuation", "Initial_Punctuation"), ("l", "Letter"), ("lc", "Cased_Letter"), ("letter", "Letter"), ("letternumber", "Letter_Number"), ("lineseparator", "Line_Separator"), ("ll", "Lowercase_Letter"), ("lm", "Modifier_Letter"), ("lo", "Other_Letter"), ("lowercaseletter", "Lowercase_Letter"), ("lt", "Titlecase_Letter"), ("lu", "Uppercase_Letter"), 
("m", "Mark"), ("mark", "Mark"), ("mathsymbol", "Math_Symbol"), ("mc", "Spacing_Mark"), ("me", "Enclosing_Mark"), ("mn", "Nonspacing_Mark"), ("modifierletter", "Modifier_Letter"), ("modifiersymbol", "Modifier_Symbol"), ("n", "Number"), ("nd", "Decimal_Number"), ("nl", "Letter_Number"), ("no", "Other_Number"), ("nonspacingmark", "Nonspacing_Mark"), ("number", "Number"), ("openpunctuation", "Open_Punctuation"), ("other", "Other"), ("otherletter", "Other_Letter"), ("othernumber", "Other_Number"), ("otherpunctuation", "Other_Punctuation"), ("othersymbol", "Other_Symbol"), ("p", "Punctuation"), ("paragraphseparator", "Paragraph_Separator"), ("pc", "Connector_Punctuation"), ("pd", "Dash_Punctuation"), ("pe", "Close_Punctuation"), ("pf", "Final_Punctuation"), ("pi", "Initial_Punctuation"), ("po", "Other_Punctuation"), ("privateuse", "Private_Use"), ("ps", "Open_Punctuation"), ("punct", "Punctuation"), ("punctuation", "Punctuation"), ("s", "Symbol"), ("sc", "Currency_Symbol"), ("separator", "Separator"), ("sk", "Modifier_Symbol"), ("sm", "Math_Symbol"), ("so", "Other_Symbol"), ("spaceseparator", "Space_Separator"), ("spacingmark", "Spacing_Mark"), ("surrogate", "Surrogate"), ("symbol", "Symbol"), ("titlecaseletter", "Titlecase_Letter"), ("unassigned", "Unassigned"), ("uppercaseletter", "Uppercase_Letter"), ("z", "Separator"), ("zl", "Line_Separator"), ("zp", "Paragraph_Separator"), ("zs", "Space_Separator"), ], ), ( "Grapheme_Cluster_Break", &[ ("cn", "Control"), ("control", "Control"), ("cr", "CR"), ("eb", "E_Base"), ("ebase", "E_Base"), ("ebasegaz", "E_Base_GAZ"), ("ebg", "E_Base_GAZ"), ("em", "E_Modifier"), ("emodifier", "E_Modifier"), ("ex", "Extend"), ("extend", "Extend"), ("gaz", "Glue_After_Zwj"), ("glueafterzwj", "Glue_After_Zwj"), ("l", "L"), ("lf", "LF"), ("lv", "LV"), ("lvt", "LVT"), ("other", "Other"), ("pp", "Prepend"), ("prepend", "Prepend"), ("regionalindicator", "Regional_Indicator"), ("ri", "Regional_Indicator"), ("sm", "SpacingMark"), ("spacingmark", 
"SpacingMark"), ("t", "T"), ("v", "V"), ("xx", "Other"), ("zwj", "ZWJ"), ], ), ( "Script", &[ ("adlam", "Adlam"), ("adlm", "Adlam"), ("aghb", "Caucasian_Albanian"), ("ahom", "Ahom"), ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), ("arab", "Arabic"), ("arabic", "Arabic"), ("armenian", "Armenian"), ("armi", "Imperial_Aramaic"), ("armn", "Armenian"), ("avestan", "Avestan"), ("avst", "Avestan"), ("bali", "Balinese"), ("balinese", "Balinese"), ("bamu", "Bamum"), ("bamum", "Bamum"), ("bass", "Bassa_Vah"), ("bassavah", "Bassa_Vah"), ("batak", "Batak"), ("batk", "Batak"), ("beng", "Bengali"), ("bengali", "Bengali"), ("bhaiksuki", "Bhaiksuki"), ("bhks", "Bhaiksuki"), ("bopo", "Bopomofo"), ("bopomofo", "Bopomofo"), ("brah", "Brahmi"), ("brahmi", "Brahmi"), ("brai", "Braille"), ("braille", "Braille"), ("bugi", "Buginese"), ("buginese", "Buginese"), ("buhd", "Buhid"), ("buhid", "Buhid"), ("cakm", "Chakma"), ("canadianaboriginal", "Canadian_Aboriginal"), ("cans", "Canadian_Aboriginal"), ("cari", "Carian"), ("carian", "Carian"), ("caucasianalbanian", "Caucasian_Albanian"), ("chakma", "Chakma"), ("cham", "Cham"), ("cher", "Cherokee"), ("cherokee", "Cherokee"), ("chorasmian", "Chorasmian"), ("chrs", "Chorasmian"), ("common", "Common"), ("copt", "Coptic"), ("coptic", "Coptic"), ("cpmn", "Cypro_Minoan"), ("cprt", "Cypriot"), ("cuneiform", "Cuneiform"), ("cypriot", "Cypriot"), ("cyprominoan", "Cypro_Minoan"), ("cyrillic", "Cyrillic"), ("cyrl", "Cyrillic"), ("deseret", "Deseret"), ("deva", "Devanagari"), ("devanagari", "Devanagari"), ("diak", "Dives_Akuru"), ("divesakuru", "Dives_Akuru"), ("dogr", "Dogra"), ("dogra", "Dogra"), ("dsrt", "Deseret"), ("dupl", "Duployan"), ("duployan", "Duployan"), ("egyp", "Egyptian_Hieroglyphs"), ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), ("elba", "Elbasan"), ("elbasan", "Elbasan"), ("elym", "Elymaic"), ("elymaic", "Elymaic"), ("ethi", "Ethiopic"), ("ethiopic", "Ethiopic"), ("geor", "Georgian"), ("georgian", "Georgian"), ("glag", 
"Glagolitic"), ("glagolitic", "Glagolitic"), ("gong", "Gunjala_Gondi"), ("gonm", "Masaram_Gondi"), ("goth", "Gothic"), ("gothic", "Gothic"), ("gran", "Grantha"), ("grantha", "Grantha"), ("greek", "Greek"), ("grek", "Greek"), ("gujarati", "Gujarati"), ("gujr", "Gujarati"), ("gunjalagondi", "Gunjala_Gondi"), ("gurmukhi", "Gurmukhi"), ("guru", "Gurmukhi"), ("han", "Han"), ("hang", "Hangul"), ("hangul", "Hangul"), ("hani", "Han"), ("hanifirohingya", "Hanifi_Rohingya"), ("hano", "Hanunoo"), ("hanunoo", "Hanunoo"), ("hatr", "Hatran"), ("hatran", "Hatran"), ("hebr", "Hebrew"), ("hebrew", "Hebrew"), ("hira", "Hiragana"), ("hiragana", "Hiragana"), ("hluw", "Anatolian_Hieroglyphs"), ("hmng", "Pahawh_Hmong"), ("hmnp", "Nyiakeng_Puachue_Hmong"), ("hrkt", "Katakana_Or_Hiragana"), ("hung", "Old_Hungarian"), ("imperialaramaic", "Imperial_Aramaic"), ("inherited", "Inherited"), ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), ("inscriptionalparthian", "Inscriptional_Parthian"), ("ital", "Old_Italic"), ("java", "Javanese"), ("javanese", "Javanese"), ("kaithi", "Kaithi"), ("kali", "Kayah_Li"), ("kana", "Katakana"), ("kannada", "Kannada"), ("katakana", "Katakana"), ("katakanaorhiragana", "Katakana_Or_Hiragana"), ("kawi", "Kawi"), ("kayahli", "Kayah_Li"), ("khar", "Kharoshthi"), ("kharoshthi", "Kharoshthi"), ("khitansmallscript", "Khitan_Small_Script"), ("khmer", "Khmer"), ("khmr", "Khmer"), ("khoj", "Khojki"), ("khojki", "Khojki"), ("khudawadi", "Khudawadi"), ("kits", "Khitan_Small_Script"), ("knda", "Kannada"), ("kthi", "Kaithi"), ("lana", "Tai_Tham"), ("lao", "Lao"), ("laoo", "Lao"), ("latin", "Latin"), ("latn", "Latin"), ("lepc", "Lepcha"), ("lepcha", "Lepcha"), ("limb", "Limbu"), ("limbu", "Limbu"), ("lina", "Linear_A"), ("linb", "Linear_B"), ("lineara", "Linear_A"), ("linearb", "Linear_B"), ("lisu", "Lisu"), ("lyci", "Lycian"), ("lycian", "Lycian"), ("lydi", "Lydian"), ("lydian", "Lydian"), ("mahajani", "Mahajani"), ("mahj", "Mahajani"), ("maka", "Makasar"), ("makasar", 
"Makasar"), ("malayalam", "Malayalam"), ("mand", "Mandaic"), ("mandaic", "Mandaic"), ("mani", "Manichaean"), ("manichaean", "Manichaean"), ("marc", "Marchen"), ("marchen", "Marchen"), ("masaramgondi", "Masaram_Gondi"), ("medefaidrin", "Medefaidrin"), ("medf", "Medefaidrin"), ("meeteimayek", "Meetei_Mayek"), ("mend", "Mende_Kikakui"), ("mendekikakui", "Mende_Kikakui"), ("merc", "Meroitic_Cursive"), ("mero", "Meroitic_Hieroglyphs"), ("meroiticcursive", "Meroitic_Cursive"), ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), ("miao", "Miao"), ("mlym", "Malayalam"), ("modi", "Modi"), ("mong", "Mongolian"), ("mongolian", "Mongolian"), ("mro", "Mro"), ("mroo", "Mro"), ("mtei", "Meetei_Mayek"), ("mult", "Multani"), ("multani", "Multani"), ("myanmar", "Myanmar"), ("mymr", "Myanmar"), ("nabataean", "Nabataean"), ("nagm", "Nag_Mundari"), ("nagmundari", "Nag_Mundari"), ("nand", "Nandinagari"), ("nandinagari", "Nandinagari"), ("narb", "Old_North_Arabian"), ("nbat", "Nabataean"), ("newa", "Newa"), ("newtailue", "New_Tai_Lue"), ("nko", "Nko"), ("nkoo", "Nko"), ("nshu", "Nushu"), ("nushu", "Nushu"), ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"), ("ogam", "Ogham"), ("ogham", "Ogham"), ("olchiki", "Ol_Chiki"), ("olck", "Ol_Chiki"), ("oldhungarian", "Old_Hungarian"), ("olditalic", "Old_Italic"), ("oldnortharabian", "Old_North_Arabian"), ("oldpermic", "Old_Permic"), ("oldpersian", "Old_Persian"), ("oldsogdian", "Old_Sogdian"), ("oldsoutharabian", "Old_South_Arabian"), ("oldturkic", "Old_Turkic"), ("olduyghur", "Old_Uyghur"), ("oriya", "Oriya"), ("orkh", "Old_Turkic"), ("orya", "Oriya"), ("osage", "Osage"), ("osge", "Osage"), ("osma", "Osmanya"), ("osmanya", "Osmanya"), ("ougr", "Old_Uyghur"), ("pahawhhmong", "Pahawh_Hmong"), ("palm", "Palmyrene"), ("palmyrene", "Palmyrene"), ("pauc", "Pau_Cin_Hau"), ("paucinhau", "Pau_Cin_Hau"), ("perm", "Old_Permic"), ("phag", "Phags_Pa"), ("phagspa", "Phags_Pa"), ("phli", "Inscriptional_Pahlavi"), ("phlp", "Psalter_Pahlavi"), ("phnx", 
"Phoenician"), ("phoenician", "Phoenician"), ("plrd", "Miao"), ("prti", "Inscriptional_Parthian"), ("psalterpahlavi", "Psalter_Pahlavi"), ("qaac", "Coptic"), ("qaai", "Inherited"), ("rejang", "Rejang"), ("rjng", "Rejang"), ("rohg", "Hanifi_Rohingya"), ("runic", "Runic"), ("runr", "Runic"), ("samaritan", "Samaritan"), ("samr", "Samaritan"), ("sarb", "Old_South_Arabian"), ("saur", "Saurashtra"), ("saurashtra", "Saurashtra"), ("sgnw", "SignWriting"), ("sharada", "Sharada"), ("shavian", "Shavian"), ("shaw", "Shavian"), ("shrd", "Sharada"), ("sidd", "Siddham"), ("siddham", "Siddham"), ("signwriting", "SignWriting"), ("sind", "Khudawadi"), ("sinh", "Sinhala"), ("sinhala", "Sinhala"), ("sogd", "Sogdian"), ("sogdian", "Sogdian"), ("sogo", "Old_Sogdian"), ("sora", "Sora_Sompeng"), ("sorasompeng", "Sora_Sompeng"), ("soyo", "Soyombo"), ("soyombo", "Soyombo"), ("sund", "Sundanese"), ("sundanese", "Sundanese"), ("sylo", "Syloti_Nagri"), ("sylotinagri", "Syloti_Nagri"), ("syrc", "Syriac"), ("syriac", "Syriac"), ("tagalog", "Tagalog"), ("tagb", "Tagbanwa"), ("tagbanwa", "Tagbanwa"), ("taile", "Tai_Le"), ("taitham", "Tai_Tham"), ("taiviet", "Tai_Viet"), ("takr", "Takri"), ("takri", "Takri"), ("tale", "Tai_Le"), ("talu", "New_Tai_Lue"), ("tamil", "Tamil"), ("taml", "Tamil"), ("tang", "Tangut"), ("tangsa", "Tangsa"), ("tangut", "Tangut"), ("tavt", "Tai_Viet"), ("telu", "Telugu"), ("telugu", "Telugu"), ("tfng", "Tifinagh"), ("tglg", "Tagalog"), ("thaa", "Thaana"), ("thaana", "Thaana"), ("thai", "Thai"), ("tibetan", "Tibetan"), ("tibt", "Tibetan"), ("tifinagh", "Tifinagh"), ("tirh", "Tirhuta"), ("tirhuta", "Tirhuta"), ("tnsa", "Tangsa"), ("toto", "Toto"), ("ugar", "Ugaritic"), ("ugaritic", "Ugaritic"), ("unknown", "Unknown"), ("vai", "Vai"), ("vaii", "Vai"), ("vith", "Vithkuqi"), ("vithkuqi", "Vithkuqi"), ("wancho", "Wancho"), ("wara", "Warang_Citi"), ("warangciti", "Warang_Citi"), ("wcho", "Wancho"), ("xpeo", "Old_Persian"), ("xsux", "Cuneiform"), ("yezi", "Yezidi"), ("yezidi", 
"Yezidi"), ("yi", "Yi"), ("yiii", "Yi"), ("zanabazarsquare", "Zanabazar_Square"), ("zanb", "Zanabazar_Square"), ("zinh", "Inherited"), ("zyyy", "Common"), ("zzzz", "Unknown"), ], ), ( "Script_Extensions", &[ ("adlam", "Adlam"), ("adlm", "Adlam"), ("aghb", "Caucasian_Albanian"), ("ahom", "Ahom"), ("anatolianhieroglyphs", "Anatolian_Hieroglyphs"), ("arab", "Arabic"), ("arabic", "Arabic"), ("armenian", "Armenian"), ("armi", "Imperial_Aramaic"), ("armn", "Armenian"), ("avestan", "Avestan"), ("avst", "Avestan"), ("bali", "Balinese"), ("balinese", "Balinese"), ("bamu", "Bamum"), ("bamum", "Bamum"), ("bass", "Bassa_Vah"), ("bassavah", "Bassa_Vah"), ("batak", "Batak"), ("batk", "Batak"), ("beng", "Bengali"), ("bengali", "Bengali"), ("bhaiksuki", "Bhaiksuki"), ("bhks", "Bhaiksuki"), ("bopo", "Bopomofo"), ("bopomofo", "Bopomofo"), ("brah", "Brahmi"), ("brahmi", "Brahmi"), ("brai", "Braille"), ("braille", "Braille"), ("bugi", "Buginese"), ("buginese", "Buginese"), ("buhd", "Buhid"), ("buhid", "Buhid"), ("cakm", "Chakma"), ("canadianaboriginal", "Canadian_Aboriginal"), ("cans", "Canadian_Aboriginal"), ("cari", "Carian"), ("carian", "Carian"), ("caucasianalbanian", "Caucasian_Albanian"), ("chakma", "Chakma"), ("cham", "Cham"), ("cher", "Cherokee"), ("cherokee", "Cherokee"), ("chorasmian", "Chorasmian"), ("chrs", "Chorasmian"), ("common", "Common"), ("copt", "Coptic"), ("coptic", "Coptic"), ("cpmn", "Cypro_Minoan"), ("cprt", "Cypriot"), ("cuneiform", "Cuneiform"), ("cypriot", "Cypriot"), ("cyprominoan", "Cypro_Minoan"), ("cyrillic", "Cyrillic"), ("cyrl", "Cyrillic"), ("deseret", "Deseret"), ("deva", "Devanagari"), ("devanagari", "Devanagari"), ("diak", "Dives_Akuru"), ("divesakuru", "Dives_Akuru"), ("dogr", "Dogra"), ("dogra", "Dogra"), ("dsrt", "Deseret"), ("dupl", "Duployan"), ("duployan", "Duployan"), ("egyp", "Egyptian_Hieroglyphs"), ("egyptianhieroglyphs", "Egyptian_Hieroglyphs"), ("elba", "Elbasan"), ("elbasan", "Elbasan"), ("elym", "Elymaic"), ("elymaic", "Elymaic"), 
("ethi", "Ethiopic"), ("ethiopic", "Ethiopic"), ("geor", "Georgian"), ("georgian", "Georgian"), ("glag", "Glagolitic"), ("glagolitic", "Glagolitic"), ("gong", "Gunjala_Gondi"), ("gonm", "Masaram_Gondi"), ("goth", "Gothic"), ("gothic", "Gothic"), ("gran", "Grantha"), ("grantha", "Grantha"), ("greek", "Greek"), ("grek", "Greek"), ("gujarati", "Gujarati"), ("gujr", "Gujarati"), ("gunjalagondi", "Gunjala_Gondi"), ("gurmukhi", "Gurmukhi"), ("guru", "Gurmukhi"), ("han", "Han"), ("hang", "Hangul"), ("hangul", "Hangul"), ("hani", "Han"), ("hanifirohingya", "Hanifi_Rohingya"), ("hano", "Hanunoo"), ("hanunoo", "Hanunoo"), ("hatr", "Hatran"), ("hatran", "Hatran"), ("hebr", "Hebrew"), ("hebrew", "Hebrew"), ("hira", "Hiragana"), ("hiragana", "Hiragana"), ("hluw", "Anatolian_Hieroglyphs"), ("hmng", "Pahawh_Hmong"), ("hmnp", "Nyiakeng_Puachue_Hmong"), ("hrkt", "Katakana_Or_Hiragana"), ("hung", "Old_Hungarian"), ("imperialaramaic", "Imperial_Aramaic"), ("inherited", "Inherited"), ("inscriptionalpahlavi", "Inscriptional_Pahlavi"), ("inscriptionalparthian", "Inscriptional_Parthian"), ("ital", "Old_Italic"), ("java", "Javanese"), ("javanese", "Javanese"), ("kaithi", "Kaithi"), ("kali", "Kayah_Li"), ("kana", "Katakana"), ("kannada", "Kannada"), ("katakana", "Katakana"), ("katakanaorhiragana", "Katakana_Or_Hiragana"), ("kawi", "Kawi"), ("kayahli", "Kayah_Li"), ("khar", "Kharoshthi"), ("kharoshthi", "Kharoshthi"), ("khitansmallscript", "Khitan_Small_Script"), ("khmer", "Khmer"), ("khmr", "Khmer"), ("khoj", "Khojki"), ("khojki", "Khojki"), ("khudawadi", "Khudawadi"), ("kits", "Khitan_Small_Script"), ("knda", "Kannada"), ("kthi", "Kaithi"), ("lana", "Tai_Tham"), ("lao", "Lao"), ("laoo", "Lao"), ("latin", "Latin"), ("latn", "Latin"), ("lepc", "Lepcha"), ("lepcha", "Lepcha"), ("limb", "Limbu"), ("limbu", "Limbu"), ("lina", "Linear_A"), ("linb", "Linear_B"), ("lineara", "Linear_A"), ("linearb", "Linear_B"), ("lisu", "Lisu"), ("lyci", "Lycian"), ("lycian", "Lycian"), ("lydi", "Lydian"), 
("lydian", "Lydian"), ("mahajani", "Mahajani"), ("mahj", "Mahajani"), ("maka", "Makasar"), ("makasar", "Makasar"), ("malayalam", "Malayalam"), ("mand", "Mandaic"), ("mandaic", "Mandaic"), ("mani", "Manichaean"), ("manichaean", "Manichaean"), ("marc", "Marchen"), ("marchen", "Marchen"), ("masaramgondi", "Masaram_Gondi"), ("medefaidrin", "Medefaidrin"), ("medf", "Medefaidrin"), ("meeteimayek", "Meetei_Mayek"), ("mend", "Mende_Kikakui"), ("mendekikakui", "Mende_Kikakui"), ("merc", "Meroitic_Cursive"), ("mero", "Meroitic_Hieroglyphs"), ("meroiticcursive", "Meroitic_Cursive"), ("meroitichieroglyphs", "Meroitic_Hieroglyphs"), ("miao", "Miao"), ("mlym", "Malayalam"), ("modi", "Modi"), ("mong", "Mongolian"), ("mongolian", "Mongolian"), ("mro", "Mro"), ("mroo", "Mro"), ("mtei", "Meetei_Mayek"), ("mult", "Multani"), ("multani", "Multani"), ("myanmar", "Myanmar"), ("mymr", "Myanmar"), ("nabataean", "Nabataean"), ("nagm", "Nag_Mundari"), ("nagmundari", "Nag_Mundari"), ("nand", "Nandinagari"), ("nandinagari", "Nandinagari"), ("narb", "Old_North_Arabian"), ("nbat", "Nabataean"), ("newa", "Newa"), ("newtailue", "New_Tai_Lue"), ("nko", "Nko"), ("nkoo", "Nko"), ("nshu", "Nushu"), ("nushu", "Nushu"), ("nyiakengpuachuehmong", "Nyiakeng_Puachue_Hmong"), ("ogam", "Ogham"), ("ogham", "Ogham"), ("olchiki", "Ol_Chiki"), ("olck", "Ol_Chiki"), ("oldhungarian", "Old_Hungarian"), ("olditalic", "Old_Italic"), ("oldnortharabian", "Old_North_Arabian"), ("oldpermic", "Old_Permic"), ("oldpersian", "Old_Persian"), ("oldsogdian", "Old_Sogdian"), ("oldsoutharabian", "Old_South_Arabian"), ("oldturkic", "Old_Turkic"), ("olduyghur", "Old_Uyghur"), ("oriya", "Oriya"), ("orkh", "Old_Turkic"), ("orya", "Oriya"), ("osage", "Osage"), ("osge", "Osage"), ("osma", "Osmanya"), ("osmanya", "Osmanya"), ("ougr", "Old_Uyghur"), ("pahawhhmong", "Pahawh_Hmong"), ("palm", "Palmyrene"), ("palmyrene", "Palmyrene"), ("pauc", "Pau_Cin_Hau"), ("paucinhau", "Pau_Cin_Hau"), ("perm", "Old_Permic"), ("phag", "Phags_Pa"), 
("phagspa", "Phags_Pa"), ("phli", "Inscriptional_Pahlavi"), ("phlp", "Psalter_Pahlavi"), ("phnx", "Phoenician"), ("phoenician", "Phoenician"), ("plrd", "Miao"), ("prti", "Inscriptional_Parthian"), ("psalterpahlavi", "Psalter_Pahlavi"), ("qaac", "Coptic"), ("qaai", "Inherited"), ("rejang", "Rejang"), ("rjng", "Rejang"), ("rohg", "Hanifi_Rohingya"), ("runic", "Runic"), ("runr", "Runic"), ("samaritan", "Samaritan"), ("samr", "Samaritan"), ("sarb", "Old_South_Arabian"), ("saur", "Saurashtra"), ("saurashtra", "Saurashtra"), ("sgnw", "SignWriting"), ("sharada", "Sharada"), ("shavian", "Shavian"), ("shaw", "Shavian"), ("shrd", "Sharada"), ("sidd", "Siddham"), ("siddham", "Siddham"), ("signwriting", "SignWriting"), ("sind", "Khudawadi"), ("sinh", "Sinhala"), ("sinhala", "Sinhala"), ("sogd", "Sogdian"), ("sogdian", "Sogdian"), ("sogo", "Old_Sogdian"), ("sora", "Sora_Sompeng"), ("sorasompeng", "Sora_Sompeng"), ("soyo", "Soyombo"), ("soyombo", "Soyombo"), ("sund", "Sundanese"), ("sundanese", "Sundanese"), ("sylo", "Syloti_Nagri"), ("sylotinagri", "Syloti_Nagri"), ("syrc", "Syriac"), ("syriac", "Syriac"), ("tagalog", "Tagalog"), ("tagb", "Tagbanwa"), ("tagbanwa", "Tagbanwa"), ("taile", "Tai_Le"), ("taitham", "Tai_Tham"), ("taiviet", "Tai_Viet"), ("takr", "Takri"), ("takri", "Takri"), ("tale", "Tai_Le"), ("talu", "New_Tai_Lue"), ("tamil", "Tamil"), ("taml", "Tamil"), ("tang", "Tangut"), ("tangsa", "Tangsa"), ("tangut", "Tangut"), ("tavt", "Tai_Viet"), ("telu", "Telugu"), ("telugu", "Telugu"), ("tfng", "Tifinagh"), ("tglg", "Tagalog"), ("thaa", "Thaana"), ("thaana", "Thaana"), ("thai", "Thai"), ("tibetan", "Tibetan"), ("tibt", "Tibetan"), ("tifinagh", "Tifinagh"), ("tirh", "Tirhuta"), ("tirhuta", "Tirhuta"), ("tnsa", "Tangsa"), ("toto", "Toto"), ("ugar", "Ugaritic"), ("ugaritic", "Ugaritic"), ("unknown", "Unknown"), ("vai", "Vai"), ("vaii", "Vai"), ("vith", "Vithkuqi"), ("vithkuqi", "Vithkuqi"), ("wancho", "Wancho"), ("wara", "Warang_Citi"), ("warangciti", "Warang_Citi"), 
("wcho", "Wancho"), ("xpeo", "Old_Persian"), ("xsux", "Cuneiform"), ("yezi", "Yezidi"), ("yezidi", "Yezidi"), ("yi", "Yi"), ("yiii", "Yi"), ("zanabazarsquare", "Zanabazar_Square"), ("zanb", "Zanabazar_Square"), ("zinh", "Inherited"), ("zyyy", "Common"), ("zzzz", "Unknown"), ], ), ( "Sentence_Break", &[ ("at", "ATerm"), ("aterm", "ATerm"), ("cl", "Close"), ("close", "Close"), ("cr", "CR"), ("ex", "Extend"), ("extend", "Extend"), ("fo", "Format"), ("format", "Format"), ("le", "OLetter"), ("lf", "LF"), ("lo", "Lower"), ("lower", "Lower"), ("nu", "Numeric"), ("numeric", "Numeric"), ("oletter", "OLetter"), ("other", "Other"), ("sc", "SContinue"), ("scontinue", "SContinue"), ("se", "Sep"), ("sep", "Sep"), ("sp", "Sp"), ("st", "STerm"), ("sterm", "STerm"), ("up", "Upper"), ("upper", "Upper"), ("xx", "Other"), ], ), ( "Word_Break", &[ ("aletter", "ALetter"), ("cr", "CR"), ("doublequote", "Double_Quote"), ("dq", "Double_Quote"), ("eb", "E_Base"), ("ebase", "E_Base"), ("ebasegaz", "E_Base_GAZ"), ("ebg", "E_Base_GAZ"), ("em", "E_Modifier"), ("emodifier", "E_Modifier"), ("ex", "ExtendNumLet"), ("extend", "Extend"), ("extendnumlet", "ExtendNumLet"), ("fo", "Format"), ("format", "Format"), ("gaz", "Glue_After_Zwj"), ("glueafterzwj", "Glue_After_Zwj"), ("hebrewletter", "Hebrew_Letter"), ("hl", "Hebrew_Letter"), ("ka", "Katakana"), ("katakana", "Katakana"), ("le", "ALetter"), ("lf", "LF"), ("mb", "MidNumLet"), ("midletter", "MidLetter"), ("midnum", "MidNum"), ("midnumlet", "MidNumLet"), ("ml", "MidLetter"), ("mn", "MidNum"), ("newline", "Newline"), ("nl", "Newline"), ("nu", "Numeric"), ("numeric", "Numeric"), ("other", "Other"), ("regionalindicator", "Regional_Indicator"), ("ri", "Regional_Indicator"), ("singlequote", "Single_Quote"), ("sq", "Single_Quote"), ("wsegspace", "WSegSpace"), ("xx", "Other"), ("zwj", "ZWJ"), ], ), ]; regex-syntax-0.8.2/src/unicode_tables/script.rs000064400000000000000000001013571046102023000177210ustar 00000000000000// DO NOT EDIT THIS FILE. 
IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate script ucd-15.0.0 --chars // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ ("Adlam", ADLAM), ("Ahom", AHOM), ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), ("Arabic", ARABIC), ("Armenian", ARMENIAN), ("Avestan", AVESTAN), ("Balinese", BALINESE), ("Bamum", BAMUM), ("Bassa_Vah", BASSA_VAH), ("Batak", BATAK), ("Bengali", BENGALI), ("Bhaiksuki", BHAIKSUKI), ("Bopomofo", BOPOMOFO), ("Brahmi", BRAHMI), ("Braille", BRAILLE), ("Buginese", BUGINESE), ("Buhid", BUHID), ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), ("Carian", CARIAN), ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), ("Chakma", CHAKMA), ("Cham", CHAM), ("Cherokee", CHEROKEE), ("Chorasmian", CHORASMIAN), ("Common", COMMON), ("Coptic", COPTIC), ("Cuneiform", CUNEIFORM), ("Cypriot", CYPRIOT), ("Cypro_Minoan", CYPRO_MINOAN), ("Cyrillic", CYRILLIC), ("Deseret", DESERET), ("Devanagari", DEVANAGARI), ("Dives_Akuru", DIVES_AKURU), ("Dogra", DOGRA), ("Duployan", DUPLOYAN), ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), ("Elbasan", ELBASAN), ("Elymaic", ELYMAIC), ("Ethiopic", ETHIOPIC), ("Georgian", GEORGIAN), ("Glagolitic", GLAGOLITIC), ("Gothic", GOTHIC), ("Grantha", GRANTHA), ("Greek", GREEK), ("Gujarati", GUJARATI), ("Gunjala_Gondi", GUNJALA_GONDI), ("Gurmukhi", GURMUKHI), ("Han", HAN), ("Hangul", HANGUL), ("Hanifi_Rohingya", HANIFI_ROHINGYA), ("Hanunoo", HANUNOO), ("Hatran", HATRAN), ("Hebrew", HEBREW), ("Hiragana", HIRAGANA), ("Imperial_Aramaic", IMPERIAL_ARAMAIC), ("Inherited", INHERITED), ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), ("Javanese", JAVANESE), ("Kaithi", KAITHI), ("Kannada", KANNADA), ("Katakana", KATAKANA), ("Kawi", KAWI), ("Kayah_Li", KAYAH_LI), ("Kharoshthi", KHAROSHTHI), ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), ("Khmer", KHMER), ("Khojki", KHOJKI), ("Khudawadi", KHUDAWADI), 
("Lao", LAO), ("Latin", LATIN), ("Lepcha", LEPCHA), ("Limbu", LIMBU), ("Linear_A", LINEAR_A), ("Linear_B", LINEAR_B), ("Lisu", LISU), ("Lycian", LYCIAN), ("Lydian", LYDIAN), ("Mahajani", MAHAJANI), ("Makasar", MAKASAR), ("Malayalam", MALAYALAM), ("Mandaic", MANDAIC), ("Manichaean", MANICHAEAN), ("Marchen", MARCHEN), ("Masaram_Gondi", MASARAM_GONDI), ("Medefaidrin", MEDEFAIDRIN), ("Meetei_Mayek", MEETEI_MAYEK), ("Mende_Kikakui", MENDE_KIKAKUI), ("Meroitic_Cursive", MEROITIC_CURSIVE), ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), ("Miao", MIAO), ("Modi", MODI), ("Mongolian", MONGOLIAN), ("Mro", MRO), ("Multani", MULTANI), ("Myanmar", MYANMAR), ("Nabataean", NABATAEAN), ("Nag_Mundari", NAG_MUNDARI), ("Nandinagari", NANDINAGARI), ("New_Tai_Lue", NEW_TAI_LUE), ("Newa", NEWA), ("Nko", NKO), ("Nushu", NUSHU), ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), ("Ogham", OGHAM), ("Ol_Chiki", OL_CHIKI), ("Old_Hungarian", OLD_HUNGARIAN), ("Old_Italic", OLD_ITALIC), ("Old_North_Arabian", OLD_NORTH_ARABIAN), ("Old_Permic", OLD_PERMIC), ("Old_Persian", OLD_PERSIAN), ("Old_Sogdian", OLD_SOGDIAN), ("Old_South_Arabian", OLD_SOUTH_ARABIAN), ("Old_Turkic", OLD_TURKIC), ("Old_Uyghur", OLD_UYGHUR), ("Oriya", ORIYA), ("Osage", OSAGE), ("Osmanya", OSMANYA), ("Pahawh_Hmong", PAHAWH_HMONG), ("Palmyrene", PALMYRENE), ("Pau_Cin_Hau", PAU_CIN_HAU), ("Phags_Pa", PHAGS_PA), ("Phoenician", PHOENICIAN), ("Psalter_Pahlavi", PSALTER_PAHLAVI), ("Rejang", REJANG), ("Runic", RUNIC), ("Samaritan", SAMARITAN), ("Saurashtra", SAURASHTRA), ("Sharada", SHARADA), ("Shavian", SHAVIAN), ("Siddham", SIDDHAM), ("SignWriting", SIGNWRITING), ("Sinhala", SINHALA), ("Sogdian", SOGDIAN), ("Sora_Sompeng", SORA_SOMPENG), ("Soyombo", SOYOMBO), ("Sundanese", SUNDANESE), ("Syloti_Nagri", SYLOTI_NAGRI), ("Syriac", SYRIAC), ("Tagalog", TAGALOG), ("Tagbanwa", TAGBANWA), ("Tai_Le", TAI_LE), ("Tai_Tham", TAI_THAM), ("Tai_Viet", TAI_VIET), ("Takri", TAKRI), ("Tamil", TAMIL), ("Tangsa", TANGSA), ("Tangut", TANGUT), 
("Telugu", TELUGU), ("Thaana", THAANA), ("Thai", THAI), ("Tibetan", TIBETAN), ("Tifinagh", TIFINAGH), ("Tirhuta", TIRHUTA), ("Toto", TOTO), ("Ugaritic", UGARITIC), ("Vai", VAI), ("Vithkuqi", VITHKUQI), ("Wancho", WANCHO), ("Warang_Citi", WARANG_CITI), ("Yezidi", YEZIDI), ("Yi", YI), ("Zanabazar_Square", ZANABAZAR_SQUARE), ]; pub const ADLAM: &'static [(char, char)] = &[('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')]; pub const AHOM: &'static [(char, char)] = &[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑝆')]; pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𔐀', '𔙆')]; pub const ARABIC: &'static [(char, char)] = &[ ('\u{600}', '\u{604}'), ('؆', '؋'), ('؍', '\u{61a}'), ('\u{61c}', '؞'), ('ؠ', 'ؿ'), ('ف', 'ي'), ('\u{656}', 'ٯ'), ('ٱ', '\u{6dc}'), ('۞', 'ۿ'), ('ݐ', 'ݿ'), ('ࡰ', 'ࢎ'), ('\u{890}', '\u{891}'), ('\u{898}', '\u{8e1}'), ('\u{8e3}', '\u{8ff}'), ('ﭐ', '﯂'), ('ﯓ', 'ﴽ'), ('﵀', 'ﶏ'), ('ﶒ', 'ﷇ'), ('﷏', '﷏'), ('ﷰ', '﷿'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('𐹠', '𐹾'), ('\u{10efd}', '\u{10eff}'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), ]; pub const ARMENIAN: &'static [(char, char)] = &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')]; pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭌ'), ('᭐', '᭾')]; pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')]; pub const BASSA_VAH: &'static [(char, char)] = &[('𖫐', '𖫭'), ('\u{16af0}', '𖫵')]; pub const BATAK: &'static [(char, char)] = &[('ᯀ', '᯳'), ('᯼', '᯿')]; pub const BENGALI: &'static [(char, char)] = &[ ('ঀ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 
'ল'), ('শ', 'হ'), ('\u{9bc}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('০', '\u{9fe}'), ]; pub const BHAIKSUKI: &'static [(char, char)] = &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')]; pub const BOPOMOFO: &'static [(char, char)] = &[('˪', '˫'), ('ㄅ', 'ㄯ'), ('ㆠ', 'ㆿ')]; pub const BRAHMI: &'static [(char, char)] = &[('𑀀', '𑁍'), ('𑁒', '𑁵'), ('\u{1107f}', '\u{1107f}')]; pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')]; pub const BUGINESE: &'static [(char, char)] = &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟')]; pub const BUHID: &'static [(char, char)] = &[('ᝀ', '\u{1753}')]; pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), ('𑪰', '𑪿')]; pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')]; pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = &[('𐔰', '𐕣'), ('𐕯', '𐕯')]; pub const CHAKMA: &'static [(char, char)] = &[('\u{11100}', '\u{11134}'), ('𑄶', '𑅇')]; pub const CHAM: &'static [(char, char)] = &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')]; pub const CHEROKEE: &'static [(char, char)] = &[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')]; pub const CHORASMIAN: &'static [(char, char)] = &[('𐾰', '𐿋')]; pub const COMMON: &'static [(char, char)] = &[ ('\0', '@'), ('[', '`'), ('{', '©'), ('«', '¹'), ('»', '¿'), ('×', '×'), ('÷', '÷'), ('ʹ', '˟'), ('˥', '˩'), ('ˬ', '˿'), ('ʹ', 'ʹ'), (';', ';'), ('΅', '΅'), ('·', '·'), ('\u{605}', '\u{605}'), ('،', '،'), ('؛', '؛'), ('؟', '؟'), ('ـ', 'ـ'), ('\u{6dd}', '\u{6dd}'), ('\u{8e2}', '\u{8e2}'), ('।', '॥'), ('฿', '฿'), ('࿕', '࿘'), ('჻', '჻'), ('᛫', '᛭'), ('᜵', '᜶'), ('᠂', '᠃'), ('᠅', '᠅'), ('᳓', '᳓'), ('᳡', '᳡'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', '᳷'), ('ᳺ', 'ᳺ'), ('\u{2000}', '\u{200b}'), ('\u{200e}', '\u{2064}'), ('\u{2066}', '⁰'), ('⁴', '⁾'), ('₀', '₎'), ('₠', '⃀'), ('℀', '℥'), ('℧', '℩'), ('ℬ', 'ℱ'), ('ℳ', '⅍'), ('⅏', '⅟'), ('↉', '↋'), ('←', '␦'), ('⑀', '⑊'), ('①', '⟿'), ('⤀', '⭳'), ('⭶', '⮕'), ('⮗', '⯿'), ('⸀', '⹝'), ('⿰', '⿻'), 
('\u{3000}', '〄'), ('〆', '〆'), ('〈', '〠'), ('〰', '〷'), ('〼', '〿'), ('゛', '゜'), ('゠', '゠'), ('・', 'ー'), ('㆐', '㆟'), ('㇀', '㇣'), ('㈠', '㉟'), ('㉿', '㋏'), ('㋿', '㋿'), ('㍘', '㏿'), ('䷀', '䷿'), ('꜀', '꜡'), ('ꞈ', '꞊'), ('꠰', '꠹'), ('꤮', '꤮'), ('ꧏ', 'ꧏ'), ('꭛', '꭛'), ('꭪', '꭫'), ('﴾', '﴿'), ('︐', '︙'), ('︰', '﹒'), ('﹔', '﹦'), ('﹨', '﹫'), ('\u{feff}', '\u{feff}'), ('!', '@'), ('[', '`'), ('{', '・'), ('ー', 'ー'), ('\u{ff9e}', '\u{ff9f}'), ('¢', '₩'), ('│', '○'), ('\u{fff9}', '�'), ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', '𐄿'), ('𐆐', '𐆜'), ('𐇐', '𐇼'), ('𐋡', '𐋻'), ('\u{1bca0}', '\u{1bca3}'), ('𜽐', '𜿃'), ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅦'), ('𝅪', '\u{1d17a}'), ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇪'), ('𝋀', '𝋓'), ('𝋠', '𝋳'), ('𝌀', '𝍖'), ('𝍠', '𝍸'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝟿'), ('𞱱', '𞲴'), ('𞴁', '𞴽'), ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), ('🃑', '🃵'), ('🄀', '🆭'), ('🇦', '🇿'), ('🈁', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉐', '🉑'), ('🉠', '🉥'), ('🌀', '🛗'), ('🛜', '🛬'), ('🛰', '🛼'), ('🜀', '🝶'), ('🝻', '🟙'), ('🟠', '🟫'), ('🟰', '🟰'), ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🢰', '🢱'), ('🤀', '🩓'), ('🩠', '🩭'), ('🩰', '🩼'), ('🪀', '🪈'), ('🪐', '🪽'), ('🪿', '🫅'), ('🫎', '🫛'), ('🫠', '🫨'), ('🫰', '🫸'), ('🬀', '🮒'), ('🮔', '🯊'), ('🯰', '🯹'), ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), ]; pub const COPTIC: &'static [(char, char)] = &[('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿')]; pub const CUNEIFORM: &'static [(char, char)] = &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')]; pub const CYPRIOT: &'static [(char, char)] = &[('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿')]; pub const CYPRO_MINOAN: &'static [(char, char)] = &[('𒾐', '𒿲')]; pub const CYRILLIC: &'static [(char, char)] = &[ ('Ѐ', '\u{484}'), ('\u{487}', 'ԯ'), ('ᲀ', 'ᲈ'), ('ᴫ', 'ᴫ'), ('ᵸ', 
'ᵸ'), ('\u{2de0}', '\u{2dff}'), ('Ꙁ', '\u{a69f}'), ('\u{fe2e}', '\u{fe2f}'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ]; pub const DESERET: &'static [(char, char)] = &[('𐐀', '𐑏')]; pub const DEVANAGARI: &'static [(char, char)] = &[ ('\u{900}', 'ॐ'), ('\u{955}', '\u{963}'), ('०', 'ॿ'), ('\u{a8e0}', '\u{a8ff}'), ('𑬀', '𑬉'), ]; pub const DIVES_AKURU: &'static [(char, char)] = &[ ('𑤀', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '𑥆'), ('𑥐', '𑥙'), ]; pub const DOGRA: &'static [(char, char)] = &[('𑠀', '𑠻')]; pub const DUPLOYAN: &'static [(char, char)] = &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '𛲟')]; pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𓀀', '\u{13455}')]; pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')]; pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')]; pub const ETHIOPIC: &'static [(char, char)] = &[ ('ሀ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('\u{135d}', '፼'), ('ᎀ', '᎙'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ]; pub const GEORGIAN: &'static [(char, char)] = &[ ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ჿ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ]; pub const GLAGOLITIC: &'static [(char, char)] = &[ ('Ⰰ', 'ⱟ'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ]; pub const GOTHIC: &'static [(char, char)] = &[('𐌰', '𐍊')]; pub const GRANTHA: &'static [(char, char)] = &[ ('\u{11300}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('\u{1133c}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('\u{11357}', 
'\u{11357}'), ('𑍝', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ]; pub const GREEK: &'static [(char, char)] = &[ ('Ͱ', 'ͳ'), ('͵', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('΄', '΄'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϡ'), ('ϰ', 'Ͽ'), ('ᴦ', 'ᴪ'), ('ᵝ', 'ᵡ'), ('ᵦ', 'ᵪ'), ('ᶿ', 'ᶿ'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), ('ῲ', 'ῴ'), ('ῶ', '῾'), ('Ω', 'Ω'), ('ꭥ', 'ꭥ'), ('𐅀', '𐆎'), ('𐆠', '𐆠'), ('𝈀', '𝉅'), ]; pub const GUJARATI: &'static [(char, char)] = &[ ('\u{a81}', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('ૐ', 'ૐ'), ('ૠ', '\u{ae3}'), ('૦', '૱'), ('ૹ', '\u{aff}'), ]; pub const GUNJALA_GONDI: &'static [(char, char)] = &[ ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶘'), ('𑶠', '𑶩'), ]; pub const GURMUKHI: &'static [(char, char)] = &[ ('\u{a01}', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', '੶'), ]; pub const HAN: &'static [(char, char)] = &[ ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('々', '々'), ('〇', '〇'), ('〡', '〩'), ('〸', '〻'), ('㐀', '䶿'), ('一', '鿿'), ('豈', '舘'), ('並', '龎'), ('𖿢', '𖿣'), ('𖿰', '𖿱'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const HANGUL: &'static [(char, char)] = &[ ('ᄀ', 'ᇿ'), ('\u{302e}', '\u{302f}'), ('ㄱ', 'ㆎ'), ('㈀', '㈞'), ('㉠', '㉾'), ('ꥠ', 'ꥼ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ]; pub const HANIFI_ROHINGYA: &'static [(char, char)] = &[('𐴀', '\u{10d27}'), ('𐴰', '𐴹')]; pub const HANUNOO: &'static [(char, char)] = &[('ᜠ', '᜴')]; pub const HATRAN: &'static [(char, 
char)] = &[('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿')]; pub const HEBREW: &'static [(char, char)] = &[ ('\u{591}', '\u{5c7}'), ('א', 'ת'), ('ׯ', '״'), ('יִ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﭏ'), ]; pub const HIRAGANA: &'static [(char, char)] = &[ ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('𛀁', '𛄟'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('🈀', '🈀'), ]; pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = &[('𐡀', '𐡕'), ('𐡗', '𐡟')]; pub const INHERITED: &'static [(char, char)] = &[ ('\u{300}', '\u{36f}'), ('\u{485}', '\u{486}'), ('\u{64b}', '\u{655}'), ('\u{670}', '\u{670}'), ('\u{951}', '\u{954}'), ('\u{1ab0}', '\u{1ace}'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1ce0}'), ('\u{1ce2}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('\u{1cf8}', '\u{1cf9}'), ('\u{1dc0}', '\u{1dff}'), ('\u{200c}', '\u{200d}'), ('\u{20d0}', '\u{20f0}'), ('\u{302a}', '\u{302d}'), ('\u{3099}', '\u{309a}'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2d}'), ('\u{101fd}', '\u{101fd}'), ('\u{102e0}', '\u{102e0}'), ('\u{1133b}', '\u{1133b}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d167}', '\u{1d169}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = &[('𐭠', '𐭲'), ('𐭸', '𐭿')]; pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = &[('𐭀', '𐭕'), ('𐭘', '𐭟')]; pub const JAVANESE: &'static [(char, char)] = &[('\u{a980}', '꧍'), ('꧐', '꧙'), ('꧞', '꧟')]; pub const KAITHI: &'static [(char, char)] = &[('\u{11080}', '\u{110c2}'), ('\u{110cd}', '\u{110cd}')]; pub const KANNADA: &'static [(char, char)] = &[ ('ಀ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('\u{cbc}', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('ೝ', 'ೞ'), ('ೠ', '\u{ce3}'), ('೦', '೯'), ('ೱ', 'ೳ'), ]; pub const KATAKANA: &'static [(char, char)] = &[ ('ァ', 'ヺ'), ('ヽ', 'ヿ'), ('ㇰ', 'ㇿ'), ('㋐', '㋾'), ('㌀', '㍗'), ('ヲ', 'ッ'), ('ア', 'ン'), 
('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛀀'), ('𛄠', '𛄢'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ]; pub const KAWI: &'static [(char, char)] = &[('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙')]; pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '\u{a92d}'), ('꤯', '꤯')]; pub const KHAROSHTHI: &'static [(char, char)] = &[ ('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '𐩈'), ('𐩐', '𐩘'), ]; pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] = &[('\u{16fe4}', '\u{16fe4}'), ('𘬀', '𘳕')]; pub const KHMER: &'static [(char, char)] = &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', '᧿')]; pub const KHOJKI: &'static [(char, char)] = &[('𑈀', '𑈑'), ('𑈓', '\u{11241}')]; pub const KHUDAWADI: &'static [(char, char)] = &[('𑊰', '\u{112ea}'), ('𑋰', '𑋹')]; pub const LAO: &'static [(char, char)] = &[ ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ece}'), ('໐', '໙'), ('ໜ', 'ໟ'), ]; pub const LATIN: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ʸ'), ('ˠ', 'ˤ'), ('ᴀ', 'ᴥ'), ('ᴬ', 'ᵜ'), ('ᵢ', 'ᵥ'), ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶾ'), ('Ḁ', 'ỿ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⱡ', 'Ɀ'), ('Ꜣ', 'ꞇ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꟿ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭤ'), ('ꭦ', 'ꭩ'), ('ff', 'st'), ('A', 'Z'), ('a', 'z'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ]; pub const LEPCHA: &'static [(char, char)] = &[('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', 'ᱏ')]; pub const LIMBU: &'static [(char, char)] = &[ ('ᤀ', 'ᤞ'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('᥀', '᥀'), ('᥄', '᥏'), ]; pub const LINEAR_A: &'static [(char, char)] = &[('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧')]; pub const LINEAR_B: &'static [(char, char)] = &[ ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ]; pub const LISU: 
&'static [(char, char)] = &[('ꓐ', '꓿'), ('𑾰', '𑾰')]; pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')]; pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; pub const MAHAJANI: &'static [(char, char)] = &[('𑅐', '𑅶')]; pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')]; pub const MALAYALAM: &'static [(char, char)] = &[ ('\u{d00}', 'ഌ'), ('എ', 'ഐ'), ('ഒ', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', '൏'), ('ൔ', '\u{d63}'), ('൦', 'ൿ'), ]; pub const MANDAIC: &'static [(char, char)] = &[('ࡀ', '\u{85b}'), ('࡞', '࡞')]; pub const MANICHAEAN: &'static [(char, char)] = &[('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')]; pub const MARCHEN: &'static [(char, char)] = &[('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}')]; pub const MASARAM_GONDI: &'static [(char, char)] = &[ ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d47}'), ('𑵐', '𑵙'), ]; pub const MEDEFAIDRIN: &'static [(char, char)] = &[('𖹀', '𖺚')]; pub const MEETEI_MAYEK: &'static [(char, char)] = &[('ꫠ', '\u{aaf6}'), ('ꯀ', '\u{abed}'), ('꯰', '꯹')]; pub const MENDE_KIKAKUI: &'static [(char, char)] = &[('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}')]; pub const MEROITIC_CURSIVE: &'static [(char, char)] = &[('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿')]; pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')]; pub const MIAO: &'static [(char, char)] = &[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')]; pub const MODI: &'static [(char, char)] = &[('𑘀', '𑙄'), ('𑙐', '𑙙')]; pub const MONGOLIAN: &'static [(char, char)] = &[('᠀', '᠁'), ('᠄', '᠄'), ('᠆', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('𑙠', '𑙬')]; pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; pub const MULTANI: &'static [(char, char)] = &[('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')]; pub const MYANMAR: &'static [(char, char)] = &[('က', '႟'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')]; pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; pub const 
NAG_MUNDARI: &'static [(char, char)] = &[('𞓐', '𞓹')]; pub const NANDINAGARI: &'static [(char, char)] = &[('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧤')]; pub const NEW_TAI_LUE: &'static [(char, char)] = &[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')]; pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '𑑡')]; pub const NKO: &'static [(char, char)] = &[('߀', 'ߺ'), ('\u{7fd}', '߿')]; pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = &[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')]; pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')]; pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')]; pub const OLD_HUNGARIAN: &'static [(char, char)] = &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')]; pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')]; pub const OLD_PERMIC: &'static [(char, char)] = &[('𐍐', '\u{1037a}')]; pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')]; pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('𐩠', '𐩿')]; pub const OLD_TURKIC: &'static [(char, char)] = &[('𐰀', '𐱈')]; pub const OLD_UYGHUR: &'static [(char, char)] = &[('𐽰', '𐾉')]; pub const ORIYA: &'static [(char, char)] = &[ ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('\u{b3c}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', '\u{b63}'), ('୦', '୷'), ]; pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; pub const PAHAWH_HMONG: &'static [(char, char)] = &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')]; pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')]; pub const PAU_CIN_HAU: &'static [(char, char)] = &[('𑫀', '𑫸')]; 
pub const PHAGS_PA: &'static [(char, char)] = &[('ꡀ', '꡷')]; pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; pub const PSALTER_PAHLAVI: &'static [(char, char)] = &[('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')]; pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')]; pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')]; pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; pub const SAURASHTRA: &'static [(char, char)] = &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')]; pub const SHARADA: &'static [(char, char)] = &[('\u{11180}', '𑇟')]; pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')]; pub const SIDDHAM: &'static [(char, char)] = &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')]; pub const SIGNWRITING: &'static [(char, char)] = &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; pub const SINHALA: &'static [(char, char)] = &[ ('\u{d81}', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('෦', '෯'), ('ෲ', '෴'), ('𑇡', '𑇴'), ]; pub const SOGDIAN: &'static [(char, char)] = &[('𐼰', '𐽙')]; pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')]; pub const SUNDANESE: &'static [(char, char)] = &[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')]; pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('ꠀ', '\u{a82c}')]; pub const SYRIAC: &'static [(char, char)] = &[('܀', '܍'), ('\u{70f}', '\u{74a}'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ')]; pub const TAGALOG: &'static [(char, char)] = &[('ᜀ', '᜕'), ('ᜟ', 'ᜟ')]; pub const TAGBANWA: &'static [(char, char)] = &[('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')]; pub const TAI_LE: &'static [(char, char)] = &[('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')]; pub const TAI_THAM: &'static [(char, char)] = &[ ('ᨠ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '᪉'), ('᪐', '᪙'), ('᪠', '᪭'), ]; pub const TAI_VIET: &'static [(char, 
char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; pub const TAKRI: &'static [(char, char)] = &[('𑚀', '𑚹'), ('𑛀', '𑛉')]; pub const TAMIL: &'static [(char, char)] = &[ ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('ௐ', 'ௐ'), ('\u{bd7}', '\u{bd7}'), ('௦', '௺'), ('𑿀', '𑿱'), ('𑿿', '𑿿'), ]; pub const TANGSA: &'static [(char, char)] = &[('𖩰', '𖪾'), ('𖫀', '𖫉')]; pub const TANGUT: &'static [(char, char)] = &[('𖿠', '𖿠'), ('𗀀', '𘟷'), ('𘠀', '𘫿'), ('𘴀', '𘴈')]; pub const TELUGU: &'static [(char, char)] = &[ ('\u{c00}', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('\u{c3c}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', '\u{c63}'), ('౦', '౯'), ('౷', '౿'), ]; pub const THAANA: &'static [(char, char)] = &[('ހ', 'ޱ')]; pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')]; pub const TIBETAN: &'static [(char, char)] = &[ ('ༀ', 'ཇ'), ('ཉ', 'ཬ'), ('\u{f71}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('྾', '࿌'), ('࿎', '࿔'), ('࿙', '࿚'), ]; pub const TIFINAGH: &'static [(char, char)] = &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')]; pub const TIRHUTA: &'static [(char, char)] = &[('𑒀', '𑓇'), ('𑓐', '𑓙')]; pub const TOTO: &'static [(char, char)] = &[('𞊐', '\u{1e2ae}')]; pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')]; pub const VITHKUQI: &'static [(char, char)] = &[ ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ]; pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')]; pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; pub const YEZIDI: &'static [(char, char)] = &[('𐺀', '𐺩'), ('\u{10eab}', '𐺭'), ('𐺰', '𐺱')]; pub const YI: &'static [(char, char)] = &[('ꀀ', 'ꒌ'), ('꒐', '꓆')]; pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('𑨀', 
'\u{11a47}')]; regex-syntax-0.8.2/src/unicode_tables/script_extension.rs000064400000000000000000001112001046102023000220010ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate script-extension ucd-15.0.0 --chars // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ ("Adlam", ADLAM), ("Ahom", AHOM), ("Anatolian_Hieroglyphs", ANATOLIAN_HIEROGLYPHS), ("Arabic", ARABIC), ("Armenian", ARMENIAN), ("Avestan", AVESTAN), ("Balinese", BALINESE), ("Bamum", BAMUM), ("Bassa_Vah", BASSA_VAH), ("Batak", BATAK), ("Bengali", BENGALI), ("Bhaiksuki", BHAIKSUKI), ("Bopomofo", BOPOMOFO), ("Brahmi", BRAHMI), ("Braille", BRAILLE), ("Buginese", BUGINESE), ("Buhid", BUHID), ("Canadian_Aboriginal", CANADIAN_ABORIGINAL), ("Carian", CARIAN), ("Caucasian_Albanian", CAUCASIAN_ALBANIAN), ("Chakma", CHAKMA), ("Cham", CHAM), ("Cherokee", CHEROKEE), ("Chorasmian", CHORASMIAN), ("Common", COMMON), ("Coptic", COPTIC), ("Cuneiform", CUNEIFORM), ("Cypriot", CYPRIOT), ("Cypro_Minoan", CYPRO_MINOAN), ("Cyrillic", CYRILLIC), ("Deseret", DESERET), ("Devanagari", DEVANAGARI), ("Dives_Akuru", DIVES_AKURU), ("Dogra", DOGRA), ("Duployan", DUPLOYAN), ("Egyptian_Hieroglyphs", EGYPTIAN_HIEROGLYPHS), ("Elbasan", ELBASAN), ("Elymaic", ELYMAIC), ("Ethiopic", ETHIOPIC), ("Georgian", GEORGIAN), ("Glagolitic", GLAGOLITIC), ("Gothic", GOTHIC), ("Grantha", GRANTHA), ("Greek", GREEK), ("Gujarati", GUJARATI), ("Gunjala_Gondi", GUNJALA_GONDI), ("Gurmukhi", GURMUKHI), ("Han", HAN), ("Hangul", HANGUL), ("Hanifi_Rohingya", HANIFI_ROHINGYA), ("Hanunoo", HANUNOO), ("Hatran", HATRAN), ("Hebrew", HEBREW), ("Hiragana", HIRAGANA), ("Imperial_Aramaic", IMPERIAL_ARAMAIC), ("Inherited", INHERITED), ("Inscriptional_Pahlavi", INSCRIPTIONAL_PAHLAVI), ("Inscriptional_Parthian", INSCRIPTIONAL_PARTHIAN), ("Javanese", JAVANESE), ("Kaithi", KAITHI), ("Kannada", KANNADA), ("Katakana", KATAKANA), 
("Kawi", KAWI), ("Kayah_Li", KAYAH_LI), ("Kharoshthi", KHAROSHTHI), ("Khitan_Small_Script", KHITAN_SMALL_SCRIPT), ("Khmer", KHMER), ("Khojki", KHOJKI), ("Khudawadi", KHUDAWADI), ("Lao", LAO), ("Latin", LATIN), ("Lepcha", LEPCHA), ("Limbu", LIMBU), ("Linear_A", LINEAR_A), ("Linear_B", LINEAR_B), ("Lisu", LISU), ("Lycian", LYCIAN), ("Lydian", LYDIAN), ("Mahajani", MAHAJANI), ("Makasar", MAKASAR), ("Malayalam", MALAYALAM), ("Mandaic", MANDAIC), ("Manichaean", MANICHAEAN), ("Marchen", MARCHEN), ("Masaram_Gondi", MASARAM_GONDI), ("Medefaidrin", MEDEFAIDRIN), ("Meetei_Mayek", MEETEI_MAYEK), ("Mende_Kikakui", MENDE_KIKAKUI), ("Meroitic_Cursive", MEROITIC_CURSIVE), ("Meroitic_Hieroglyphs", MEROITIC_HIEROGLYPHS), ("Miao", MIAO), ("Modi", MODI), ("Mongolian", MONGOLIAN), ("Mro", MRO), ("Multani", MULTANI), ("Myanmar", MYANMAR), ("Nabataean", NABATAEAN), ("Nag_Mundari", NAG_MUNDARI), ("Nandinagari", NANDINAGARI), ("New_Tai_Lue", NEW_TAI_LUE), ("Newa", NEWA), ("Nko", NKO), ("Nushu", NUSHU), ("Nyiakeng_Puachue_Hmong", NYIAKENG_PUACHUE_HMONG), ("Ogham", OGHAM), ("Ol_Chiki", OL_CHIKI), ("Old_Hungarian", OLD_HUNGARIAN), ("Old_Italic", OLD_ITALIC), ("Old_North_Arabian", OLD_NORTH_ARABIAN), ("Old_Permic", OLD_PERMIC), ("Old_Persian", OLD_PERSIAN), ("Old_Sogdian", OLD_SOGDIAN), ("Old_South_Arabian", OLD_SOUTH_ARABIAN), ("Old_Turkic", OLD_TURKIC), ("Old_Uyghur", OLD_UYGHUR), ("Oriya", ORIYA), ("Osage", OSAGE), ("Osmanya", OSMANYA), ("Pahawh_Hmong", PAHAWH_HMONG), ("Palmyrene", PALMYRENE), ("Pau_Cin_Hau", PAU_CIN_HAU), ("Phags_Pa", PHAGS_PA), ("Phoenician", PHOENICIAN), ("Psalter_Pahlavi", PSALTER_PAHLAVI), ("Rejang", REJANG), ("Runic", RUNIC), ("Samaritan", SAMARITAN), ("Saurashtra", SAURASHTRA), ("Sharada", SHARADA), ("Shavian", SHAVIAN), ("Siddham", SIDDHAM), ("SignWriting", SIGNWRITING), ("Sinhala", SINHALA), ("Sogdian", SOGDIAN), ("Sora_Sompeng", SORA_SOMPENG), ("Soyombo", SOYOMBO), ("Sundanese", SUNDANESE), ("Syloti_Nagri", SYLOTI_NAGRI), ("Syriac", SYRIAC), ("Tagalog", TAGALOG), 
("Tagbanwa", TAGBANWA), ("Tai_Le", TAI_LE), ("Tai_Tham", TAI_THAM), ("Tai_Viet", TAI_VIET), ("Takri", TAKRI), ("Tamil", TAMIL), ("Tangsa", TANGSA), ("Tangut", TANGUT), ("Telugu", TELUGU), ("Thaana", THAANA), ("Thai", THAI), ("Tibetan", TIBETAN), ("Tifinagh", TIFINAGH), ("Tirhuta", TIRHUTA), ("Toto", TOTO), ("Ugaritic", UGARITIC), ("Vai", VAI), ("Vithkuqi", VITHKUQI), ("Wancho", WANCHO), ("Warang_Citi", WARANG_CITI), ("Yezidi", YEZIDI), ("Yi", YI), ("Zanabazar_Square", ZANABAZAR_SQUARE), ]; pub const ADLAM: &'static [(char, char)] = &[('؟', '؟'), ('ـ', 'ـ'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞥞', '𞥟')]; pub const AHOM: &'static [(char, char)] = &[('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑝆')]; pub const ANATOLIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𔐀', '𔙆')]; pub const ARABIC: &'static [(char, char)] = &[ ('\u{600}', '\u{604}'), ('؆', '\u{6dc}'), ('۞', 'ۿ'), ('ݐ', 'ݿ'), ('ࡰ', 'ࢎ'), ('\u{890}', '\u{891}'), ('\u{898}', '\u{8e1}'), ('\u{8e3}', '\u{8ff}'), ('ﭐ', '﯂'), ('ﯓ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('﷏', '﷏'), ('ﷰ', '﷿'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('\u{102e0}', '𐋻'), ('𐹠', '𐹾'), ('\u{10efd}', '\u{10eff}'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𞻰', '𞻱'), ]; pub const ARMENIAN: &'static [(char, char)] = &[('Ա', 'Ֆ'), ('ՙ', '֊'), ('֍', '֏'), ('ﬓ', 'ﬗ')]; pub const AVESTAN: &'static [(char, char)] = &[('𐬀', '𐬵'), ('𐬹', '𐬿')]; pub const BALINESE: &'static [(char, char)] = &[('\u{1b00}', 'ᭌ'), ('᭐', '᭾')]; pub const BAMUM: &'static [(char, char)] = &[('ꚠ', '꛷'), ('𖠀', '𖨸')]; pub const BASSA_VAH: &'static [(char, char)] = &[('𖫐', '𖫭'), ('\u{16af0}', '𖫵')]; pub const BATAK: &'static [(char, char)] = &[('ᯀ', '᯳'), ('᯼', '᯿')]; pub 
const BENGALI: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('ঀ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('\u{9bc}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('০', '\u{9fe}'), ('\u{1cd0}', '\u{1cd0}'), ('\u{1cd2}', '\u{1cd2}'), ('\u{1cd5}', '\u{1cd6}'), ('\u{1cd8}', '\u{1cd8}'), ('᳡', '᳡'), ('ᳪ', 'ᳪ'), ('\u{1ced}', '\u{1ced}'), ('ᳲ', 'ᳲ'), ('ᳵ', '᳷'), ('\u{a8f1}', '\u{a8f1}'), ]; pub const BHAIKSUKI: &'static [(char, char)] = &[('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱅'), ('𑱐', '𑱬')]; pub const BOPOMOFO: &'static [(char, char)] = &[ ('˪', '˫'), ('、', '〃'), ('〈', '】'), ('〓', '〟'), ('\u{302a}', '\u{302d}'), ('〰', '〰'), ('〷', '〷'), ('・', '・'), ('ㄅ', 'ㄯ'), ('ㆠ', 'ㆿ'), ('﹅', '﹆'), ('。', '・'), ]; pub const BRAHMI: &'static [(char, char)] = &[('𑀀', '𑁍'), ('𑁒', '𑁵'), ('\u{1107f}', '\u{1107f}')]; pub const BRAILLE: &'static [(char, char)] = &[('⠀', '⣿')]; pub const BUGINESE: &'static [(char, char)] = &[('ᨀ', '\u{1a1b}'), ('᨞', '᨟'), ('ꧏ', 'ꧏ')]; pub const BUHID: &'static [(char, char)] = &[('᜵', '᜶'), ('ᝀ', '\u{1753}')]; pub const CANADIAN_ABORIGINAL: &'static [(char, char)] = &[('᐀', 'ᙿ'), ('ᢰ', 'ᣵ'), ('𑪰', '𑪿')]; pub const CARIAN: &'static [(char, char)] = &[('𐊠', '𐋐')]; pub const CAUCASIAN_ALBANIAN: &'static [(char, char)] = &[('𐔰', '𐕣'), ('𐕯', '𐕯')]; pub const CHAKMA: &'static [(char, char)] = &[('০', '৯'), ('၀', '၉'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑅇')]; pub const CHAM: &'static [(char, char)] = &[('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('꩜', '꩟')]; pub const CHEROKEE: &'static [(char, char)] = &[('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ꭰ', 'ꮿ')]; pub const CHORASMIAN: &'static [(char, char)] = &[('𐾰', '𐿋')]; pub const COMMON: &'static [(char, char)] = &[ ('\0', '@'), ('[', '`'), ('{', '©'), ('«', '¹'), ('»', '¿'), ('×', '×'), ('÷', '÷'), ('ʹ', '˟'), ('˥', '˩'), ('ˬ', '˿'), ('ʹ', 'ʹ'), (';', ';'), ('΅', '΅'), ('·', '·'), ('\u{605}', '\u{605}'), ('\u{6dd}', 
'\u{6dd}'), ('\u{8e2}', '\u{8e2}'), ('฿', '฿'), ('࿕', '࿘'), ('᛫', '᛭'), ('\u{2000}', '\u{200b}'), ('\u{200e}', '\u{202e}'), ('‰', '\u{2064}'), ('\u{2066}', '⁰'), ('⁴', '⁾'), ('₀', '₎'), ('₠', '⃀'), ('℀', '℥'), ('℧', '℩'), ('ℬ', 'ℱ'), ('ℳ', '⅍'), ('⅏', '⅟'), ('↉', '↋'), ('←', '␦'), ('⑀', '⑊'), ('①', '⟿'), ('⤀', '⭳'), ('⭶', '⮕'), ('⮗', '⯿'), ('⸀', '⹂'), ('⹄', '⹝'), ('⿰', '⿻'), ('\u{3000}', '\u{3000}'), ('〄', '〄'), ('〒', '〒'), ('〠', '〠'), ('〶', '〶'), ('㉈', '㉟'), ('㉿', '㉿'), ('㊱', '㊿'), ('㋌', '㋏'), ('㍱', '㍺'), ('㎀', '㏟'), ('㏿', '㏿'), ('䷀', '䷿'), ('꜈', '꜡'), ('ꞈ', '꞊'), ('꭛', '꭛'), ('꭪', '꭫'), ('︐', '︙'), ('︰', '﹄'), ('﹇', '﹒'), ('﹔', '﹦'), ('﹨', '﹫'), ('\u{feff}', '\u{feff}'), ('!', '@'), ('[', '`'), ('{', '⦆'), ('¢', '₩'), ('│', '○'), ('\u{fff9}', '�'), ('𐆐', '𐆜'), ('𐇐', '𐇼'), ('𜽐', '𜿃'), ('𝀀', '𝃵'), ('𝄀', '𝄦'), ('𝄩', '𝅦'), ('𝅪', '\u{1d17a}'), ('𝆃', '𝆄'), ('𝆌', '𝆩'), ('𝆮', '𝇪'), ('𝋀', '𝋓'), ('𝋠', '𝋳'), ('𝌀', '𝍖'), ('𝍲', '𝍸'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝟋'), ('𝟎', '𝟿'), ('𞱱', '𞲴'), ('𞴁', '𞴽'), ('🀀', '🀫'), ('🀰', '🂓'), ('🂠', '🂮'), ('🂱', '🂿'), ('🃁', '🃏'), ('🃑', '🃵'), ('🄀', '🆭'), ('🇦', '🇿'), ('🈁', '🈂'), ('🈐', '🈻'), ('🉀', '🉈'), ('🉠', '🉥'), ('🌀', '🛗'), ('🛜', '🛬'), ('🛰', '🛼'), ('🜀', '🝶'), ('🝻', '🟙'), ('🟠', '🟫'), ('🟰', '🟰'), ('🠀', '🠋'), ('🠐', '🡇'), ('🡐', '🡙'), ('🡠', '🢇'), ('🢐', '🢭'), ('🢰', '🢱'), ('🤀', '🩓'), ('🩠', '🩭'), ('🩰', '🩼'), ('🪀', '🪈'), ('🪐', '🪽'), ('🪿', '🫅'), ('🫎', '🫛'), ('🫠', '🫨'), ('🫰', '🫸'), ('🬀', '🮒'), ('🮔', '🯊'), ('🯰', '🯹'), ('\u{e0001}', '\u{e0001}'), ('\u{e0020}', '\u{e007f}'), ]; pub const COPTIC: &'static [(char, char)] = &[('Ϣ', 'ϯ'), ('Ⲁ', 'ⳳ'), ('⳹', '⳿'), ('\u{102e0}', '𐋻')]; pub const CUNEIFORM: &'static [(char, char)] = &[('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒑰', '𒑴'), ('𒒀', '𒕃')]; pub const CYPRIOT: &'static [(char, char)] = &[ ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', 
'𐄿'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐠿'), ]; pub const CYPRO_MINOAN: &'static [(char, char)] = &[('𐄀', '𐄁'), ('𒾐', '𒿲')]; pub const CYRILLIC: &'static [(char, char)] = &[ ('Ѐ', 'ԯ'), ('ᲀ', 'ᲈ'), ('ᴫ', 'ᴫ'), ('ᵸ', 'ᵸ'), ('\u{1df8}', '\u{1df8}'), ('\u{2de0}', '\u{2dff}'), ('⹃', '⹃'), ('Ꙁ', '\u{a69f}'), ('\u{fe2e}', '\u{fe2f}'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ]; pub const DESERET: &'static [(char, char)] = &[('𐐀', '𐑏')]; pub const DEVANAGARI: &'static [(char, char)] = &[ ('\u{900}', '\u{952}'), ('\u{955}', 'ॿ'), ('\u{1cd0}', 'ᳶ'), ('\u{1cf8}', '\u{1cf9}'), ('\u{20f0}', '\u{20f0}'), ('꠰', '꠹'), ('\u{a8e0}', '\u{a8ff}'), ('𑬀', '𑬉'), ]; pub const DIVES_AKURU: &'static [(char, char)] = &[ ('𑤀', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '𑥆'), ('𑥐', '𑥙'), ]; pub const DOGRA: &'static [(char, char)] = &[('।', '९'), ('꠰', '꠹'), ('𑠀', '𑠻')]; pub const DUPLOYAN: &'static [(char, char)] = &[('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𛲜', '\u{1bca3}')]; pub const EGYPTIAN_HIEROGLYPHS: &'static [(char, char)] = &[('𓀀', '\u{13455}')]; pub const ELBASAN: &'static [(char, char)] = &[('𐔀', '𐔧')]; pub const ELYMAIC: &'static [(char, char)] = &[('𐿠', '𐿶')]; pub const ETHIOPIC: &'static [(char, char)] = &[ ('ሀ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('\u{135d}', '፼'), ('ᎀ', '᎙'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ]; pub const GEORGIAN: &'static [(char, char)] = &[ ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჿ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ]; pub const GLAGOLITIC: &'static [(char, char)] = &[ ('\u{484}', '\u{484}'), ('\u{487}', '\u{487}'), ('Ⰰ', 
'ⱟ'), ('⹃', '⹃'), ('\u{a66f}', '\u{a66f}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ]; pub const GOTHIC: &'static [(char, char)] = &[('𐌰', '𐍊')]; pub const GRANTHA: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('௦', '௳'), ('\u{1cd0}', '\u{1cd0}'), ('\u{1cd2}', '᳓'), ('ᳲ', '\u{1cf4}'), ('\u{1cf8}', '\u{1cf9}'), ('\u{20f0}', '\u{20f0}'), ('\u{11300}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('\u{1133b}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('𑍐', '𑍐'), ('\u{11357}', '\u{11357}'), ('𑍝', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑿐', '𑿑'), ('𑿓', '𑿓'), ]; pub const GREEK: &'static [(char, char)] = &[ ('\u{342}', '\u{342}'), ('\u{345}', '\u{345}'), ('Ͱ', 'ͳ'), ('͵', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('΄', '΄'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϡ'), ('ϰ', 'Ͽ'), ('ᴦ', 'ᴪ'), ('ᵝ', 'ᵡ'), ('ᵦ', 'ᵪ'), ('ᶿ', '\u{1dc1}'), ('ἀ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ῄ'), ('ῆ', 'ΐ'), ('ῖ', 'Ί'), ('῝', '`'), ('ῲ', 'ῴ'), ('ῶ', '῾'), ('Ω', 'Ω'), ('ꭥ', 'ꭥ'), ('𐅀', '𐆎'), ('𐆠', '𐆠'), ('𝈀', '𝉅'), ]; pub const GUJARATI: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('\u{a81}', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('ૐ', 'ૐ'), ('ૠ', '\u{ae3}'), ('૦', '૱'), ('ૹ', '\u{aff}'), ('꠰', '꠹'), ]; pub const GUNJALA_GONDI: &'static [(char, char)] = &[ ('।', '॥'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶘'), ('𑶠', '𑶩'), ]; pub const GURMUKHI: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('\u{a01}', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', 
'\u{a4d}'), ('\u{a51}', '\u{a51}'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', '੶'), ('꠰', '꠹'), ]; pub const HAN: &'static [(char, char)] = &[ ('⺀', '⺙'), ('⺛', '⻳'), ('⼀', '⿕'), ('、', '〃'), ('々', '】'), ('〓', '〟'), ('〡', '\u{302d}'), ('〰', '〰'), ('〷', '〿'), ('・', '・'), ('㆐', '㆟'), ('㇀', '㇣'), ('㈠', '㉇'), ('㊀', '㊰'), ('㋀', '㋋'), ('㋿', '㋿'), ('㍘', '㍰'), ('㍻', '㍿'), ('㏠', '㏾'), ('㐀', '䶿'), ('一', '鿿'), ('꜀', '꜇'), ('豈', '舘'), ('並', '龎'), ('﹅', '﹆'), ('。', '・'), ('𖿢', '𖿣'), ('𖿰', '𖿱'), ('𝍠', '𝍱'), ('🉐', '🉑'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const HANGUL: &'static [(char, char)] = &[ ('ᄀ', 'ᇿ'), ('、', '〃'), ('〈', '】'), ('〓', '〟'), ('\u{302e}', '〰'), ('〷', '〷'), ('・', '・'), ('ㄱ', 'ㆎ'), ('㈀', '㈞'), ('㉠', '㉾'), ('ꥠ', 'ꥼ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('﹅', '﹆'), ('。', '・'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ]; pub const HANIFI_ROHINGYA: &'static [(char, char)] = &[ ('،', '،'), ('؛', '؛'), ('؟', '؟'), ('ـ', 'ـ'), ('۔', '۔'), ('𐴀', '\u{10d27}'), ('𐴰', '𐴹'), ]; pub const HANUNOO: &'static [(char, char)] = &[('ᜠ', '᜶')]; pub const HATRAN: &'static [(char, char)] = &[('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐣻', '𐣿')]; pub const HEBREW: &'static [(char, char)] = &[ ('\u{591}', '\u{5c7}'), ('א', 'ת'), ('ׯ', '״'), ('יִ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﭏ'), ]; pub const HIRAGANA: &'static [(char, char)] = &[ ('、', '〃'), ('〈', '】'), ('〓', '〟'), ('〰', '〵'), ('〷', '〷'), ('〼', '〽'), ('ぁ', 'ゖ'), ('\u{3099}', '゠'), ('・', 'ー'), ('﹅', '﹆'), ('。', '・'), ('ー', 'ー'), ('\u{ff9e}', '\u{ff9f}'), ('𛀁', '𛄟'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('🈀', '🈀'), ]; pub const IMPERIAL_ARAMAIC: &'static [(char, char)] = &[('𐡀', '𐡕'), ('𐡗', '𐡟')]; pub const INHERITED: &'static [(char, char)] = &[ ('\u{300}', '\u{341}'), ('\u{343}', '\u{344}'), ('\u{346}', '\u{362}'), ('\u{953}', '\u{954}'), ('\u{1ab0}', '\u{1ace}'), ('\u{1dc2}', '\u{1df7}'), ('\u{1df9}', '\u{1df9}'), ('\u{1dfb}', '\u{1dff}'), 
('\u{200c}', '\u{200d}'), ('\u{20d0}', '\u{20ef}'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2d}'), ('\u{101fd}', '\u{101fd}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d167}', '\u{1d169}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const INSCRIPTIONAL_PAHLAVI: &'static [(char, char)] = &[('𐭠', '𐭲'), ('𐭸', '𐭿')]; pub const INSCRIPTIONAL_PARTHIAN: &'static [(char, char)] = &[('𐭀', '𐭕'), ('𐭘', '𐭟')]; pub const JAVANESE: &'static [(char, char)] = &[('\u{a980}', '꧍'), ('ꧏ', '꧙'), ('꧞', '꧟')]; pub const KAITHI: &'static [(char, char)] = &[ ('०', '९'), ('꠰', '꠹'), ('\u{11080}', '\u{110c2}'), ('\u{110cd}', '\u{110cd}'), ]; pub const KANNADA: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('ಀ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('\u{cbc}', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('ೝ', 'ೞ'), ('ೠ', '\u{ce3}'), ('೦', '೯'), ('ೱ', 'ೳ'), ('\u{1cd0}', '\u{1cd0}'), ('\u{1cd2}', '\u{1cd2}'), ('\u{1cda}', '\u{1cda}'), ('ᳲ', 'ᳲ'), ('\u{1cf4}', '\u{1cf4}'), ('꠰', '꠵'), ]; pub const KATAKANA: &'static [(char, char)] = &[ ('、', '〃'), ('〈', '】'), ('〓', '〟'), ('〰', '〵'), ('〷', '〷'), ('〼', '〽'), ('\u{3099}', '゜'), ('゠', 'ヿ'), ('ㇰ', 'ㇿ'), ('㋐', '㋾'), ('㌀', '㍗'), ('﹅', '﹆'), ('。', '\u{ff9f}'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛀀'), ('𛄠', '𛄢'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ]; pub const KAWI: &'static [(char, char)] = &[('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '𑽙')]; pub const KAYAH_LI: &'static [(char, char)] = &[('꤀', '꤯')]; pub const KHAROSHTHI: &'static [(char, char)] = &[ ('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '𐩈'), ('𐩐', '𐩘'), ]; pub const KHITAN_SMALL_SCRIPT: &'static [(char, char)] = &[('\u{16fe4}', '\u{16fe4}'), ('𘬀', '𘳕')]; pub const KHMER: &'static [(char, char)] = &[('ក', '\u{17dd}'), ('០', '៩'), ('៰', '៹'), ('᧠', 
'᧿')]; pub const KHOJKI: &'static [(char, char)] = &[('૦', '૯'), ('꠰', '꠹'), ('𑈀', '𑈑'), ('𑈓', '\u{11241}')]; pub const KHUDAWADI: &'static [(char, char)] = &[('।', '॥'), ('꠰', '꠹'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹')]; pub const LAO: &'static [(char, char)] = &[ ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ece}'), ('໐', '໙'), ('ໜ', 'ໟ'), ]; pub const LATIN: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ʸ'), ('ˠ', 'ˤ'), ('\u{363}', '\u{36f}'), ('\u{485}', '\u{486}'), ('\u{951}', '\u{952}'), ('჻', '჻'), ('ᴀ', 'ᴥ'), ('ᴬ', 'ᵜ'), ('ᵢ', 'ᵥ'), ('ᵫ', 'ᵷ'), ('ᵹ', 'ᶾ'), ('Ḁ', 'ỿ'), ('\u{202f}', '\u{202f}'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('\u{20f0}', '\u{20f0}'), ('K', 'Å'), ('Ⅎ', 'Ⅎ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⱡ', 'Ɀ'), ('꜀', '꜇'), ('Ꜣ', 'ꞇ'), ('Ꞌ', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꟿ'), ('꤮', '꤮'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭤ'), ('ꭦ', 'ꭩ'), ('ff', 'st'), ('A', 'Z'), ('a', 'z'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ]; pub const LEPCHA: &'static [(char, char)] = &[('ᰀ', '\u{1c37}'), ('᰻', '᱉'), ('ᱍ', 'ᱏ')]; pub const LIMBU: &'static [(char, char)] = &[ ('॥', '॥'), ('ᤀ', 'ᤞ'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('᥀', '᥀'), ('᥄', '᥏'), ]; pub const LINEAR_A: &'static [(char, char)] = &[('𐄇', '𐄳'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧')]; pub const LINEAR_B: &'static [(char, char)] = &[ ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐄀', '𐄂'), ('𐄇', '𐄳'), ('𐄷', '𐄿'), ]; pub const LISU: &'static [(char, char)] = &[('ꓐ', '꓿'), ('𑾰', '𑾰')]; pub const LYCIAN: &'static [(char, char)] = &[('𐊀', '𐊜')]; pub const LYDIAN: &'static [(char, char)] = &[('𐤠', '𐤹'), ('𐤿', '𐤿')]; pub const MAHAJANI: &'static [(char, char)] = &[('।', '९'), ('꠰', '꠹'), ('𑅐', '𑅶')]; pub const MAKASAR: &'static [(char, char)] = &[('𑻠', '𑻸')]; pub const MALAYALAM: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), 
('।', '॥'), ('\u{d00}', 'ഌ'), ('എ', 'ഐ'), ('ഒ', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', '൏'), ('ൔ', '\u{d63}'), ('൦', 'ൿ'), ('\u{1cda}', '\u{1cda}'), ('꠰', '꠲'), ]; pub const MANDAIC: &'static [(char, char)] = &[('ـ', 'ـ'), ('ࡀ', '\u{85b}'), ('࡞', '࡞')]; pub const MANICHAEAN: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐫀', '\u{10ae6}'), ('𐫫', '𐫶')]; pub const MARCHEN: &'static [(char, char)] = &[('𑱰', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}')]; pub const MASARAM_GONDI: &'static [(char, char)] = &[ ('।', '॥'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d47}'), ('𑵐', '𑵙'), ]; pub const MEDEFAIDRIN: &'static [(char, char)] = &[('𖹀', '𖺚')]; pub const MEETEI_MAYEK: &'static [(char, char)] = &[('ꫠ', '\u{aaf6}'), ('ꯀ', '\u{abed}'), ('꯰', '꯹')]; pub const MENDE_KIKAKUI: &'static [(char, char)] = &[('𞠀', '𞣄'), ('𞣇', '\u{1e8d6}')]; pub const MEROITIC_CURSIVE: &'static [(char, char)] = &[('𐦠', '𐦷'), ('𐦼', '𐧏'), ('𐧒', '𐧿')]; pub const MEROITIC_HIEROGLYPHS: &'static [(char, char)] = &[('𐦀', '𐦟')]; pub const MIAO: &'static [(char, char)] = &[('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟')]; pub const MODI: &'static [(char, char)] = &[('꠰', '꠹'), ('𑘀', '𑙄'), ('𑙐', '𑙙')]; pub const MONGOLIAN: &'static [(char, char)] = &[ ('᠀', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('\u{202f}', '\u{202f}'), ('𑙠', '𑙬'), ]; pub const MRO: &'static [(char, char)] = &[('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩮', '𖩯')]; pub const MULTANI: &'static [(char, char)] = &[('੦', '੯'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊩')]; pub const MYANMAR: &'static [(char, char)] = &[('က', '႟'), ('꤮', '꤮'), ('ꧠ', 'ꧾ'), ('ꩠ', 'ꩿ')]; pub const NABATAEAN: &'static [(char, char)] = &[('𐢀', '𐢞'), ('𐢧', '𐢯')]; pub const NAG_MUNDARI: &'static [(char, char)] = &[('𞓐', '𞓹')]; pub const NANDINAGARI: &'static [(char, char)] = &[ ('।', '॥'), ('೦', '೯'), ('ᳩ', 'ᳩ'), ('ᳲ', 'ᳲ'), ('ᳺ', 'ᳺ'), ('꠰', '꠵'), ('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧤'), ]; pub 
const NEW_TAI_LUE: &'static [(char, char)] = &[('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧚'), ('᧞', '᧟')]; pub const NEWA: &'static [(char, char)] = &[('𑐀', '𑑛'), ('𑑝', '𑑡')]; pub const NKO: &'static [(char, char)] = &[ ('،', '،'), ('؛', '؛'), ('؟', '؟'), ('߀', 'ߺ'), ('\u{7fd}', '߿'), ('﴾', '﴿'), ]; pub const NUSHU: &'static [(char, char)] = &[('𖿡', '𖿡'), ('𛅰', '𛋻')]; pub const NYIAKENG_PUACHUE_HMONG: &'static [(char, char)] = &[('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅏')]; pub const OGHAM: &'static [(char, char)] = &[('\u{1680}', '᚜')]; pub const OL_CHIKI: &'static [(char, char)] = &[('᱐', '᱿')]; pub const OLD_HUNGARIAN: &'static [(char, char)] = &[('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐳺', '𐳿')]; pub const OLD_ITALIC: &'static [(char, char)] = &[('𐌀', '𐌣'), ('𐌭', '𐌯')]; pub const OLD_NORTH_ARABIAN: &'static [(char, char)] = &[('𐪀', '𐪟')]; pub const OLD_PERMIC: &'static [(char, char)] = &[('\u{483}', '\u{483}'), ('𐍐', '\u{1037a}')]; pub const OLD_PERSIAN: &'static [(char, char)] = &[('𐎠', '𐏃'), ('𐏈', '𐏕')]; pub const OLD_SOGDIAN: &'static [(char, char)] = &[('𐼀', '𐼧')]; pub const OLD_SOUTH_ARABIAN: &'static [(char, char)] = &[('𐩠', '𐩿')]; pub const OLD_TURKIC: &'static [(char, char)] = &[('𐰀', '𐱈')]; pub const OLD_UYGHUR: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐫲', '𐫲'), ('𐽰', '𐾉')]; pub const ORIYA: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('\u{b3c}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', '\u{b63}'), ('୦', '୷'), ('\u{1cda}', '\u{1cda}'), ('ᳲ', 'ᳲ'), ]; pub const OSAGE: &'static [(char, char)] = &[('𐒰', '𐓓'), ('𐓘', '𐓻')]; pub const OSMANYA: &'static [(char, char)] = &[('𐒀', '𐒝'), ('𐒠', '𐒩')]; pub const PAHAWH_HMONG: &'static [(char, char)] = &[('𖬀', '𖭅'), ('𖭐', '𖭙'), ('𖭛', '𖭡'), ('𖭣', '𖭷'), ('𖭽', '𖮏')]; pub const PALMYRENE: &'static [(char, char)] = &[('𐡠', '𐡿')]; pub const PAU_CIN_HAU: &'static [(char, char)] = 
&[('𑫀', '𑫸')]; pub const PHAGS_PA: &'static [(char, char)] = &[('᠂', '᠃'), ('᠅', '᠅'), ('ꡀ', '꡷')]; pub const PHOENICIAN: &'static [(char, char)] = &[('𐤀', '𐤛'), ('𐤟', '𐤟')]; pub const PSALTER_PAHLAVI: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐮀', '𐮑'), ('𐮙', '𐮜'), ('𐮩', '𐮯')]; pub const REJANG: &'static [(char, char)] = &[('ꤰ', '꥓'), ('꥟', '꥟')]; pub const RUNIC: &'static [(char, char)] = &[('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ')]; pub const SAMARITAN: &'static [(char, char)] = &[('ࠀ', '\u{82d}'), ('࠰', '࠾')]; pub const SAURASHTRA: &'static [(char, char)] = &[('ꢀ', '\u{a8c5}'), ('꣎', '꣙')]; pub const SHARADA: &'static [(char, char)] = &[ ('\u{951}', '\u{951}'), ('\u{1cd7}', '\u{1cd7}'), ('\u{1cd9}', '\u{1cd9}'), ('\u{1cdc}', '\u{1cdd}'), ('\u{1ce0}', '\u{1ce0}'), ('\u{11180}', '𑇟'), ]; pub const SHAVIAN: &'static [(char, char)] = &[('𐑐', '𐑿')]; pub const SIDDHAM: &'static [(char, char)] = &[('𑖀', '\u{115b5}'), ('𑖸', '\u{115dd}')]; pub const SIGNWRITING: &'static [(char, char)] = &[('𝠀', '𝪋'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}')]; pub const SINHALA: &'static [(char, char)] = &[ ('।', '॥'), ('\u{d81}', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('෦', '෯'), ('ෲ', '෴'), ('𑇡', '𑇴'), ]; pub const SOGDIAN: &'static [(char, char)] = &[('ـ', 'ـ'), ('𐼰', '𐽙')]; pub const SORA_SOMPENG: &'static [(char, char)] = &[('𑃐', '𑃨'), ('𑃰', '𑃹')]; pub const SOYOMBO: &'static [(char, char)] = &[('𑩐', '𑪢')]; pub const SUNDANESE: &'static [(char, char)] = &[('\u{1b80}', 'ᮿ'), ('᳀', '᳇')]; pub const SYLOTI_NAGRI: &'static [(char, char)] = &[('।', '॥'), ('০', '৯'), ('ꠀ', '\u{a82c}')]; pub const SYRIAC: &'static [(char, char)] = &[ ('،', '،'), ('؛', '\u{61c}'), ('؟', '؟'), ('ـ', 'ـ'), ('\u{64b}', '\u{655}'), ('\u{670}', '\u{670}'), ('܀', '܍'), ('\u{70f}', '\u{74a}'), ('ݍ', 'ݏ'), ('ࡠ', 'ࡪ'), ('\u{1df8}', '\u{1df8}'), ('\u{1dfa}', '\u{1dfa}'), ]; pub const TAGALOG: &'static 
[(char, char)] = &[('ᜀ', '᜕'), ('ᜟ', 'ᜟ'), ('᜵', '᜶')]; pub const TAGBANWA: &'static [(char, char)] = &[('᜵', '᜶'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}')]; pub const TAI_LE: &'static [(char, char)] = &[('၀', '၉'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ')]; pub const TAI_THAM: &'static [(char, char)] = &[ ('ᨠ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '᪉'), ('᪐', '᪙'), ('᪠', '᪭'), ]; pub const TAI_VIET: &'static [(char, char)] = &[('ꪀ', 'ꫂ'), ('ꫛ', '꫟')]; pub const TAKRI: &'static [(char, char)] = &[('।', '॥'), ('꠰', '꠹'), ('𑚀', '𑚹'), ('𑛀', '𑛉')]; pub const TAMIL: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('ௐ', 'ௐ'), ('\u{bd7}', '\u{bd7}'), ('௦', '௺'), ('\u{1cda}', '\u{1cda}'), ('ꣳ', 'ꣳ'), ('\u{11301}', '\u{11301}'), ('𑌃', '𑌃'), ('\u{1133b}', '\u{1133c}'), ('𑿀', '𑿱'), ('𑿿', '𑿿'), ]; pub const TANGSA: &'static [(char, char)] = &[('𖩰', '𖪾'), ('𖫀', '𖫉')]; pub const TANGUT: &'static [(char, char)] = &[('𖿠', '𖿠'), ('𗀀', '𘟷'), ('𘠀', '𘫿'), ('𘴀', '𘴈')]; pub const TELUGU: &'static [(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('\u{c00}', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('\u{c3c}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', '\u{c63}'), ('౦', '౯'), ('౷', '౿'), ('\u{1cda}', '\u{1cda}'), ('ᳲ', 'ᳲ'), ]; pub const THAANA: &'static [(char, char)] = &[ ('،', '،'), ('؛', '\u{61c}'), ('؟', '؟'), ('٠', '٩'), ('ހ', 'ޱ'), ('ﷲ', 'ﷲ'), ('﷽', '﷽'), ]; pub const THAI: &'static [(char, char)] = &[('ก', '\u{e3a}'), ('เ', '๛')]; pub const TIBETAN: &'static [(char, char)] = &[ ('ༀ', 'ཇ'), ('ཉ', 'ཬ'), ('\u{f71}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('྾', '࿌'), ('࿎', '࿔'), ('࿙', '࿚'), ]; pub const TIFINAGH: &'static [(char, char)] = &[('ⴰ', 'ⵧ'), ('ⵯ', '⵰'), ('\u{2d7f}', '\u{2d7f}')]; pub const TIRHUTA: &'static 
[(char, char)] = &[ ('\u{951}', '\u{952}'), ('।', '॥'), ('ᳲ', 'ᳲ'), ('꠰', '꠹'), ('𑒀', '𑓇'), ('𑓐', '𑓙'), ]; pub const TOTO: &'static [(char, char)] = &[('𞊐', '\u{1e2ae}')]; pub const UGARITIC: &'static [(char, char)] = &[('𐎀', '𐎝'), ('𐎟', '𐎟')]; pub const VAI: &'static [(char, char)] = &[('ꔀ', 'ꘫ')]; pub const VITHKUQI: &'static [(char, char)] = &[ ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ]; pub const WANCHO: &'static [(char, char)] = &[('𞋀', '𞋹'), ('𞋿', '𞋿')]; pub const WARANG_CITI: &'static [(char, char)] = &[('𑢠', '𑣲'), ('𑣿', '𑣿')]; pub const YEZIDI: &'static [(char, char)] = &[ ('،', '،'), ('؛', '؛'), ('؟', '؟'), ('٠', '٩'), ('𐺀', '𐺩'), ('\u{10eab}', '𐺭'), ('𐺰', '𐺱'), ]; pub const YI: &'static [(char, char)] = &[ ('、', '。'), ('〈', '】'), ('〔', '〛'), ('・', '・'), ('ꀀ', 'ꒌ'), ('꒐', '꓆'), ('。', '・'), ]; pub const ZANABAZAR_SQUARE: &'static [(char, char)] = &[('𑨀', '\u{11a47}')]; regex-syntax-0.8.2/src/unicode_tables/sentence_break.rs000064400000000000000000001455151046102023000213710ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate sentence-break ucd-15.0.0 --chars // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. 
pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ ("ATerm", ATERM), ("CR", CR), ("Close", CLOSE), ("Extend", EXTEND), ("Format", FORMAT), ("LF", LF), ("Lower", LOWER), ("Numeric", NUMERIC), ("OLetter", OLETTER), ("SContinue", SCONTINUE), ("STerm", STERM), ("Sep", SEP), ("Sp", SP), ("Upper", UPPER), ]; pub const ATERM: &'static [(char, char)] = &[('.', '.'), ('․', '․'), ('﹒', '﹒'), ('.', '.')]; pub const CR: &'static [(char, char)] = &[('\r', '\r')]; pub const CLOSE: &'static [(char, char)] = &[ ('"', '"'), ('\'', ')'), ('[', '['), (']', ']'), ('{', '{'), ('}', '}'), ('«', '«'), ('»', '»'), ('༺', '༽'), ('᚛', '᚜'), ('‘', '‟'), ('‹', '›'), ('⁅', '⁆'), ('⁽', '⁾'), ('₍', '₎'), ('⌈', '⌋'), ('〈', '〉'), ('❛', '❠'), ('❨', '❵'), ('⟅', '⟆'), ('⟦', '⟯'), ('⦃', '⦘'), ('⧘', '⧛'), ('⧼', '⧽'), ('⸀', '⸍'), ('⸜', '⸝'), ('⸠', '⸩'), ('⹂', '⹂'), ('⹕', '⹜'), ('〈', '】'), ('〔', '〛'), ('〝', '〟'), ('﴾', '﴿'), ('︗', '︘'), ('︵', '﹄'), ('﹇', '﹈'), ('﹙', '﹞'), ('(', ')'), ('[', '['), (']', ']'), ('{', '{'), ('}', '}'), ('⦅', '⦆'), ('「', '」'), ('🙶', '🙸'), ]; pub const EXTEND: &'static [(char, char)] = &[ ('\u{300}', '\u{36f}'), ('\u{483}', '\u{489}'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('\u{610}', '\u{61a}'), ('\u{64b}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dc}'), ('\u{6df}', '\u{6e4}'), ('\u{6e7}', '\u{6e8}'), ('\u{6ea}', '\u{6ed}'), ('\u{711}', '\u{711}'), ('\u{730}', '\u{74a}'), ('\u{7a6}', '\u{7b0}'), ('\u{7eb}', '\u{7f3}'), ('\u{7fd}', '\u{7fd}'), ('\u{816}', '\u{819}'), ('\u{81b}', '\u{823}'), ('\u{825}', '\u{827}'), ('\u{829}', '\u{82d}'), ('\u{859}', '\u{85b}'), ('\u{898}', '\u{89f}'), ('\u{8ca}', '\u{8e1}'), ('\u{8e3}', 'ः'), ('\u{93a}', '\u{93c}'), ('ा', 'ॏ'), ('\u{951}', '\u{957}'), ('\u{962}', '\u{963}'), ('\u{981}', 'ঃ'), ('\u{9bc}', '\u{9bc}'), ('\u{9be}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', '\u{9cd}'), ('\u{9d7}', '\u{9d7}'), ('\u{9e2}', '\u{9e3}'), ('\u{9fe}', 
'\u{9fe}'), ('\u{a01}', 'ਃ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('\u{a70}', '\u{a71}'), ('\u{a75}', '\u{a75}'), ('\u{a81}', 'ઃ'), ('\u{abc}', '\u{abc}'), ('ા', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('\u{ae2}', '\u{ae3}'), ('\u{afa}', '\u{aff}'), ('\u{b01}', 'ଃ'), ('\u{b3c}', '\u{b3c}'), ('\u{b3e}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('\u{b62}', '\u{b63}'), ('\u{b82}', '\u{b82}'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('\u{bd7}', '\u{bd7}'), ('\u{c00}', '\u{c04}'), ('\u{c3c}', '\u{c3c}'), ('\u{c3e}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('\u{c62}', '\u{c63}'), ('\u{c81}', 'ಃ'), ('\u{cbc}', '\u{cbc}'), ('ಾ', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('\u{ce2}', '\u{ce3}'), ('ೳ', 'ೳ'), ('\u{d00}', 'ഃ'), ('\u{d3b}', '\u{d3c}'), ('\u{d3e}', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', '\u{d4d}'), ('\u{d57}', '\u{d57}'), ('\u{d62}', '\u{d63}'), ('\u{d81}', 'ඃ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('ෲ', 'ෳ'), ('\u{e31}', '\u{e31}'), ('\u{e34}', '\u{e3a}'), ('\u{e47}', '\u{e4e}'), ('\u{eb1}', '\u{eb1}'), ('\u{eb4}', '\u{ebc}'), ('\u{ec8}', '\u{ece}'), ('\u{f18}', '\u{f19}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('༾', '༿'), ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f87}'), ('\u{f8d}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('ါ', '\u{103e}'), ('ၖ', '\u{1059}'), ('\u{105e}', '\u{1060}'), ('ၢ', 'ၤ'), ('ၧ', 'ၭ'), ('\u{1071}', '\u{1074}'), ('\u{1082}', '\u{108d}'), ('ႏ', 'ႏ'), ('ႚ', '\u{109d}'), ('\u{135d}', '\u{135f}'), ('\u{1712}', '᜕'), ('\u{1732}', '᜴'), ('\u{1752}', '\u{1753}'), ('\u{1772}', '\u{1773}'), ('\u{17b4}', '\u{17d3}'), ('\u{17dd}', '\u{17dd}'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '\u{180f}'), ('\u{1885}', '\u{1886}'), ('\u{18a9}', '\u{18a9}'), ('\u{1920}', 'ᤫ'), ('ᤰ', 
'\u{193b}'), ('\u{1a17}', '\u{1a1b}'), ('ᩕ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a7f}'), ('\u{1ab0}', '\u{1ace}'), ('\u{1b00}', 'ᬄ'), ('\u{1b34}', '᭄'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', 'ᮂ'), ('ᮡ', '\u{1bad}'), ('\u{1be6}', '᯳'), ('ᰤ', '\u{1c37}'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('᳷', '\u{1cf9}'), ('\u{1dc0}', '\u{1dff}'), ('\u{200c}', '\u{200d}'), ('\u{20d0}', '\u{20f0}'), ('\u{2cef}', '\u{2cf1}'), ('\u{2d7f}', '\u{2d7f}'), ('\u{2de0}', '\u{2dff}'), ('\u{302a}', '\u{302f}'), ('\u{3099}', '\u{309a}'), ('\u{a66f}', '\u{a672}'), ('\u{a674}', '\u{a67d}'), ('\u{a69e}', '\u{a69f}'), ('\u{a6f0}', '\u{a6f1}'), ('\u{a802}', '\u{a802}'), ('\u{a806}', '\u{a806}'), ('\u{a80b}', '\u{a80b}'), ('ꠣ', 'ꠧ'), ('\u{a82c}', '\u{a82c}'), ('ꢀ', 'ꢁ'), ('ꢴ', '\u{a8c5}'), ('\u{a8e0}', '\u{a8f1}'), ('\u{a8ff}', '\u{a8ff}'), ('\u{a926}', '\u{a92d}'), ('\u{a947}', '꥓'), ('\u{a980}', 'ꦃ'), ('\u{a9b3}', '꧀'), ('\u{a9e5}', '\u{a9e5}'), ('\u{aa29}', '\u{aa36}'), ('\u{aa43}', '\u{aa43}'), ('\u{aa4c}', 'ꩍ'), ('ꩻ', 'ꩽ'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabf}'), ('\u{aac1}', '\u{aac1}'), ('ꫫ', 'ꫯ'), ('ꫵ', '\u{aaf6}'), ('ꯣ', 'ꯪ'), ('꯬', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('\u{ff9e}', '\u{ff9f}'), ('\u{101fd}', '\u{101fd}'), ('\u{102e0}', '\u{102e0}'), ('\u{10376}', '\u{1037a}'), ('\u{10a01}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '\u{10a0f}'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10ae5}', '\u{10ae6}'), ('\u{10d24}', '\u{10d27}'), ('\u{10eab}', '\u{10eac}'), ('\u{10efd}', '\u{10eff}'), ('\u{10f46}', '\u{10f50}'), ('\u{10f82}', '\u{10f85}'), ('𑀀', '𑀂'), ('\u{11038}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{11073}', '\u{11074}'), ('\u{1107f}', '𑂂'), ('𑂰', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('\u{11100}', '\u{11102}'), ('\u{11127}', 
'\u{11134}'), ('𑅅', '𑅆'), ('\u{11173}', '\u{11173}'), ('\u{11180}', '𑆂'), ('𑆳', '𑇀'), ('\u{111c9}', '\u{111cc}'), ('𑇎', '\u{111cf}'), ('𑈬', '\u{11237}'), ('\u{1123e}', '\u{1123e}'), ('\u{11241}', '\u{11241}'), ('\u{112df}', '\u{112ea}'), ('\u{11300}', '𑌃'), ('\u{1133b}', '\u{1133c}'), ('\u{1133e}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('\u{11357}', '\u{11357}'), ('𑍢', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑐵', '\u{11446}'), ('\u{1145e}', '\u{1145e}'), ('\u{114b0}', '\u{114c3}'), ('\u{115af}', '\u{115b5}'), ('𑖸', '\u{115c0}'), ('\u{115dc}', '\u{115dd}'), ('𑘰', '\u{11640}'), ('\u{116ab}', '\u{116b7}'), ('\u{1171d}', '\u{1172b}'), ('𑠬', '\u{1183a}'), ('\u{11930}', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{1193e}'), ('𑥀', '𑥀'), ('𑥂', '\u{11943}'), ('𑧑', '\u{119d7}'), ('\u{119da}', '\u{119e0}'), ('𑧤', '𑧤'), ('\u{11a01}', '\u{11a0a}'), ('\u{11a33}', '𑨹'), ('\u{11a3b}', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a51}', '\u{11a5b}'), ('\u{11a8a}', '\u{11a99}'), ('𑰯', '\u{11c36}'), ('\u{11c38}', '\u{11c3f}'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('\u{11d31}', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d45}'), ('\u{11d47}', '\u{11d47}'), ('𑶊', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '\u{11d97}'), ('\u{11ef3}', '𑻶'), ('\u{11f00}', '\u{11f01}'), ('𑼃', '𑼃'), ('𑼴', '\u{11f3a}'), ('𑼾', '\u{11f42}'), ('\u{13440}', '\u{13440}'), ('\u{13447}', '\u{13455}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b30}', '\u{16b36}'), ('\u{16f4f}', '\u{16f4f}'), ('𖽑', '𖾇'), ('\u{16f8f}', '\u{16f92}'), ('\u{16fe4}', '\u{16fe4}'), ('𖿰', '𖿱'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d169}'), ('𝅭', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', 
'\u{1daaf}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('\u{1e08f}', '\u{1e08f}'), ('\u{1e130}', '\u{1e136}'), ('\u{1e2ae}', '\u{1e2ae}'), ('\u{1e2ec}', '\u{1e2ef}'), ('\u{1e4ec}', '\u{1e4ef}'), ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1e944}', '\u{1e94a}'), ('\u{e0020}', '\u{e007f}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const FORMAT: &'static [(char, char)] = &[ ('\u{ad}', '\u{ad}'), ('\u{600}', '\u{605}'), ('\u{61c}', '\u{61c}'), ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), ('\u{890}', '\u{891}'), ('\u{8e2}', '\u{8e2}'), ('\u{180e}', '\u{180e}'), ('\u{200b}', '\u{200b}'), ('\u{200e}', '\u{200f}'), ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{2064}'), ('\u{2066}', '\u{206f}'), ('\u{feff}', '\u{feff}'), ('\u{fff9}', '\u{fffb}'), ('\u{110bd}', '\u{110bd}'), ('\u{110cd}', '\u{110cd}'), ('\u{13430}', '\u{1343f}'), ('\u{1bca0}', '\u{1bca3}'), ('\u{1d173}', '\u{1d17a}'), ('\u{e0001}', '\u{e0001}'), ]; pub const LF: &'static [(char, char)] = &[('\n', '\n')]; pub const LOWER: &'static [(char, char)] = &[ ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('ß', 'ö'), ('ø', 'ÿ'), ('ā', 'ā'), ('ă', 'ă'), ('ą', 'ą'), ('ć', 'ć'), ('ĉ', 'ĉ'), ('ċ', 'ċ'), ('č', 'č'), ('ď', 'ď'), ('đ', 'đ'), ('ē', 'ē'), ('ĕ', 'ĕ'), ('ė', 'ė'), ('ę', 'ę'), ('ě', 'ě'), ('ĝ', 'ĝ'), ('ğ', 'ğ'), ('ġ', 'ġ'), ('ģ', 'ģ'), ('ĥ', 'ĥ'), ('ħ', 'ħ'), ('ĩ', 'ĩ'), ('ī', 'ī'), ('ĭ', 'ĭ'), ('į', 'į'), ('ı', 'ı'), ('ij', 'ij'), ('ĵ', 'ĵ'), ('ķ', 'ĸ'), ('ĺ', 'ĺ'), ('ļ', 'ļ'), ('ľ', 'ľ'), ('ŀ', 'ŀ'), ('ł', 'ł'), ('ń', 'ń'), ('ņ', 'ņ'), ('ň', 'ʼn'), ('ŋ', 'ŋ'), ('ō', 'ō'), ('ŏ', 'ŏ'), ('ő', 'ő'), ('œ', 'œ'), ('ŕ', 'ŕ'), ('ŗ', 'ŗ'), ('ř', 'ř'), ('ś', 'ś'), ('ŝ', 'ŝ'), ('ş', 'ş'), ('š', 'š'), ('ţ', 'ţ'), ('ť', 'ť'), ('ŧ', 'ŧ'), ('ũ', 'ũ'), ('ū', 'ū'), ('ŭ', 'ŭ'), ('ů', 'ů'), ('ű', 'ű'), ('ų', 'ų'), ('ŵ', 'ŵ'), ('ŷ', 'ŷ'), ('ź', 'ź'), ('ż', 'ż'), ('ž', 'ƀ'), ('ƃ', 'ƃ'), ('ƅ', 'ƅ'), ('ƈ', 'ƈ'), ('ƌ', 'ƍ'), ('ƒ', 'ƒ'), ('ƕ', 'ƕ'), 
('ƙ', 'ƛ'), ('ƞ', 'ƞ'), ('ơ', 'ơ'), ('ƣ', 'ƣ'), ('ƥ', 'ƥ'), ('ƨ', 'ƨ'), ('ƪ', 'ƫ'), ('ƭ', 'ƭ'), ('ư', 'ư'), ('ƴ', 'ƴ'), ('ƶ', 'ƶ'), ('ƹ', 'ƺ'), ('ƽ', 'ƿ'), ('dž', 'dž'), ('lj', 'lj'), ('nj', 'nj'), ('ǎ', 'ǎ'), ('ǐ', 'ǐ'), ('ǒ', 'ǒ'), ('ǔ', 'ǔ'), ('ǖ', 'ǖ'), ('ǘ', 'ǘ'), ('ǚ', 'ǚ'), ('ǜ', 'ǝ'), ('ǟ', 'ǟ'), ('ǡ', 'ǡ'), ('ǣ', 'ǣ'), ('ǥ', 'ǥ'), ('ǧ', 'ǧ'), ('ǩ', 'ǩ'), ('ǫ', 'ǫ'), ('ǭ', 'ǭ'), ('ǯ', 'ǰ'), ('dz', 'dz'), ('ǵ', 'ǵ'), ('ǹ', 'ǹ'), ('ǻ', 'ǻ'), ('ǽ', 'ǽ'), ('ǿ', 'ǿ'), ('ȁ', 'ȁ'), ('ȃ', 'ȃ'), ('ȅ', 'ȅ'), ('ȇ', 'ȇ'), ('ȉ', 'ȉ'), ('ȋ', 'ȋ'), ('ȍ', 'ȍ'), ('ȏ', 'ȏ'), ('ȑ', 'ȑ'), ('ȓ', 'ȓ'), ('ȕ', 'ȕ'), ('ȗ', 'ȗ'), ('ș', 'ș'), ('ț', 'ț'), ('ȝ', 'ȝ'), ('ȟ', 'ȟ'), ('ȡ', 'ȡ'), ('ȣ', 'ȣ'), ('ȥ', 'ȥ'), ('ȧ', 'ȧ'), ('ȩ', 'ȩ'), ('ȫ', 'ȫ'), ('ȭ', 'ȭ'), ('ȯ', 'ȯ'), ('ȱ', 'ȱ'), ('ȳ', 'ȹ'), ('ȼ', 'ȼ'), ('ȿ', 'ɀ'), ('ɂ', 'ɂ'), ('ɇ', 'ɇ'), ('ɉ', 'ɉ'), ('ɋ', 'ɋ'), ('ɍ', 'ɍ'), ('ɏ', 'ʓ'), ('ʕ', 'ʸ'), ('ˀ', 'ˁ'), ('ˠ', 'ˤ'), ('ͱ', 'ͱ'), ('ͳ', 'ͳ'), ('ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('ΐ', 'ΐ'), ('ά', 'ώ'), ('ϐ', 'ϑ'), ('ϕ', 'ϗ'), ('ϙ', 'ϙ'), ('ϛ', 'ϛ'), ('ϝ', 'ϝ'), ('ϟ', 'ϟ'), ('ϡ', 'ϡ'), ('ϣ', 'ϣ'), ('ϥ', 'ϥ'), ('ϧ', 'ϧ'), ('ϩ', 'ϩ'), ('ϫ', 'ϫ'), ('ϭ', 'ϭ'), ('ϯ', 'ϳ'), ('ϵ', 'ϵ'), ('ϸ', 'ϸ'), ('ϻ', 'ϼ'), ('а', 'џ'), ('ѡ', 'ѡ'), ('ѣ', 'ѣ'), ('ѥ', 'ѥ'), ('ѧ', 'ѧ'), ('ѩ', 'ѩ'), ('ѫ', 'ѫ'), ('ѭ', 'ѭ'), ('ѯ', 'ѯ'), ('ѱ', 'ѱ'), ('ѳ', 'ѳ'), ('ѵ', 'ѵ'), ('ѷ', 'ѷ'), ('ѹ', 'ѹ'), ('ѻ', 'ѻ'), ('ѽ', 'ѽ'), ('ѿ', 'ѿ'), ('ҁ', 'ҁ'), ('ҋ', 'ҋ'), ('ҍ', 'ҍ'), ('ҏ', 'ҏ'), ('ґ', 'ґ'), ('ғ', 'ғ'), ('ҕ', 'ҕ'), ('җ', 'җ'), ('ҙ', 'ҙ'), ('қ', 'қ'), ('ҝ', 'ҝ'), ('ҟ', 'ҟ'), ('ҡ', 'ҡ'), ('ң', 'ң'), ('ҥ', 'ҥ'), ('ҧ', 'ҧ'), ('ҩ', 'ҩ'), ('ҫ', 'ҫ'), ('ҭ', 'ҭ'), ('ү', 'ү'), ('ұ', 'ұ'), ('ҳ', 'ҳ'), ('ҵ', 'ҵ'), ('ҷ', 'ҷ'), ('ҹ', 'ҹ'), ('һ', 'һ'), ('ҽ', 'ҽ'), ('ҿ', 'ҿ'), ('ӂ', 'ӂ'), ('ӄ', 'ӄ'), ('ӆ', 'ӆ'), ('ӈ', 'ӈ'), ('ӊ', 'ӊ'), ('ӌ', 'ӌ'), ('ӎ', 'ӏ'), ('ӑ', 'ӑ'), ('ӓ', 'ӓ'), ('ӕ', 'ӕ'), ('ӗ', 'ӗ'), ('ә', 'ә'), ('ӛ', 'ӛ'), ('ӝ', 'ӝ'), ('ӟ', 'ӟ'), ('ӡ', 'ӡ'), ('ӣ', 'ӣ'), ('ӥ', 'ӥ'), ('ӧ', 'ӧ'), ('ө', 'ө'), ('ӫ', 'ӫ'), ('ӭ', 'ӭ'), 
('ӯ', 'ӯ'), ('ӱ', 'ӱ'), ('ӳ', 'ӳ'), ('ӵ', 'ӵ'), ('ӷ', 'ӷ'), ('ӹ', 'ӹ'), ('ӻ', 'ӻ'), ('ӽ', 'ӽ'), ('ӿ', 'ӿ'), ('ԁ', 'ԁ'), ('ԃ', 'ԃ'), ('ԅ', 'ԅ'), ('ԇ', 'ԇ'), ('ԉ', 'ԉ'), ('ԋ', 'ԋ'), ('ԍ', 'ԍ'), ('ԏ', 'ԏ'), ('ԑ', 'ԑ'), ('ԓ', 'ԓ'), ('ԕ', 'ԕ'), ('ԗ', 'ԗ'), ('ԙ', 'ԙ'), ('ԛ', 'ԛ'), ('ԝ', 'ԝ'), ('ԟ', 'ԟ'), ('ԡ', 'ԡ'), ('ԣ', 'ԣ'), ('ԥ', 'ԥ'), ('ԧ', 'ԧ'), ('ԩ', 'ԩ'), ('ԫ', 'ԫ'), ('ԭ', 'ԭ'), ('ԯ', 'ԯ'), ('ՠ', 'ֈ'), ('ჼ', 'ჼ'), ('ᏸ', 'ᏽ'), ('ᲀ', 'ᲈ'), ('ᴀ', 'ᶿ'), ('ḁ', 'ḁ'), ('ḃ', 'ḃ'), ('ḅ', 'ḅ'), ('ḇ', 'ḇ'), ('ḉ', 'ḉ'), ('ḋ', 'ḋ'), ('ḍ', 'ḍ'), ('ḏ', 'ḏ'), ('ḑ', 'ḑ'), ('ḓ', 'ḓ'), ('ḕ', 'ḕ'), ('ḗ', 'ḗ'), ('ḙ', 'ḙ'), ('ḛ', 'ḛ'), ('ḝ', 'ḝ'), ('ḟ', 'ḟ'), ('ḡ', 'ḡ'), ('ḣ', 'ḣ'), ('ḥ', 'ḥ'), ('ḧ', 'ḧ'), ('ḩ', 'ḩ'), ('ḫ', 'ḫ'), ('ḭ', 'ḭ'), ('ḯ', 'ḯ'), ('ḱ', 'ḱ'), ('ḳ', 'ḳ'), ('ḵ', 'ḵ'), ('ḷ', 'ḷ'), ('ḹ', 'ḹ'), ('ḻ', 'ḻ'), ('ḽ', 'ḽ'), ('ḿ', 'ḿ'), ('ṁ', 'ṁ'), ('ṃ', 'ṃ'), ('ṅ', 'ṅ'), ('ṇ', 'ṇ'), ('ṉ', 'ṉ'), ('ṋ', 'ṋ'), ('ṍ', 'ṍ'), ('ṏ', 'ṏ'), ('ṑ', 'ṑ'), ('ṓ', 'ṓ'), ('ṕ', 'ṕ'), ('ṗ', 'ṗ'), ('ṙ', 'ṙ'), ('ṛ', 'ṛ'), ('ṝ', 'ṝ'), ('ṟ', 'ṟ'), ('ṡ', 'ṡ'), ('ṣ', 'ṣ'), ('ṥ', 'ṥ'), ('ṧ', 'ṧ'), ('ṩ', 'ṩ'), ('ṫ', 'ṫ'), ('ṭ', 'ṭ'), ('ṯ', 'ṯ'), ('ṱ', 'ṱ'), ('ṳ', 'ṳ'), ('ṵ', 'ṵ'), ('ṷ', 'ṷ'), ('ṹ', 'ṹ'), ('ṻ', 'ṻ'), ('ṽ', 'ṽ'), ('ṿ', 'ṿ'), ('ẁ', 'ẁ'), ('ẃ', 'ẃ'), ('ẅ', 'ẅ'), ('ẇ', 'ẇ'), ('ẉ', 'ẉ'), ('ẋ', 'ẋ'), ('ẍ', 'ẍ'), ('ẏ', 'ẏ'), ('ẑ', 'ẑ'), ('ẓ', 'ẓ'), ('ẕ', 'ẝ'), ('ẟ', 'ẟ'), ('ạ', 'ạ'), ('ả', 'ả'), ('ấ', 'ấ'), ('ầ', 'ầ'), ('ẩ', 'ẩ'), ('ẫ', 'ẫ'), ('ậ', 'ậ'), ('ắ', 'ắ'), ('ằ', 'ằ'), ('ẳ', 'ẳ'), ('ẵ', 'ẵ'), ('ặ', 'ặ'), ('ẹ', 'ẹ'), ('ẻ', 'ẻ'), ('ẽ', 'ẽ'), ('ế', 'ế'), ('ề', 'ề'), ('ể', 'ể'), ('ễ', 'ễ'), ('ệ', 'ệ'), ('ỉ', 'ỉ'), ('ị', 'ị'), ('ọ', 'ọ'), ('ỏ', 'ỏ'), ('ố', 'ố'), ('ồ', 'ồ'), ('ổ', 'ổ'), ('ỗ', 'ỗ'), ('ộ', 'ộ'), ('ớ', 'ớ'), ('ờ', 'ờ'), ('ở', 'ở'), ('ỡ', 'ỡ'), ('ợ', 'ợ'), ('ụ', 'ụ'), ('ủ', 'ủ'), ('ứ', 'ứ'), ('ừ', 'ừ'), ('ử', 'ử'), ('ữ', 'ữ'), ('ự', 'ự'), ('ỳ', 'ỳ'), ('ỵ', 'ỵ'), ('ỷ', 'ỷ'), ('ỹ', 'ỹ'), ('ỻ', 'ỻ'), ('ỽ', 'ỽ'), ('ỿ', 'ἇ'), ('ἐ', 'ἕ'), ('ἠ', 'ἧ'), ('ἰ', 'ἷ'), ('ὀ', 'ὅ'), ('ὐ', 
'ὗ'), ('ὠ', 'ὧ'), ('ὰ', 'ώ'), ('ᾀ', 'ᾇ'), ('ᾐ', 'ᾗ'), ('ᾠ', 'ᾧ'), ('ᾰ', 'ᾴ'), ('ᾶ', 'ᾷ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῇ'), ('ῐ', 'ΐ'), ('ῖ', 'ῗ'), ('ῠ', 'ῧ'), ('ῲ', 'ῴ'), ('ῶ', 'ῷ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℊ', 'ℊ'), ('ℎ', 'ℏ'), ('ℓ', 'ℓ'), ('ℯ', 'ℯ'), ('ℴ', 'ℴ'), ('ℹ', 'ℹ'), ('ℼ', 'ℽ'), ('ⅆ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('ⅰ', 'ⅿ'), ('ↄ', 'ↄ'), ('ⓐ', 'ⓩ'), ('ⰰ', 'ⱟ'), ('ⱡ', 'ⱡ'), ('ⱥ', 'ⱦ'), ('ⱨ', 'ⱨ'), ('ⱪ', 'ⱪ'), ('ⱬ', 'ⱬ'), ('ⱱ', 'ⱱ'), ('ⱳ', 'ⱴ'), ('ⱶ', 'ⱽ'), ('ⲁ', 'ⲁ'), ('ⲃ', 'ⲃ'), ('ⲅ', 'ⲅ'), ('ⲇ', 'ⲇ'), ('ⲉ', 'ⲉ'), ('ⲋ', 'ⲋ'), ('ⲍ', 'ⲍ'), ('ⲏ', 'ⲏ'), ('ⲑ', 'ⲑ'), ('ⲓ', 'ⲓ'), ('ⲕ', 'ⲕ'), ('ⲗ', 'ⲗ'), ('ⲙ', 'ⲙ'), ('ⲛ', 'ⲛ'), ('ⲝ', 'ⲝ'), ('ⲟ', 'ⲟ'), ('ⲡ', 'ⲡ'), ('ⲣ', 'ⲣ'), ('ⲥ', 'ⲥ'), ('ⲧ', 'ⲧ'), ('ⲩ', 'ⲩ'), ('ⲫ', 'ⲫ'), ('ⲭ', 'ⲭ'), ('ⲯ', 'ⲯ'), ('ⲱ', 'ⲱ'), ('ⲳ', 'ⲳ'), ('ⲵ', 'ⲵ'), ('ⲷ', 'ⲷ'), ('ⲹ', 'ⲹ'), ('ⲻ', 'ⲻ'), ('ⲽ', 'ⲽ'), ('ⲿ', 'ⲿ'), ('ⳁ', 'ⳁ'), ('ⳃ', 'ⳃ'), ('ⳅ', 'ⳅ'), ('ⳇ', 'ⳇ'), ('ⳉ', 'ⳉ'), ('ⳋ', 'ⳋ'), ('ⳍ', 'ⳍ'), ('ⳏ', 'ⳏ'), ('ⳑ', 'ⳑ'), ('ⳓ', 'ⳓ'), ('ⳕ', 'ⳕ'), ('ⳗ', 'ⳗ'), ('ⳙ', 'ⳙ'), ('ⳛ', 'ⳛ'), ('ⳝ', 'ⳝ'), ('ⳟ', 'ⳟ'), ('ⳡ', 'ⳡ'), ('ⳣ', 'ⳤ'), ('ⳬ', 'ⳬ'), ('ⳮ', 'ⳮ'), ('ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ꙁ', 'ꙁ'), ('ꙃ', 'ꙃ'), ('ꙅ', 'ꙅ'), ('ꙇ', 'ꙇ'), ('ꙉ', 'ꙉ'), ('ꙋ', 'ꙋ'), ('ꙍ', 'ꙍ'), ('ꙏ', 'ꙏ'), ('ꙑ', 'ꙑ'), ('ꙓ', 'ꙓ'), ('ꙕ', 'ꙕ'), ('ꙗ', 'ꙗ'), ('ꙙ', 'ꙙ'), ('ꙛ', 'ꙛ'), ('ꙝ', 'ꙝ'), ('ꙟ', 'ꙟ'), ('ꙡ', 'ꙡ'), ('ꙣ', 'ꙣ'), ('ꙥ', 'ꙥ'), ('ꙧ', 'ꙧ'), ('ꙩ', 'ꙩ'), ('ꙫ', 'ꙫ'), ('ꙭ', 'ꙭ'), ('ꚁ', 'ꚁ'), ('ꚃ', 'ꚃ'), ('ꚅ', 'ꚅ'), ('ꚇ', 'ꚇ'), ('ꚉ', 'ꚉ'), ('ꚋ', 'ꚋ'), ('ꚍ', 'ꚍ'), ('ꚏ', 'ꚏ'), ('ꚑ', 'ꚑ'), ('ꚓ', 'ꚓ'), ('ꚕ', 'ꚕ'), ('ꚗ', 'ꚗ'), ('ꚙ', 'ꚙ'), ('ꚛ', 'ꚝ'), ('ꜣ', 'ꜣ'), ('ꜥ', 'ꜥ'), ('ꜧ', 'ꜧ'), ('ꜩ', 'ꜩ'), ('ꜫ', 'ꜫ'), ('ꜭ', 'ꜭ'), ('ꜯ', 'ꜱ'), ('ꜳ', 'ꜳ'), ('ꜵ', 'ꜵ'), ('ꜷ', 'ꜷ'), ('ꜹ', 'ꜹ'), ('ꜻ', 'ꜻ'), ('ꜽ', 'ꜽ'), ('ꜿ', 'ꜿ'), ('ꝁ', 'ꝁ'), ('ꝃ', 'ꝃ'), ('ꝅ', 'ꝅ'), ('ꝇ', 'ꝇ'), ('ꝉ', 'ꝉ'), ('ꝋ', 'ꝋ'), ('ꝍ', 'ꝍ'), ('ꝏ', 'ꝏ'), ('ꝑ', 'ꝑ'), ('ꝓ', 'ꝓ'), ('ꝕ', 'ꝕ'), ('ꝗ', 'ꝗ'), ('ꝙ', 'ꝙ'), ('ꝛ', 'ꝛ'), ('ꝝ', 'ꝝ'), ('ꝟ', 'ꝟ'), ('ꝡ', 'ꝡ'), ('ꝣ', 'ꝣ'), ('ꝥ', 'ꝥ'), ('ꝧ', 'ꝧ'), 
('ꝩ', 'ꝩ'), ('ꝫ', 'ꝫ'), ('ꝭ', 'ꝭ'), ('ꝯ', 'ꝸ'), ('ꝺ', 'ꝺ'), ('ꝼ', 'ꝼ'), ('ꝿ', 'ꝿ'), ('ꞁ', 'ꞁ'), ('ꞃ', 'ꞃ'), ('ꞅ', 'ꞅ'), ('ꞇ', 'ꞇ'), ('ꞌ', 'ꞌ'), ('ꞎ', 'ꞎ'), ('ꞑ', 'ꞑ'), ('ꞓ', 'ꞕ'), ('ꞗ', 'ꞗ'), ('ꞙ', 'ꞙ'), ('ꞛ', 'ꞛ'), ('ꞝ', 'ꞝ'), ('ꞟ', 'ꞟ'), ('ꞡ', 'ꞡ'), ('ꞣ', 'ꞣ'), ('ꞥ', 'ꞥ'), ('ꞧ', 'ꞧ'), ('ꞩ', 'ꞩ'), ('ꞯ', 'ꞯ'), ('ꞵ', 'ꞵ'), ('ꞷ', 'ꞷ'), ('ꞹ', 'ꞹ'), ('ꞻ', 'ꞻ'), ('ꞽ', 'ꞽ'), ('ꞿ', 'ꞿ'), ('ꟁ', 'ꟁ'), ('ꟃ', 'ꟃ'), ('ꟈ', 'ꟈ'), ('ꟊ', 'ꟊ'), ('ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟕ'), ('ꟗ', 'ꟗ'), ('ꟙ', 'ꟙ'), ('ꟲ', 'ꟴ'), ('ꟶ', 'ꟶ'), ('ꟸ', 'ꟺ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꮿ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('a', 'z'), ('𐐨', '𐑏'), ('𐓘', '𐓻'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐞀', '𐞀'), ('𐞃', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐳀', '𐳲'), ('𑣀', '𑣟'), ('𖹠', '𖹿'), ('𝐚', '𝐳'), ('𝑎', '𝑔'), ('𝑖', '𝑧'), ('𝒂', '𝒛'), ('𝒶', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝓏'), ('𝓪', '𝔃'), ('𝔞', '𝔷'), ('𝕒', '𝕫'), ('𝖆', '𝖟'), ('𝖺', '𝗓'), ('𝗮', '𝘇'), ('𝘢', '𝘻'), ('𝙖', '𝙯'), ('𝚊', '𝚥'), ('𝛂', '𝛚'), ('𝛜', '𝛡'), ('𝛼', '𝜔'), ('𝜖', '𝜛'), ('𝜶', '𝝎'), ('𝝐', '𝝕'), ('𝝰', '𝞈'), ('𝞊', '𝞏'), ('𝞪', '𝟂'), ('𝟄', '𝟉'), ('𝟋', '𝟋'), ('𝼀', '𝼉'), ('𝼋', '𝼞'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('𞤢', '𞥃'), ]; pub const NUMERIC: &'static [(char, char)] = &[ ('0', '9'), ('٠', '٩'), ('٫', '٬'), ('۰', '۹'), ('߀', '߉'), ('०', '९'), ('০', '৯'), ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), ('௦', '௯'), ('౦', '౯'), ('೦', '೯'), ('൦', '൯'), ('෦', '෯'), ('๐', '๙'), ('໐', '໙'), ('༠', '༩'), ('၀', '၉'), ('႐', '႙'), ('០', '៩'), ('᠐', '᠙'), ('᥆', '᥏'), ('᧐', '᧙'), ('᪀', '᪉'), ('᪐', '᪙'), ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), ('᱐', '᱙'), ('꘠', '꘩'), ('꣐', '꣙'), ('꤀', '꤉'), ('꧐', '꧙'), ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), ('0', '9'), ('𐒠', '𐒩'), ('𐴰', '𐴹'), ('𑁦', '𑁯'), ('𑃰', '𑃹'), ('𑄶', '𑄿'), ('𑇐', '𑇙'), ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), ('𑙐', '𑙙'), ('𑛀', '𑛉'), ('𑜰', '𑜹'), ('𑣠', '𑣩'), ('𑥐', '𑥙'), ('𑱐', '𑱙'), ('𑵐', '𑵙'), ('𑶠', '𑶩'), ('𑽐', '𑽙'), ('𖩠', '𖩩'), ('𖫀', '𖫉'), ('𖭐', '𖭙'), ('𝟎', '𝟿'), ('𞅀', '𞅉'), ('𞋰', '𞋹'), ('𞓰', '𞓹'), ('𞥐', '𞥙'), ('🯰', '🯹'), ]; pub const 
OLETTER: &'static [(char, char)] = &[ ('ƻ', 'ƻ'), ('ǀ', 'ǃ'), ('ʔ', 'ʔ'), ('ʹ', 'ʿ'), ('ˆ', 'ˑ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('ʹ', 'ʹ'), ('ՙ', 'ՙ'), ('א', 'ת'), ('ׯ', '׳'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('ࢠ', 'ࣉ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 'ૹ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೝ', 'ೞ'), ('ೠ', 'ೡ'), ('ೱ', 'ೲ'), ('ഄ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('ก', 'ะ'), ('า', 'ำ'), ('เ', 'ๆ'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ະ'), ('າ', 'ຳ'), ('ຽ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('က', 'ဪ'), ('ဿ', 'ဿ'), ('ၐ', 'ၕ'), ('ၚ', 'ၝ'), ('ၡ', 'ၡ'), ('ၥ', 'ၦ'), ('ၮ', 'ၰ'), ('ၵ', 'ႁ'), ('ႎ', 'ႎ'), ('ა', 'ჺ'), ('ჽ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 
'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜑ'), ('ᜟ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ក', 'ឳ'), ('ៗ', 'ៗ'), ('ៜ', 'ៜ'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᥐ', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('ᨀ', 'ᨖ'), ('ᨠ', 'ᩔ'), ('ᪧ', 'ᪧ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭌ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᳺ', 'ᳺ'), ('ℵ', 'ℸ'), ('ↀ', 'ↂ'), ('ↅ', 'ↈ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('ⸯ', 'ⸯ'), ('々', '〇'), ('〡', '〩'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('ꙮ', 'ꙮ'), ('ꙿ', 'ꙿ'), ('ꚠ', 'ꛯ'), ('ꜗ', 'ꜟ'), ('ꞈ', 'ꞈ'), ('ꞏ', 'ꞏ'), ('ꟷ', 'ꟷ'), ('ꟻ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣾ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꧠ', 'ꧤ'), ('ꧦ', 'ꧯ'), ('ꧺ', 'ꧾ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꩺ'), ('ꩾ', 'ꪯ'), ('ꪱ', 'ꪱ'), ('ꪵ', 'ꪶ'), ('ꪹ', 'ꪽ'), ('ꫀ', 'ꫀ'), ('ꫂ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꯀ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('יִ', 'יִ'), ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('ヲ', 'ン'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), 
('𐑐', '𐒝'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞁', '𐞂'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐴀', '𐴣'), ('𐺀', '𐺩'), ('𐺰', '𐺱'), ('𐼀', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '𐽅'), ('𐽰', '𐾁'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀃', '𑀷'), ('𑁱', '𑁲'), ('𑁵', '𑁵'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅄', '𑅄'), ('𑅇', '𑅇'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑈿', '𑉀'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', '𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑑟', '𑑡'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑚸', '𑚸'), ('𑜀', '𑜚'), ('𑝀', '𑝆'), ('𑠀', '𑠫'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤯'), ('𑤿', '𑤿'), ('𑥁', '𑥁'), ('𑦠', '𑦧'), ('𑦪', '𑧐'), ('𑧡', '𑧡'), ('𑧣', '𑧣'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪉'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶉'), ('𑶘', '𑶘'), ('𑻠', '𑻲'), ('𑼂', '𑼂'), ('𑼄', '𑼐'), ('𑼒', '𑼳'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('𓑁', '𓑆'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩰', '𖪾'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖼀', '𖽊'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '𖿣'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘴀', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𝼊', '𝼊'), ('𞄀', '𞄬'), ('𞄷', '𞄽'), ('𞅎', '𞅎'), ('𞊐', '𞊭'), ('𞋀', 
'𞋫'), ('𞓐', '𞓫'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('𞥋', '𞥋'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ]; pub const SCONTINUE: &'static [(char, char)] = &[ (',', '-'), (':', ':'), ('՝', '՝'), ('،', '؍'), ('߸', '߸'), ('᠂', '᠂'), ('᠈', '᠈'), ('–', '—'), ('、', '、'), ('︐', '︑'), ('︓', '︓'), ('︱', '︲'), ('﹐', '﹑'), ('﹕', '﹕'), ('﹘', '﹘'), ('﹣', '﹣'), (',', '-'), (':', ':'), ('、', '、'), ]; pub const STERM: &'static [(char, char)] = &[ ('!', '!'), ('?', '?'), ('։', '։'), ('؝', '؟'), ('۔', '۔'), ('܀', '܂'), ('߹', '߹'), ('࠷', '࠷'), ('࠹', '࠹'), ('࠽', '࠾'), ('।', '॥'), ('၊', '။'), ('።', '።'), ('፧', '፨'), ('᙮', '᙮'), ('᜵', '᜶'), ('᠃', '᠃'), ('᠉', '᠉'), ('᥄', '᥅'), ('᪨', '᪫'), ('᭚', '᭛'), ('᭞', '᭟'), ('᭽', '᭾'), ('᰻', '᰼'), ('᱾', '᱿'), ('‼', '‽'), ('⁇', '⁉'), ('⸮', '⸮'), ('⸼', '⸼'), ('⹓', '⹔'), ('。', '。'), ('꓿', '꓿'), ('꘎', '꘏'), ('꛳', '꛳'), ('꛷', '꛷'), ('꡶', '꡷'), ('꣎', '꣏'), ('꤯', '꤯'), ('꧈', '꧉'), ('꩝', '꩟'), ('꫰', '꫱'), ('꯫', '꯫'), ('﹖', '﹗'), ('!', '!'), ('?', '?'), ('。', '。'), ('𐩖', '𐩗'), ('𐽕', '𐽙'), ('𐾆', '𐾉'), ('𑁇', '𑁈'), ('𑂾', '𑃁'), ('𑅁', '𑅃'), ('𑇅', '𑇆'), ('𑇍', '𑇍'), ('𑇞', '𑇟'), ('𑈸', '𑈹'), ('𑈻', '𑈼'), ('𑊩', '𑊩'), ('𑑋', '𑑌'), ('𑗂', '𑗃'), ('𑗉', '𑗗'), ('𑙁', '𑙂'), ('𑜼', '𑜾'), ('𑥄', '𑥄'), ('𑥆', '𑥆'), ('𑩂', '𑩃'), ('𑪛', '𑪜'), ('𑱁', '𑱂'), ('𑻷', '𑻸'), ('𑽃', '𑽄'), ('𖩮', '𖩯'), ('𖫵', '𖫵'), ('𖬷', '𖬸'), ('𖭄', '𖭄'), ('𖺘', '𖺘'), ('𛲟', '𛲟'), ('𝪈', '𝪈'), ]; pub const SEP: &'static [(char, char)] = &[('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')]; pub const SP: &'static [(char, char)] = &[ ('\t', '\t'), ('\u{b}', 
'\u{c}'), (' ', ' '), ('\u{a0}', '\u{a0}'), ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'), ('\u{202f}', '\u{202f}'), ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), ]; pub const UPPER: &'static [(char, char)] = &[ ('A', 'Z'), ('À', 'Ö'), ('Ø', 'Þ'), ('Ā', 'Ā'), ('Ă', 'Ă'), ('Ą', 'Ą'), ('Ć', 'Ć'), ('Ĉ', 'Ĉ'), ('Ċ', 'Ċ'), ('Č', 'Č'), ('Ď', 'Ď'), ('Đ', 'Đ'), ('Ē', 'Ē'), ('Ĕ', 'Ĕ'), ('Ė', 'Ė'), ('Ę', 'Ę'), ('Ě', 'Ě'), ('Ĝ', 'Ĝ'), ('Ğ', 'Ğ'), ('Ġ', 'Ġ'), ('Ģ', 'Ģ'), ('Ĥ', 'Ĥ'), ('Ħ', 'Ħ'), ('Ĩ', 'Ĩ'), ('Ī', 'Ī'), ('Ĭ', 'Ĭ'), ('Į', 'Į'), ('İ', 'İ'), ('IJ', 'IJ'), ('Ĵ', 'Ĵ'), ('Ķ', 'Ķ'), ('Ĺ', 'Ĺ'), ('Ļ', 'Ļ'), ('Ľ', 'Ľ'), ('Ŀ', 'Ŀ'), ('Ł', 'Ł'), ('Ń', 'Ń'), ('Ņ', 'Ņ'), ('Ň', 'Ň'), ('Ŋ', 'Ŋ'), ('Ō', 'Ō'), ('Ŏ', 'Ŏ'), ('Ő', 'Ő'), ('Œ', 'Œ'), ('Ŕ', 'Ŕ'), ('Ŗ', 'Ŗ'), ('Ř', 'Ř'), ('Ś', 'Ś'), ('Ŝ', 'Ŝ'), ('Ş', 'Ş'), ('Š', 'Š'), ('Ţ', 'Ţ'), ('Ť', 'Ť'), ('Ŧ', 'Ŧ'), ('Ũ', 'Ũ'), ('Ū', 'Ū'), ('Ŭ', 'Ŭ'), ('Ů', 'Ů'), ('Ű', 'Ű'), ('Ų', 'Ų'), ('Ŵ', 'Ŵ'), ('Ŷ', 'Ŷ'), ('Ÿ', 'Ź'), ('Ż', 'Ż'), ('Ž', 'Ž'), ('Ɓ', 'Ƃ'), ('Ƅ', 'Ƅ'), ('Ɔ', 'Ƈ'), ('Ɖ', 'Ƌ'), ('Ǝ', 'Ƒ'), ('Ɠ', 'Ɣ'), ('Ɩ', 'Ƙ'), ('Ɯ', 'Ɲ'), ('Ɵ', 'Ơ'), ('Ƣ', 'Ƣ'), ('Ƥ', 'Ƥ'), ('Ʀ', 'Ƨ'), ('Ʃ', 'Ʃ'), ('Ƭ', 'Ƭ'), ('Ʈ', 'Ư'), ('Ʊ', 'Ƴ'), ('Ƶ', 'Ƶ'), ('Ʒ', 'Ƹ'), ('Ƽ', 'Ƽ'), ('DŽ', 'Dž'), ('LJ', 'Lj'), ('NJ', 'Nj'), ('Ǎ', 'Ǎ'), ('Ǐ', 'Ǐ'), ('Ǒ', 'Ǒ'), ('Ǔ', 'Ǔ'), ('Ǖ', 'Ǖ'), ('Ǘ', 'Ǘ'), ('Ǚ', 'Ǚ'), ('Ǜ', 'Ǜ'), ('Ǟ', 'Ǟ'), ('Ǡ', 'Ǡ'), ('Ǣ', 'Ǣ'), ('Ǥ', 'Ǥ'), ('Ǧ', 'Ǧ'), ('Ǩ', 'Ǩ'), ('Ǫ', 'Ǫ'), ('Ǭ', 'Ǭ'), ('Ǯ', 'Ǯ'), ('DZ', 'Dz'), ('Ǵ', 'Ǵ'), ('Ƕ', 'Ǹ'), ('Ǻ', 'Ǻ'), ('Ǽ', 'Ǽ'), ('Ǿ', 'Ǿ'), ('Ȁ', 'Ȁ'), ('Ȃ', 'Ȃ'), ('Ȅ', 'Ȅ'), ('Ȇ', 'Ȇ'), ('Ȉ', 'Ȉ'), ('Ȋ', 'Ȋ'), ('Ȍ', 'Ȍ'), ('Ȏ', 'Ȏ'), ('Ȑ', 'Ȑ'), ('Ȓ', 'Ȓ'), ('Ȕ', 'Ȕ'), ('Ȗ', 'Ȗ'), ('Ș', 'Ș'), ('Ț', 'Ț'), ('Ȝ', 'Ȝ'), ('Ȟ', 'Ȟ'), ('Ƞ', 'Ƞ'), ('Ȣ', 'Ȣ'), ('Ȥ', 'Ȥ'), ('Ȧ', 'Ȧ'), ('Ȩ', 'Ȩ'), ('Ȫ', 'Ȫ'), ('Ȭ', 'Ȭ'), ('Ȯ', 'Ȯ'), ('Ȱ', 'Ȱ'), ('Ȳ', 'Ȳ'), ('Ⱥ', 'Ȼ'), ('Ƚ', 'Ⱦ'), ('Ɂ', 'Ɂ'), ('Ƀ', 'Ɇ'), ('Ɉ', 'Ɉ'), ('Ɋ', 'Ɋ'), ('Ɍ', 'Ɍ'), ('Ɏ', 'Ɏ'), ('Ͱ', 'Ͱ'), ('Ͳ', 'Ͳ'), ('Ͷ', 'Ͷ'), 
('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ώ'), ('Α', 'Ρ'), ('Σ', 'Ϋ'), ('Ϗ', 'Ϗ'), ('ϒ', 'ϔ'), ('Ϙ', 'Ϙ'), ('Ϛ', 'Ϛ'), ('Ϝ', 'Ϝ'), ('Ϟ', 'Ϟ'), ('Ϡ', 'Ϡ'), ('Ϣ', 'Ϣ'), ('Ϥ', 'Ϥ'), ('Ϧ', 'Ϧ'), ('Ϩ', 'Ϩ'), ('Ϫ', 'Ϫ'), ('Ϭ', 'Ϭ'), ('Ϯ', 'Ϯ'), ('ϴ', 'ϴ'), ('Ϸ', 'Ϸ'), ('Ϲ', 'Ϻ'), ('Ͻ', 'Я'), ('Ѡ', 'Ѡ'), ('Ѣ', 'Ѣ'), ('Ѥ', 'Ѥ'), ('Ѧ', 'Ѧ'), ('Ѩ', 'Ѩ'), ('Ѫ', 'Ѫ'), ('Ѭ', 'Ѭ'), ('Ѯ', 'Ѯ'), ('Ѱ', 'Ѱ'), ('Ѳ', 'Ѳ'), ('Ѵ', 'Ѵ'), ('Ѷ', 'Ѷ'), ('Ѹ', 'Ѹ'), ('Ѻ', 'Ѻ'), ('Ѽ', 'Ѽ'), ('Ѿ', 'Ѿ'), ('Ҁ', 'Ҁ'), ('Ҋ', 'Ҋ'), ('Ҍ', 'Ҍ'), ('Ҏ', 'Ҏ'), ('Ґ', 'Ґ'), ('Ғ', 'Ғ'), ('Ҕ', 'Ҕ'), ('Җ', 'Җ'), ('Ҙ', 'Ҙ'), ('Қ', 'Қ'), ('Ҝ', 'Ҝ'), ('Ҟ', 'Ҟ'), ('Ҡ', 'Ҡ'), ('Ң', 'Ң'), ('Ҥ', 'Ҥ'), ('Ҧ', 'Ҧ'), ('Ҩ', 'Ҩ'), ('Ҫ', 'Ҫ'), ('Ҭ', 'Ҭ'), ('Ү', 'Ү'), ('Ұ', 'Ұ'), ('Ҳ', 'Ҳ'), ('Ҵ', 'Ҵ'), ('Ҷ', 'Ҷ'), ('Ҹ', 'Ҹ'), ('Һ', 'Һ'), ('Ҽ', 'Ҽ'), ('Ҿ', 'Ҿ'), ('Ӏ', 'Ӂ'), ('Ӄ', 'Ӄ'), ('Ӆ', 'Ӆ'), ('Ӈ', 'Ӈ'), ('Ӊ', 'Ӊ'), ('Ӌ', 'Ӌ'), ('Ӎ', 'Ӎ'), ('Ӑ', 'Ӑ'), ('Ӓ', 'Ӓ'), ('Ӕ', 'Ӕ'), ('Ӗ', 'Ӗ'), ('Ә', 'Ә'), ('Ӛ', 'Ӛ'), ('Ӝ', 'Ӝ'), ('Ӟ', 'Ӟ'), ('Ӡ', 'Ӡ'), ('Ӣ', 'Ӣ'), ('Ӥ', 'Ӥ'), ('Ӧ', 'Ӧ'), ('Ө', 'Ө'), ('Ӫ', 'Ӫ'), ('Ӭ', 'Ӭ'), ('Ӯ', 'Ӯ'), ('Ӱ', 'Ӱ'), ('Ӳ', 'Ӳ'), ('Ӵ', 'Ӵ'), ('Ӷ', 'Ӷ'), ('Ӹ', 'Ӹ'), ('Ӻ', 'Ӻ'), ('Ӽ', 'Ӽ'), ('Ӿ', 'Ӿ'), ('Ԁ', 'Ԁ'), ('Ԃ', 'Ԃ'), ('Ԅ', 'Ԅ'), ('Ԇ', 'Ԇ'), ('Ԉ', 'Ԉ'), ('Ԋ', 'Ԋ'), ('Ԍ', 'Ԍ'), ('Ԏ', 'Ԏ'), ('Ԑ', 'Ԑ'), ('Ԓ', 'Ԓ'), ('Ԕ', 'Ԕ'), ('Ԗ', 'Ԗ'), ('Ԙ', 'Ԙ'), ('Ԛ', 'Ԛ'), ('Ԝ', 'Ԝ'), ('Ԟ', 'Ԟ'), ('Ԡ', 'Ԡ'), ('Ԣ', 'Ԣ'), ('Ԥ', 'Ԥ'), ('Ԧ', 'Ԧ'), ('Ԩ', 'Ԩ'), ('Ԫ', 'Ԫ'), ('Ԭ', 'Ԭ'), ('Ԯ', 'Ԯ'), ('Ա', 'Ֆ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('Ꭰ', 'Ᏽ'), ('Ḁ', 'Ḁ'), ('Ḃ', 'Ḃ'), ('Ḅ', 'Ḅ'), ('Ḇ', 'Ḇ'), ('Ḉ', 'Ḉ'), ('Ḋ', 'Ḋ'), ('Ḍ', 'Ḍ'), ('Ḏ', 'Ḏ'), ('Ḑ', 'Ḑ'), ('Ḓ', 'Ḓ'), ('Ḕ', 'Ḕ'), ('Ḗ', 'Ḗ'), ('Ḙ', 'Ḙ'), ('Ḛ', 'Ḛ'), ('Ḝ', 'Ḝ'), ('Ḟ', 'Ḟ'), ('Ḡ', 'Ḡ'), ('Ḣ', 'Ḣ'), ('Ḥ', 'Ḥ'), ('Ḧ', 'Ḧ'), ('Ḩ', 'Ḩ'), ('Ḫ', 'Ḫ'), ('Ḭ', 'Ḭ'), ('Ḯ', 'Ḯ'), ('Ḱ', 'Ḱ'), ('Ḳ', 'Ḳ'), ('Ḵ', 'Ḵ'), ('Ḷ', 'Ḷ'), ('Ḹ', 'Ḹ'), ('Ḻ', 'Ḻ'), ('Ḽ', 'Ḽ'), ('Ḿ', 'Ḿ'), ('Ṁ', 'Ṁ'), ('Ṃ', 'Ṃ'), ('Ṅ', 'Ṅ'), ('Ṇ', 'Ṇ'), ('Ṉ', 'Ṉ'), ('Ṋ', 
'Ṋ'), ('Ṍ', 'Ṍ'), ('Ṏ', 'Ṏ'), ('Ṑ', 'Ṑ'), ('Ṓ', 'Ṓ'), ('Ṕ', 'Ṕ'), ('Ṗ', 'Ṗ'), ('Ṙ', 'Ṙ'), ('Ṛ', 'Ṛ'), ('Ṝ', 'Ṝ'), ('Ṟ', 'Ṟ'), ('Ṡ', 'Ṡ'), ('Ṣ', 'Ṣ'), ('Ṥ', 'Ṥ'), ('Ṧ', 'Ṧ'), ('Ṩ', 'Ṩ'), ('Ṫ', 'Ṫ'), ('Ṭ', 'Ṭ'), ('Ṯ', 'Ṯ'), ('Ṱ', 'Ṱ'), ('Ṳ', 'Ṳ'), ('Ṵ', 'Ṵ'), ('Ṷ', 'Ṷ'), ('Ṹ', 'Ṹ'), ('Ṻ', 'Ṻ'), ('Ṽ', 'Ṽ'), ('Ṿ', 'Ṿ'), ('Ẁ', 'Ẁ'), ('Ẃ', 'Ẃ'), ('Ẅ', 'Ẅ'), ('Ẇ', 'Ẇ'), ('Ẉ', 'Ẉ'), ('Ẋ', 'Ẋ'), ('Ẍ', 'Ẍ'), ('Ẏ', 'Ẏ'), ('Ẑ', 'Ẑ'), ('Ẓ', 'Ẓ'), ('Ẕ', 'Ẕ'), ('ẞ', 'ẞ'), ('Ạ', 'Ạ'), ('Ả', 'Ả'), ('Ấ', 'Ấ'), ('Ầ', 'Ầ'), ('Ẩ', 'Ẩ'), ('Ẫ', 'Ẫ'), ('Ậ', 'Ậ'), ('Ắ', 'Ắ'), ('Ằ', 'Ằ'), ('Ẳ', 'Ẳ'), ('Ẵ', 'Ẵ'), ('Ặ', 'Ặ'), ('Ẹ', 'Ẹ'), ('Ẻ', 'Ẻ'), ('Ẽ', 'Ẽ'), ('Ế', 'Ế'), ('Ề', 'Ề'), ('Ể', 'Ể'), ('Ễ', 'Ễ'), ('Ệ', 'Ệ'), ('Ỉ', 'Ỉ'), ('Ị', 'Ị'), ('Ọ', 'Ọ'), ('Ỏ', 'Ỏ'), ('Ố', 'Ố'), ('Ồ', 'Ồ'), ('Ổ', 'Ổ'), ('Ỗ', 'Ỗ'), ('Ộ', 'Ộ'), ('Ớ', 'Ớ'), ('Ờ', 'Ờ'), ('Ở', 'Ở'), ('Ỡ', 'Ỡ'), ('Ợ', 'Ợ'), ('Ụ', 'Ụ'), ('Ủ', 'Ủ'), ('Ứ', 'Ứ'), ('Ừ', 'Ừ'), ('Ử', 'Ử'), ('Ữ', 'Ữ'), ('Ự', 'Ự'), ('Ỳ', 'Ỳ'), ('Ỵ', 'Ỵ'), ('Ỷ', 'Ỷ'), ('Ỹ', 'Ỹ'), ('Ỻ', 'Ỻ'), ('Ỽ', 'Ỽ'), ('Ỿ', 'Ỿ'), ('Ἀ', 'Ἇ'), ('Ἐ', 'Ἕ'), ('Ἠ', 'Ἧ'), ('Ἰ', 'Ἷ'), ('Ὀ', 'Ὅ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'Ὗ'), ('Ὠ', 'Ὧ'), ('ᾈ', 'ᾏ'), ('ᾘ', 'ᾟ'), ('ᾨ', 'ᾯ'), ('Ᾰ', 'ᾼ'), ('Ὲ', 'ῌ'), ('Ῐ', 'Ί'), ('Ῠ', 'Ῥ'), ('Ὸ', 'ῼ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℋ', 'ℍ'), ('ℐ', 'ℒ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℰ', 'ℳ'), ('ℾ', 'ℿ'), ('ⅅ', 'ⅅ'), ('Ⅰ', 'Ⅿ'), ('Ↄ', 'Ↄ'), ('Ⓐ', 'Ⓩ'), ('Ⰰ', 'Ⱟ'), ('Ⱡ', 'Ⱡ'), ('Ɫ', 'Ɽ'), ('Ⱨ', 'Ⱨ'), ('Ⱪ', 'Ⱪ'), ('Ⱬ', 'Ⱬ'), ('Ɑ', 'Ɒ'), ('Ⱳ', 'Ⱳ'), ('Ⱶ', 'Ⱶ'), ('Ȿ', 'Ⲁ'), ('Ⲃ', 'Ⲃ'), ('Ⲅ', 'Ⲅ'), ('Ⲇ', 'Ⲇ'), ('Ⲉ', 'Ⲉ'), ('Ⲋ', 'Ⲋ'), ('Ⲍ', 'Ⲍ'), ('Ⲏ', 'Ⲏ'), ('Ⲑ', 'Ⲑ'), ('Ⲓ', 'Ⲓ'), ('Ⲕ', 'Ⲕ'), ('Ⲗ', 'Ⲗ'), ('Ⲙ', 'Ⲙ'), ('Ⲛ', 'Ⲛ'), ('Ⲝ', 'Ⲝ'), ('Ⲟ', 'Ⲟ'), ('Ⲡ', 'Ⲡ'), ('Ⲣ', 'Ⲣ'), ('Ⲥ', 'Ⲥ'), ('Ⲧ', 'Ⲧ'), ('Ⲩ', 'Ⲩ'), ('Ⲫ', 'Ⲫ'), ('Ⲭ', 'Ⲭ'), ('Ⲯ', 'Ⲯ'), ('Ⲱ', 'Ⲱ'), ('Ⲳ', 'Ⲳ'), ('Ⲵ', 'Ⲵ'), ('Ⲷ', 'Ⲷ'), ('Ⲹ', 'Ⲹ'), ('Ⲻ', 'Ⲻ'), ('Ⲽ', 'Ⲽ'), ('Ⲿ', 'Ⲿ'), ('Ⳁ', 'Ⳁ'), ('Ⳃ', 'Ⳃ'), ('Ⳅ', 'Ⳅ'), ('Ⳇ', 'Ⳇ'), ('Ⳉ', 'Ⳉ'), 
('Ⳋ', 'Ⳋ'), ('Ⳍ', 'Ⳍ'), ('Ⳏ', 'Ⳏ'), ('Ⳑ', 'Ⳑ'), ('Ⳓ', 'Ⳓ'), ('Ⳕ', 'Ⳕ'), ('Ⳗ', 'Ⳗ'), ('Ⳙ', 'Ⳙ'), ('Ⳛ', 'Ⳛ'), ('Ⳝ', 'Ⳝ'), ('Ⳟ', 'Ⳟ'), ('Ⳡ', 'Ⳡ'), ('Ⳣ', 'Ⳣ'), ('Ⳬ', 'Ⳬ'), ('Ⳮ', 'Ⳮ'), ('Ⳳ', 'Ⳳ'), ('Ꙁ', 'Ꙁ'), ('Ꙃ', 'Ꙃ'), ('Ꙅ', 'Ꙅ'), ('Ꙇ', 'Ꙇ'), ('Ꙉ', 'Ꙉ'), ('Ꙋ', 'Ꙋ'), ('Ꙍ', 'Ꙍ'), ('Ꙏ', 'Ꙏ'), ('Ꙑ', 'Ꙑ'), ('Ꙓ', 'Ꙓ'), ('Ꙕ', 'Ꙕ'), ('Ꙗ', 'Ꙗ'), ('Ꙙ', 'Ꙙ'), ('Ꙛ', 'Ꙛ'), ('Ꙝ', 'Ꙝ'), ('Ꙟ', 'Ꙟ'), ('Ꙡ', 'Ꙡ'), ('Ꙣ', 'Ꙣ'), ('Ꙥ', 'Ꙥ'), ('Ꙧ', 'Ꙧ'), ('Ꙩ', 'Ꙩ'), ('Ꙫ', 'Ꙫ'), ('Ꙭ', 'Ꙭ'), ('Ꚁ', 'Ꚁ'), ('Ꚃ', 'Ꚃ'), ('Ꚅ', 'Ꚅ'), ('Ꚇ', 'Ꚇ'), ('Ꚉ', 'Ꚉ'), ('Ꚋ', 'Ꚋ'), ('Ꚍ', 'Ꚍ'), ('Ꚏ', 'Ꚏ'), ('Ꚑ', 'Ꚑ'), ('Ꚓ', 'Ꚓ'), ('Ꚕ', 'Ꚕ'), ('Ꚗ', 'Ꚗ'), ('Ꚙ', 'Ꚙ'), ('Ꚛ', 'Ꚛ'), ('Ꜣ', 'Ꜣ'), ('Ꜥ', 'Ꜥ'), ('Ꜧ', 'Ꜧ'), ('Ꜩ', 'Ꜩ'), ('Ꜫ', 'Ꜫ'), ('Ꜭ', 'Ꜭ'), ('Ꜯ', 'Ꜯ'), ('Ꜳ', 'Ꜳ'), ('Ꜵ', 'Ꜵ'), ('Ꜷ', 'Ꜷ'), ('Ꜹ', 'Ꜹ'), ('Ꜻ', 'Ꜻ'), ('Ꜽ', 'Ꜽ'), ('Ꜿ', 'Ꜿ'), ('Ꝁ', 'Ꝁ'), ('Ꝃ', 'Ꝃ'), ('Ꝅ', 'Ꝅ'), ('Ꝇ', 'Ꝇ'), ('Ꝉ', 'Ꝉ'), ('Ꝋ', 'Ꝋ'), ('Ꝍ', 'Ꝍ'), ('Ꝏ', 'Ꝏ'), ('Ꝑ', 'Ꝑ'), ('Ꝓ', 'Ꝓ'), ('Ꝕ', 'Ꝕ'), ('Ꝗ', 'Ꝗ'), ('Ꝙ', 'Ꝙ'), ('Ꝛ', 'Ꝛ'), ('Ꝝ', 'Ꝝ'), ('Ꝟ', 'Ꝟ'), ('Ꝡ', 'Ꝡ'), ('Ꝣ', 'Ꝣ'), ('Ꝥ', 'Ꝥ'), ('Ꝧ', 'Ꝧ'), ('Ꝩ', 'Ꝩ'), ('Ꝫ', 'Ꝫ'), ('Ꝭ', 'Ꝭ'), ('Ꝯ', 'Ꝯ'), ('Ꝺ', 'Ꝺ'), ('Ꝼ', 'Ꝼ'), ('Ᵹ', 'Ꝿ'), ('Ꞁ', 'Ꞁ'), ('Ꞃ', 'Ꞃ'), ('Ꞅ', 'Ꞅ'), ('Ꞇ', 'Ꞇ'), ('Ꞌ', 'Ꞌ'), ('Ɥ', 'Ɥ'), ('Ꞑ', 'Ꞑ'), ('Ꞓ', 'Ꞓ'), ('Ꞗ', 'Ꞗ'), ('Ꞙ', 'Ꞙ'), ('Ꞛ', 'Ꞛ'), ('Ꞝ', 'Ꞝ'), ('Ꞟ', 'Ꞟ'), ('Ꞡ', 'Ꞡ'), ('Ꞣ', 'Ꞣ'), ('Ꞥ', 'Ꞥ'), ('Ꞧ', 'Ꞧ'), ('Ꞩ', 'Ꞩ'), ('Ɦ', 'Ɪ'), ('Ʞ', 'Ꞵ'), ('Ꞷ', 'Ꞷ'), ('Ꞹ', 'Ꞹ'), ('Ꞻ', 'Ꞻ'), ('Ꞽ', 'Ꞽ'), ('Ꞿ', 'Ꞿ'), ('Ꟁ', 'Ꟁ'), ('Ꟃ', 'Ꟃ'), ('Ꞔ', 'Ꟈ'), ('Ꟊ', 'Ꟊ'), ('Ꟑ', 'Ꟑ'), ('Ꟗ', 'Ꟗ'), ('Ꟙ', 'Ꟙ'), ('Ꟶ', 'Ꟶ'), ('A', 'Z'), ('𐐀', '𐐧'), ('𐒰', '𐓓'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐲀', '𐲲'), ('𑢠', '𑢿'), ('𖹀', '𖹟'), ('𝐀', '𝐙'), ('𝐴', '𝑍'), ('𝑨', '𝒁'), ('𝒜', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒵'), ('𝓐', '𝓩'), ('𝔄', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔸', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕬', '𝖅'), ('𝖠', '𝖹'), ('𝗔', '𝗭'), ('𝘈', '𝘡'), ('𝘼', '𝙕'), ('𝙰', '𝚉'), ('𝚨', '𝛀'), ('𝛢', '𝛺'), ('𝜜', '𝜴'), ('𝝖', '𝝮'), ('𝞐', 
'𝞨'), ('𝟊', '𝟊'), ('𞤀', '𞤡'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), ]; regex-syntax-0.8.2/src/unicode_tables/word_break.rs000064400000000000000000000626761046102023000205460ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate word-break ucd-15.0.0 --chars // // Unicode version: 15.0.0. // // ucd-generate 0.2.14 is available on crates.io. pub const BY_NAME: &'static [(&'static str, &'static [(char, char)])] = &[ ("ALetter", ALETTER), ("CR", CR), ("Double_Quote", DOUBLE_QUOTE), ("Extend", EXTEND), ("ExtendNumLet", EXTENDNUMLET), ("Format", FORMAT), ("Hebrew_Letter", HEBREW_LETTER), ("Katakana", KATAKANA), ("LF", LF), ("MidLetter", MIDLETTER), ("MidNum", MIDNUM), ("MidNumLet", MIDNUMLET), ("Newline", NEWLINE), ("Numeric", NUMERIC), ("Regional_Indicator", REGIONAL_INDICATOR), ("Single_Quote", SINGLE_QUOTE), ("WSegSpace", WSEGSPACE), ("ZWJ", ZWJ), ]; pub const ALETTER: &'static [(char, char)] = &[ ('A', 'Z'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', '˗'), ('˞', '˿'), ('Ͱ', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('Ҋ', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', '՜'), ('՞', '՞'), ('ՠ', 'ֈ'), ('֊', '֊'), ('׳', '׳'), ('ؠ', 'ي'), ('ٮ', 'ٯ'), ('ٱ', 'ۓ'), ('ە', 'ە'), ('ۥ', 'ۦ'), ('ۮ', 'ۯ'), ('ۺ', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', 'ܐ'), ('ܒ', 'ܯ'), ('ݍ', 'ޥ'), ('ޱ', 'ޱ'), ('ߊ', 'ߪ'), ('ߴ', 'ߵ'), ('ߺ', 'ߺ'), ('ࠀ', 'ࠕ'), ('ࠚ', 'ࠚ'), ('ࠤ', 'ࠤ'), ('ࠨ', 'ࠨ'), ('ࡀ', 'ࡘ'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('ࢠ', 'ࣉ'), ('ऄ', 'ह'), ('ऽ', 'ऽ'), ('ॐ', 'ॐ'), ('क़', 'ॡ'), ('ॱ', 'ঀ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('ঽ', 'ঽ'), ('ৎ', 'ৎ'), ('ড়', 'ঢ়'), ('য়', 'ৡ'), ('ৰ', 'ৱ'), ('ৼ', 'ৼ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), ('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('ੲ', 'ੴ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('ઽ', 'ઽ'), ('ૐ', 'ૐ'), ('ૠ', 'ૡ'), ('ૹ', 
'ૹ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('ଽ', 'ଽ'), ('ଡ଼', 'ଢ଼'), ('ୟ', 'ୡ'), ('ୱ', 'ୱ'), ('ஃ', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('ௐ', 'ௐ'), ('అ', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('ఽ', 'ఽ'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', 'ౡ'), ('ಀ', 'ಀ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('ಽ', 'ಽ'), ('ೝ', 'ೞ'), ('ೠ', 'ೡ'), ('ೱ', 'ೲ'), ('ഄ', 'ഌ'), ('എ', 'ഐ'), ('ഒ', 'ഺ'), ('ഽ', 'ഽ'), ('ൎ', 'ൎ'), ('ൔ', 'ൖ'), ('ൟ', 'ൡ'), ('ൺ', 'ൿ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('ༀ', 'ༀ'), ('ཀ', 'ཇ'), ('ཉ', 'ཬ'), ('ྈ', 'ྌ'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', 'ᜑ'), ('ᜟ', 'ᜱ'), ('ᝀ', 'ᝑ'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢄ'), ('ᢇ', 'ᢨ'), ('ᢪ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('ᨀ', 'ᨖ'), ('ᬅ', 'ᬳ'), ('ᭅ', 'ᭌ'), ('ᮃ', 'ᮠ'), ('ᮮ', 'ᮯ'), ('ᮺ', 'ᯥ'), ('ᰀ', 'ᰣ'), ('ᱍ', 'ᱏ'), ('ᱚ', 'ᱽ'), ('ᲀ', 'ᲈ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('ᳩ', 'ᳬ'), ('ᳮ', 'ᳳ'), ('ᳵ', 'ᳶ'), ('ᳺ', 'ᳺ'), ('ᴀ', 'ᶿ'), ('Ḁ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳮ'), ('Ⳳ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('ⶀ', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), 
('ⸯ', 'ⸯ'), ('々', '々'), ('〻', '〼'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ꀀ', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘟ'), ('ꘪ', 'ꘫ'), ('Ꙁ', 'ꙮ'), ('ꙿ', 'ꚝ'), ('ꚠ', 'ꛯ'), ('꜈', 'ꟊ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'ꟙ'), ('ꟲ', 'ꠁ'), ('ꠃ', 'ꠅ'), ('ꠇ', 'ꠊ'), ('ꠌ', 'ꠢ'), ('ꡀ', 'ꡳ'), ('ꢂ', 'ꢳ'), ('ꣲ', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', 'ꣾ'), ('ꤊ', 'ꤥ'), ('ꤰ', 'ꥆ'), ('ꥠ', 'ꥼ'), ('ꦄ', 'ꦲ'), ('ꧏ', 'ꧏ'), ('ꨀ', 'ꨨ'), ('ꩀ', 'ꩂ'), ('ꩄ', 'ꩋ'), ('ꫠ', 'ꫪ'), ('ꫲ', 'ꫴ'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭩ'), ('ꭰ', 'ꯢ'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('ﭐ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('A', 'Z'), ('a', 'z'), ('ᅠ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '𐍵'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '𐨀'), ('𐨐', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), ('𐫀', '𐫇'), ('𐫉', '𐫤'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '𐴣'), ('𐺀', '𐺩'), ('𐺰', '𐺱'), ('𐼀', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '𐽅'), ('𐽰', '𐾁'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀃', '𑀷'), ('𑁱', '𑁲'), ('𑁵', '𑁵'), ('𑂃', '𑂯'), ('𑃐', '𑃨'), ('𑄃', '𑄦'), ('𑅄', '𑅄'), ('𑅇', '𑅇'), ('𑅐', '𑅲'), ('𑅶', '𑅶'), ('𑆃', '𑆲'), ('𑇁', '𑇄'), ('𑇚', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '𑈫'), ('𑈿', '𑉀'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '𑋞'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('𑌽', 
'𑌽'), ('𑍐', '𑍐'), ('𑍝', '𑍡'), ('𑐀', '𑐴'), ('𑑇', '𑑊'), ('𑑟', '𑑡'), ('𑒀', '𑒯'), ('𑓄', '𑓅'), ('𑓇', '𑓇'), ('𑖀', '𑖮'), ('𑗘', '𑗛'), ('𑘀', '𑘯'), ('𑙄', '𑙄'), ('𑚀', '𑚪'), ('𑚸', '𑚸'), ('𑠀', '𑠫'), ('𑢠', '𑣟'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤯'), ('𑤿', '𑤿'), ('𑥁', '𑥁'), ('𑦠', '𑦧'), ('𑦪', '𑧐'), ('𑧡', '𑧡'), ('𑧣', '𑧣'), ('𑨀', '𑨀'), ('𑨋', '𑨲'), ('𑨺', '𑨺'), ('𑩐', '𑩐'), ('𑩜', '𑪉'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑰀', '𑰈'), ('𑰊', '𑰮'), ('𑱀', '𑱀'), ('𑱲', '𑲏'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '𑴰'), ('𑵆', '𑵆'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶉'), ('𑶘', '𑶘'), ('𑻠', '𑻲'), ('𑼂', '𑼂'), ('𑼄', '𑼐'), ('𑼒', '𑼳'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('𓑁', '𓑆'), ('𔐀', '𔙆'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩰', '𖪾'), ('𖫐', '𖫭'), ('𖬀', '𖬯'), ('𖭀', '𖭃'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('𖽐', '𖽐'), ('𖾓', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '𖿣'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('𞀰', '𞁭'), ('𞄀', '𞄬'), ('𞄷', '𞄽'), ('𞅎', '𞅎'), ('𞊐', '𞊭'), ('𞋀', '𞋫'), ('𞓐', '𞓫'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('𞤀', '𞥃'), ('𞥋', '𞥋'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), ]; pub const CR: &'static [(char, char)] = &[('\r', '\r')]; pub const DOUBLE_QUOTE: &'static 
[(char, char)] = &[('"', '"')]; pub const EXTEND: &'static [(char, char)] = &[ ('\u{300}', '\u{36f}'), ('\u{483}', '\u{489}'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('\u{610}', '\u{61a}'), ('\u{64b}', '\u{65f}'), ('\u{670}', '\u{670}'), ('\u{6d6}', '\u{6dc}'), ('\u{6df}', '\u{6e4}'), ('\u{6e7}', '\u{6e8}'), ('\u{6ea}', '\u{6ed}'), ('\u{711}', '\u{711}'), ('\u{730}', '\u{74a}'), ('\u{7a6}', '\u{7b0}'), ('\u{7eb}', '\u{7f3}'), ('\u{7fd}', '\u{7fd}'), ('\u{816}', '\u{819}'), ('\u{81b}', '\u{823}'), ('\u{825}', '\u{827}'), ('\u{829}', '\u{82d}'), ('\u{859}', '\u{85b}'), ('\u{898}', '\u{89f}'), ('\u{8ca}', '\u{8e1}'), ('\u{8e3}', 'ः'), ('\u{93a}', '\u{93c}'), ('ा', 'ॏ'), ('\u{951}', '\u{957}'), ('\u{962}', '\u{963}'), ('\u{981}', 'ঃ'), ('\u{9bc}', '\u{9bc}'), ('\u{9be}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', '\u{9cd}'), ('\u{9d7}', '\u{9d7}'), ('\u{9e2}', '\u{9e3}'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', 'ਃ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('\u{a70}', '\u{a71}'), ('\u{a75}', '\u{a75}'), ('\u{a81}', 'ઃ'), ('\u{abc}', '\u{abc}'), ('ા', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('\u{ae2}', '\u{ae3}'), ('\u{afa}', '\u{aff}'), ('\u{b01}', 'ଃ'), ('\u{b3c}', '\u{b3c}'), ('\u{b3e}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('\u{b62}', '\u{b63}'), ('\u{b82}', '\u{b82}'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('\u{bd7}', '\u{bd7}'), ('\u{c00}', '\u{c04}'), ('\u{c3c}', '\u{c3c}'), ('\u{c3e}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('\u{c62}', '\u{c63}'), ('\u{c81}', 'ಃ'), ('\u{cbc}', '\u{cbc}'), ('ಾ', 'ೄ'), ('\u{cc6}', 'ೈ'), ('ೊ', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('\u{ce2}', '\u{ce3}'), ('ೳ', 'ೳ'), ('\u{d00}', 'ഃ'), ('\u{d3b}', '\u{d3c}'), ('\u{d3e}', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', '\u{d4d}'), ('\u{d57}', '\u{d57}'), ('\u{d62}', '\u{d63}'), 
('\u{d81}', 'ඃ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('ෲ', 'ෳ'), ('\u{e31}', '\u{e31}'), ('\u{e34}', '\u{e3a}'), ('\u{e47}', '\u{e4e}'), ('\u{eb1}', '\u{eb1}'), ('\u{eb4}', '\u{ebc}'), ('\u{ec8}', '\u{ece}'), ('\u{f18}', '\u{f19}'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('༾', '༿'), ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f87}'), ('\u{f8d}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), ('\u{fc6}', '\u{fc6}'), ('ါ', '\u{103e}'), ('ၖ', '\u{1059}'), ('\u{105e}', '\u{1060}'), ('ၢ', 'ၤ'), ('ၧ', 'ၭ'), ('\u{1071}', '\u{1074}'), ('\u{1082}', '\u{108d}'), ('ႏ', 'ႏ'), ('ႚ', '\u{109d}'), ('\u{135d}', '\u{135f}'), ('\u{1712}', '᜕'), ('\u{1732}', '᜴'), ('\u{1752}', '\u{1753}'), ('\u{1772}', '\u{1773}'), ('\u{17b4}', '\u{17d3}'), ('\u{17dd}', '\u{17dd}'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '\u{180f}'), ('\u{1885}', '\u{1886}'), ('\u{18a9}', '\u{18a9}'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('\u{1a17}', '\u{1a1b}'), ('ᩕ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '\u{1a7f}'), ('\u{1ab0}', '\u{1ace}'), ('\u{1b00}', 'ᬄ'), ('\u{1b34}', '᭄'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', 'ᮂ'), ('ᮡ', '\u{1bad}'), ('\u{1be6}', '᯳'), ('ᰤ', '\u{1c37}'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', '\u{1ce8}'), ('\u{1ced}', '\u{1ced}'), ('\u{1cf4}', '\u{1cf4}'), ('᳷', '\u{1cf9}'), ('\u{1dc0}', '\u{1dff}'), ('\u{200c}', '\u{200c}'), ('\u{20d0}', '\u{20f0}'), ('\u{2cef}', '\u{2cf1}'), ('\u{2d7f}', '\u{2d7f}'), ('\u{2de0}', '\u{2dff}'), ('\u{302a}', '\u{302f}'), ('\u{3099}', '\u{309a}'), ('\u{a66f}', '\u{a672}'), ('\u{a674}', '\u{a67d}'), ('\u{a69e}', '\u{a69f}'), ('\u{a6f0}', '\u{a6f1}'), ('\u{a802}', '\u{a802}'), ('\u{a806}', '\u{a806}'), ('\u{a80b}', '\u{a80b}'), ('ꠣ', 'ꠧ'), ('\u{a82c}', '\u{a82c}'), ('ꢀ', 'ꢁ'), ('ꢴ', '\u{a8c5}'), ('\u{a8e0}', '\u{a8f1}'), ('\u{a8ff}', '\u{a8ff}'), ('\u{a926}', '\u{a92d}'), ('\u{a947}', '꥓'), ('\u{a980}', 'ꦃ'), ('\u{a9b3}', '꧀'), ('\u{a9e5}', '\u{a9e5}'), ('\u{aa29}', 
'\u{aa36}'), ('\u{aa43}', '\u{aa43}'), ('\u{aa4c}', 'ꩍ'), ('ꩻ', 'ꩽ'), ('\u{aab0}', '\u{aab0}'), ('\u{aab2}', '\u{aab4}'), ('\u{aab7}', '\u{aab8}'), ('\u{aabe}', '\u{aabf}'), ('\u{aac1}', '\u{aac1}'), ('ꫫ', 'ꫯ'), ('ꫵ', '\u{aaf6}'), ('ꯣ', 'ꯪ'), ('꯬', '\u{abed}'), ('\u{fb1e}', '\u{fb1e}'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('\u{ff9e}', '\u{ff9f}'), ('\u{101fd}', '\u{101fd}'), ('\u{102e0}', '\u{102e0}'), ('\u{10376}', '\u{1037a}'), ('\u{10a01}', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '\u{10a0f}'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('\u{10ae5}', '\u{10ae6}'), ('\u{10d24}', '\u{10d27}'), ('\u{10eab}', '\u{10eac}'), ('\u{10efd}', '\u{10eff}'), ('\u{10f46}', '\u{10f50}'), ('\u{10f82}', '\u{10f85}'), ('𑀀', '𑀂'), ('\u{11038}', '\u{11046}'), ('\u{11070}', '\u{11070}'), ('\u{11073}', '\u{11074}'), ('\u{1107f}', '𑂂'), ('𑂰', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('\u{11100}', '\u{11102}'), ('\u{11127}', '\u{11134}'), ('𑅅', '𑅆'), ('\u{11173}', '\u{11173}'), ('\u{11180}', '𑆂'), ('𑆳', '𑇀'), ('\u{111c9}', '\u{111cc}'), ('𑇎', '\u{111cf}'), ('𑈬', '\u{11237}'), ('\u{1123e}', '\u{1123e}'), ('\u{11241}', '\u{11241}'), ('\u{112df}', '\u{112ea}'), ('\u{11300}', '𑌃'), ('\u{1133b}', '\u{1133c}'), ('\u{1133e}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '𑍍'), ('\u{11357}', '\u{11357}'), ('𑍢', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑐵', '\u{11446}'), ('\u{1145e}', '\u{1145e}'), ('\u{114b0}', '\u{114c3}'), ('\u{115af}', '\u{115b5}'), ('𑖸', '\u{115c0}'), ('\u{115dc}', '\u{115dd}'), ('𑘰', '\u{11640}'), ('\u{116ab}', '\u{116b7}'), ('\u{1171d}', '\u{1172b}'), ('𑠬', '\u{1183a}'), ('\u{11930}', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{1193e}'), ('𑥀', '𑥀'), ('𑥂', '\u{11943}'), ('𑧑', '\u{119d7}'), ('\u{119da}', '\u{119e0}'), ('𑧤', '𑧤'), ('\u{11a01}', '\u{11a0a}'), ('\u{11a33}', '𑨹'), ('\u{11a3b}', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('\u{11a51}', '\u{11a5b}'), ('\u{11a8a}', '\u{11a99}'), ('𑰯', '\u{11c36}'), ('\u{11c38}', '\u{11c3f}'), 
('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('\u{11d31}', '\u{11d36}'), ('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d45}'), ('\u{11d47}', '\u{11d47}'), ('𑶊', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '\u{11d97}'), ('\u{11ef3}', '𑻶'), ('\u{11f00}', '\u{11f01}'), ('𑼃', '𑼃'), ('𑼴', '\u{11f3a}'), ('𑼾', '\u{11f42}'), ('\u{13440}', '\u{13440}'), ('\u{13447}', '\u{13455}'), ('\u{16af0}', '\u{16af4}'), ('\u{16b30}', '\u{16b36}'), ('\u{16f4f}', '\u{16f4f}'), ('𖽑', '𖾇'), ('\u{16f8f}', '\u{16f92}'), ('\u{16fe4}', '\u{16fe4}'), ('𖿰', '𖿱'), ('\u{1bc9d}', '\u{1bc9e}'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d169}'), ('𝅭', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('\u{1e08f}', '\u{1e08f}'), ('\u{1e130}', '\u{1e136}'), ('\u{1e2ae}', '\u{1e2ae}'), ('\u{1e2ec}', '\u{1e2ef}'), ('\u{1e4ec}', '\u{1e4ef}'), ('\u{1e8d0}', '\u{1e8d6}'), ('\u{1e944}', '\u{1e94a}'), ('🏻', '🏿'), ('\u{e0020}', '\u{e007f}'), ('\u{e0100}', '\u{e01ef}'), ]; pub const EXTENDNUMLET: &'static [(char, char)] = &[ ('_', '_'), ('\u{202f}', '\u{202f}'), ('‿', '⁀'), ('⁔', '⁔'), ('︳', '︴'), ('﹍', '﹏'), ('_', '_'), ]; pub const FORMAT: &'static [(char, char)] = &[ ('\u{ad}', '\u{ad}'), ('\u{600}', '\u{605}'), ('\u{61c}', '\u{61c}'), ('\u{6dd}', '\u{6dd}'), ('\u{70f}', '\u{70f}'), ('\u{890}', '\u{891}'), ('\u{8e2}', '\u{8e2}'), ('\u{180e}', '\u{180e}'), ('\u{200e}', '\u{200f}'), ('\u{202a}', '\u{202e}'), ('\u{2060}', '\u{2064}'), ('\u{2066}', '\u{206f}'), ('\u{feff}', '\u{feff}'), ('\u{fff9}', '\u{fffb}'), ('\u{110bd}', '\u{110bd}'), ('\u{110cd}', '\u{110cd}'), 
('\u{13430}', '\u{1343f}'), ('\u{1bca0}', '\u{1bca3}'), ('\u{1d173}', '\u{1d17a}'), ('\u{e0001}', '\u{e0001}'), ]; pub const HEBREW_LETTER: &'static [(char, char)] = &[ ('א', 'ת'), ('ׯ', 'ײ'), ('יִ', 'יִ'), ('ײַ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﭏ'), ]; pub const KATAKANA: &'static [(char, char)] = &[ ('〱', '〵'), ('゛', '゜'), ('゠', 'ヺ'), ('ー', 'ヿ'), ('ㇰ', 'ㇿ'), ('㋐', '㋾'), ('㌀', '㍗'), ('ヲ', 'ン'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛀀'), ('𛄠', '𛄢'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ]; pub const LF: &'static [(char, char)] = &[('\n', '\n')]; pub const MIDLETTER: &'static [(char, char)] = &[ (':', ':'), ('·', '·'), ('·', '·'), ('՟', '՟'), ('״', '״'), ('‧', '‧'), ('︓', '︓'), ('﹕', '﹕'), (':', ':'), ]; pub const MIDNUM: &'static [(char, char)] = &[ (',', ','), (';', ';'), (';', ';'), ('։', '։'), ('،', '؍'), ('٬', '٬'), ('߸', '߸'), ('⁄', '⁄'), ('︐', '︐'), ('︔', '︔'), ('﹐', '﹐'), ('﹔', '﹔'), (',', ','), (';', ';'), ]; pub const MIDNUMLET: &'static [(char, char)] = &[ ('.', '.'), ('‘', '’'), ('․', '․'), ('﹒', '﹒'), (''', '''), ('.', '.'), ]; pub const NEWLINE: &'static [(char, char)] = &[('\u{b}', '\u{c}'), ('\u{85}', '\u{85}'), ('\u{2028}', '\u{2029}')]; pub const NUMERIC: &'static [(char, char)] = &[ ('0', '9'), ('٠', '٩'), ('٫', '٫'), ('۰', '۹'), ('߀', '߉'), ('०', '९'), ('০', '৯'), ('੦', '੯'), ('૦', '૯'), ('୦', '୯'), ('௦', '௯'), ('౦', '౯'), ('೦', '೯'), ('൦', '൯'), ('෦', '෯'), ('๐', '๙'), ('໐', '໙'), ('༠', '༩'), ('၀', '၉'), ('႐', '႙'), ('០', '៩'), ('᠐', '᠙'), ('᥆', '᥏'), ('᧐', '᧙'), ('᪀', '᪉'), ('᪐', '᪙'), ('᭐', '᭙'), ('᮰', '᮹'), ('᱀', '᱉'), ('᱐', '᱙'), ('꘠', '꘩'), ('꣐', '꣙'), ('꤀', '꤉'), ('꧐', '꧙'), ('꧰', '꧹'), ('꩐', '꩙'), ('꯰', '꯹'), ('0', '9'), ('𐒠', '𐒩'), ('𐴰', '𐴹'), ('𑁦', '𑁯'), ('𑃰', '𑃹'), ('𑄶', '𑄿'), ('𑇐', '𑇙'), ('𑋰', '𑋹'), ('𑑐', '𑑙'), ('𑓐', '𑓙'), ('𑙐', '𑙙'), ('𑛀', '𑛉'), ('𑜰', '𑜹'), ('𑣠', '𑣩'), ('𑥐', '𑥙'), ('𑱐', '𑱙'), ('𑵐', '𑵙'), ('𑶠', '𑶩'), ('𑽐', '𑽙'), ('𖩠', '𖩩'), ('𖫀', '𖫉'), ('𖭐', '𖭙'), ('𝟎', '𝟿'), ('𞅀', '𞅉'), 
('𞋰', '𞋹'), ('𞓰', '𞓹'), ('𞥐', '𞥙'), ('🯰', '🯹'), ]; pub const REGIONAL_INDICATOR: &'static [(char, char)] = &[('🇦', '🇿')]; pub const SINGLE_QUOTE: &'static [(char, char)] = &[('\'', '\'')]; pub const WSEGSPACE: &'static [(char, char)] = &[ (' ', ' '), ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{2006}'), ('\u{2008}', '\u{200a}'), ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'), ]; pub const ZWJ: &'static [(char, char)] = &[('\u{200d}', '\u{200d}')]; regex-syntax-0.8.2/src/utf8.rs000064400000000000000000000450251046102023000143220ustar 00000000000000/*! Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes. This is sub-module is useful for constructing byte based automatons that need to embed UTF-8 decoding. The most common use of this module is in conjunction with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type. See the documentation on the `Utf8Sequences` iterator for more details and an example. # Wait, what is this? This is simplest to explain with an example. Let's say you wanted to test whether a particular byte sequence was a Cyrillic character. One possible scalar value range is `[0400-04FF]`. The set of allowed bytes for this range can be expressed as a sequence of byte ranges: ```text [D0-D3][80-BF] ``` This is simple enough: simply encode the boundaries, `0400` encodes to `D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each corresponding pair of bytes: `D0` to `D3` and `80` to `BF`. However, what if you wanted to add the Cyrillic Supplementary characters to your range? Your range might then become `[0400-052F]`. The same procedure as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges you'd get from the previous transformation would be `[D0-D4][80-AF]`. However, this isn't quite correct because this range doesn't capture many characters, for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`). 
Instead, you need multiple sequences of byte ranges: ```text [D0-D3][80-BF] # matches codepoints 0400-04FF [D4][80-AF] # matches codepoints 0500-052F ``` This gets even more complicated if you want bigger ranges, particularly if they naively contain surrogate codepoints. For example, the sequence of byte ranges for the basic multilingual plane (`[0000-FFFF]`) look like this: ```text [0-7F] [C2-DF][80-BF] [E0][A0-BF][80-BF] [E1-EC][80-BF][80-BF] [ED][80-9F][80-BF] [EE-EF][80-BF][80-BF] ``` Note that the byte ranges above will *not* match any erroneous encoding of UTF-8, including encodings of surrogate codepoints. And, of course, for all of Unicode (`[000000-10FFFF]`): ```text [0-7F] [C2-DF][80-BF] [E0][A0-BF][80-BF] [E1-EC][80-BF][80-BF] [ED][80-9F][80-BF] [EE-EF][80-BF][80-BF] [F0][90-BF][80-BF][80-BF] [F1-F3][80-BF][80-BF][80-BF] [F4][80-8F][80-BF][80-BF] ``` This module automates the process of creating these byte ranges from ranges of Unicode scalar values. # Lineage I got the idea and general implementation strategy from Russ Cox in his [article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2. Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?). I also got the idea from [Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java), which uses it for executing automata on their term index. */ use core::{char, fmt, iter::FusedIterator, slice}; use alloc::{vec, vec::Vec}; const MAX_UTF8_BYTES: usize = 4; /// Utf8Sequence represents a sequence of byte ranges. /// /// To match a Utf8Sequence, a candidate byte sequence must match each /// successive range. /// /// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte /// sequence `\xDD\x61` would not match because `0x61 < 0x80`. #[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)] pub enum Utf8Sequence { /// One byte range. 
One(Utf8Range), /// Two successive byte ranges. Two([Utf8Range; 2]), /// Three successive byte ranges. Three([Utf8Range; 3]), /// Four successive byte ranges. Four([Utf8Range; 4]), } impl Utf8Sequence { /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value /// range. /// /// This assumes that `start` and `end` have the same length. fn from_encoded_range(start: &[u8], end: &[u8]) -> Self { assert_eq!(start.len(), end.len()); match start.len() { 2 => Utf8Sequence::Two([ Utf8Range::new(start[0], end[0]), Utf8Range::new(start[1], end[1]), ]), 3 => Utf8Sequence::Three([ Utf8Range::new(start[0], end[0]), Utf8Range::new(start[1], end[1]), Utf8Range::new(start[2], end[2]), ]), 4 => Utf8Sequence::Four([ Utf8Range::new(start[0], end[0]), Utf8Range::new(start[1], end[1]), Utf8Range::new(start[2], end[2]), Utf8Range::new(start[3], end[3]), ]), n => unreachable!("invalid encoded length: {}", n), } } /// Returns the underlying sequence of byte ranges as a slice. pub fn as_slice(&self) -> &[Utf8Range] { use self::Utf8Sequence::*; match *self { One(ref r) => slice::from_ref(r), Two(ref r) => &r[..], Three(ref r) => &r[..], Four(ref r) => &r[..], } } /// Returns the number of byte ranges in this sequence. /// /// The length is guaranteed to be in the closed interval `[1, 4]`. pub fn len(&self) -> usize { self.as_slice().len() } /// Reverses the ranges in this sequence. /// /// For example, if this corresponds to the following sequence: /// /// ```text /// [D0-D3][80-BF] /// ``` /// /// Then after reversal, it will be /// /// ```text /// [80-BF][D0-D3] /// ``` /// /// This is useful when one is constructing a UTF-8 automaton to match /// character classes in reverse. 
pub fn reverse(&mut self) { match *self { Utf8Sequence::One(_) => {} Utf8Sequence::Two(ref mut x) => x.reverse(), Utf8Sequence::Three(ref mut x) => x.reverse(), Utf8Sequence::Four(ref mut x) => x.reverse(), } } /// Returns true if and only if a prefix of `bytes` matches this sequence /// of byte ranges. pub fn matches(&self, bytes: &[u8]) -> bool { if bytes.len() < self.len() { return false; } for (&b, r) in bytes.iter().zip(self) { if !r.matches(b) { return false; } } true } } impl<'a> IntoIterator for &'a Utf8Sequence { type IntoIter = slice::Iter<'a, Utf8Range>; type Item = &'a Utf8Range; fn into_iter(self) -> Self::IntoIter { self.as_slice().iter() } } impl fmt::Debug for Utf8Sequence { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use self::Utf8Sequence::*; match *self { One(ref r) => write!(f, "{:?}", r), Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]), Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]), Four(ref r) => { write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3]) } } } } /// A single inclusive range of UTF-8 bytes. #[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)] pub struct Utf8Range { /// Start of byte range (inclusive). pub start: u8, /// End of byte range (inclusive). pub end: u8, } impl Utf8Range { fn new(start: u8, end: u8) -> Self { Utf8Range { start, end } } /// Returns true if and only if the given byte is in this range. pub fn matches(&self, b: u8) -> bool { self.start <= b && b <= self.end } } impl fmt::Debug for Utf8Range { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { if self.start == self.end { write!(f, "[{:X}]", self.start) } else { write!(f, "[{:X}-{:X}]", self.start, self.end) } } } /// An iterator over ranges of matching UTF-8 byte sequences. /// /// The iteration represents an alternation of comprehensive byte sequences /// that match precisely the set of UTF-8 encoded scalar values. 
/// /// A byte sequence corresponds to one of the scalar values in the range given /// if and only if it completely matches exactly one of the sequences of byte /// ranges produced by this iterator. /// /// Each sequence of byte ranges matches a unique set of bytes. That is, no two /// sequences will match the same bytes. /// /// # Example /// /// This shows how to match an arbitrary byte sequence against a range of /// scalar values. /// /// ```rust /// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence}; /// /// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool { /// for range in seqs { /// if range.matches(bytes) { /// return true; /// } /// } /// false /// } /// /// // Test the basic multilingual plane. /// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect(); /// /// // UTF-8 encoding of 'a'. /// assert!(matches(&seqs, &[0x61])); /// // UTF-8 encoding of '☃' (`\u{2603}`). /// assert!(matches(&seqs, &[0xE2, 0x98, 0x83])); /// // UTF-8 encoding of `\u{10348}` (outside the BMP). /// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88])); /// // Tries to match against a UTF-8 encoding of a surrogate codepoint, /// // which is invalid UTF-8, and therefore fails, despite the fact that /// // the corresponding codepoint (0xD800) falls in the range given. /// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80])); /// // And fails against plain old invalid UTF-8. /// assert!(!matches(&seqs, &[0xFF, 0xFF])); /// ``` /// /// If this example seems circuitous, that's because it is! It's meant to be /// illustrative. In practice, you could just try to decode your byte sequence /// and compare it with the scalar value range directly. However, this is not /// always possible (for example, in a byte based automaton). #[derive(Debug)] pub struct Utf8Sequences { range_stack: Vec, } impl Utf8Sequences { /// Create a new iterator over UTF-8 byte ranges for the scalar value range /// given. 
pub fn new(start: char, end: char) -> Self { let mut it = Utf8Sequences { range_stack: vec![] }; it.push(u32::from(start), u32::from(end)); it } /// reset resets the scalar value range. /// Any existing state is cleared, but resources may be reused. /// /// N.B. Benchmarks say that this method is dubious. #[doc(hidden)] pub fn reset(&mut self, start: char, end: char) { self.range_stack.clear(); self.push(u32::from(start), u32::from(end)); } fn push(&mut self, start: u32, end: u32) { self.range_stack.push(ScalarRange { start, end }); } } struct ScalarRange { start: u32, end: u32, } impl fmt::Debug for ScalarRange { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "ScalarRange({:X}, {:X})", self.start, self.end) } } impl Iterator for Utf8Sequences { type Item = Utf8Sequence; fn next(&mut self) -> Option { 'TOP: while let Some(mut r) = self.range_stack.pop() { 'INNER: loop { if let Some((r1, r2)) = r.split() { self.push(r2.start, r2.end); r.start = r1.start; r.end = r1.end; continue 'INNER; } if !r.is_valid() { continue 'TOP; } for i in 1..MAX_UTF8_BYTES { let max = max_scalar_value(i); if r.start <= max && max < r.end { self.push(max + 1, r.end); r.end = max; continue 'INNER; } } if let Some(ascii_range) = r.as_ascii() { return Some(Utf8Sequence::One(ascii_range)); } for i in 1..MAX_UTF8_BYTES { let m = (1 << (6 * i)) - 1; if (r.start & !m) != (r.end & !m) { if (r.start & m) != 0 { self.push((r.start | m) + 1, r.end); r.end = r.start | m; continue 'INNER; } if (r.end & m) != m { self.push(r.end & !m, r.end); r.end = (r.end & !m) - 1; continue 'INNER; } } } let mut start = [0; MAX_UTF8_BYTES]; let mut end = [0; MAX_UTF8_BYTES]; let n = r.encode(&mut start, &mut end); return Some(Utf8Sequence::from_encoded_range( &start[0..n], &end[0..n], )); } } None } } impl FusedIterator for Utf8Sequences {} impl ScalarRange { /// split splits this range if it overlaps with a surrogate codepoint. /// /// Either or both ranges may be invalid. 
fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
        // The surrogate range [0xD800, 0xDFFF] contains no scalar values, so
        // any range straddling it is cut into the pieces on either side.
        if self.start < 0xE000 && self.end > 0xD7FF {
            Some((
                ScalarRange { start: self.start, end: 0xD7FF },
                ScalarRange { start: 0xE000, end: self.end },
            ))
        } else {
            None
        }
    }

    /// is_valid returns true if and only if start <= end.
    fn is_valid(&self) -> bool {
        self.start <= self.end
    }

    /// as_ascii returns this range as a Utf8Range if and only if all scalar
    /// values in this range can be encoded as a single byte.
    ///
    /// NOTE: the return type parameter was stripped in extraction
    /// (`-> Option {`); restored to `Option<Utf8Range>`, which is what the
    /// body constructs.
    fn as_ascii(&self) -> Option<Utf8Range> {
        if self.is_ascii() {
            // Infallible: is_ascii guarantees both endpoints are <= 0x7F.
            let start = u8::try_from(self.start).unwrap();
            let end = u8::try_from(self.end).unwrap();
            Some(Utf8Range::new(start, end))
        } else {
            None
        }
    }

    /// is_ascii returns true if the range is ASCII only (i.e., takes a single
    /// byte to encode any scalar value).
    fn is_ascii(&self) -> bool {
        self.is_valid() && self.end <= 0x7f
    }

    /// encode writes the UTF-8 encoding of the start and end of this range
    /// to the corresponding destination slices, and returns the number of
    /// bytes written.
    ///
    /// The slices should have room for at least `MAX_UTF8_BYTES`.
fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
        // Both endpoints are valid scalar values by the time encode is
        // called (surrogates were split off earlier), so unwrap is safe.
        let cs = char::from_u32(self.start).unwrap();
        let ce = char::from_u32(self.end).unwrap();
        let ss = cs.encode_utf8(start);
        let se = ce.encode_utf8(end);
        // Callers rely on both endpoints having the same encoded length.
        assert_eq!(ss.len(), se.len());
        ss.len()
    }
}

/// Returns the largest scalar value encodable in `nbytes` bytes of UTF-8.
fn max_scalar_value(nbytes: usize) -> u32 {
    match nbytes {
        1 => 0x007F,
        2 => 0x07FF,
        3 => 0xFFFF,
        4 => 0x0010_FFFF,
        _ => unreachable!("invalid UTF-8 byte sequence size"),
    }
}

#[cfg(test)]
mod tests {
    use core::char;

    use alloc::{vec, vec::Vec};

    use crate::utf8::{Utf8Range, Utf8Sequences};

    // Shorthand constructor for an inclusive byte range.
    fn rutf8(s: u8, e: u8) -> Utf8Range {
        Utf8Range::new(s, e)
    }

    // Asserts that no sequence generated for [start, end] matches the UTF-8
    // style encoding of any surrogate code point.
    fn never_accepts_surrogate_codepoints(start: char, end: char) {
        for cp in 0xD800..0xE000 {
            let buf = encode_surrogate(cp);
            for r in Utf8Sequences::new(start, end) {
                if r.matches(&buf) {
                    panic!(
                        "Sequence ({:X}, {:X}) contains range {:?}, \
                         which matches surrogate code point {:X} \
                         with encoded bytes {:?}",
                        u32::from(start),
                        u32::from(end),
                        r,
                        cp,
                        buf,
                    );
                }
            }
        }
    }

    #[test]
    fn codepoints_no_surrogates() {
        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
    }

    #[test]
    fn single_codepoint_one_sequence() {
        // Tests that every range of scalar values that contains a single
        // scalar value is recognized by one sequence of byte ranges.
for i in 0x0..=0x0010_FFFF {
            let c = match char::from_u32(i) {
                None => continue,
                Some(c) => c,
            };
            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
            assert_eq!(seqs.len(), 1);
        }
    }

    #[test]
    fn bmp() {
        use crate::utf8::Utf8Sequence::*;

        // NOTE: the turbofish type parameter was stripped in extraction
        // (`collect::>()`); restored to `collect::<Vec<_>>()`.
        let seqs =
            Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
        assert_eq!(
            seqs,
            vec![
                One(rutf8(0x0, 0x7F)),
                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
                Three([
                    rutf8(0xE0, 0xE0),
                    rutf8(0xA0, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xE1, 0xEC),
                    rutf8(0x80, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xED, 0xED),
                    rutf8(0x80, 0x9F),
                    rutf8(0x80, 0xBF)
                ]),
                Three([
                    rutf8(0xEE, 0xEF),
                    rutf8(0x80, 0xBF),
                    rutf8(0x80, 0xBF)
                ]),
            ]
        );
    }

    #[test]
    fn reverse() {
        use crate::utf8::Utf8Sequence::*;

        let mut s = One(rutf8(0xA, 0xB));
        s.reverse();
        assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);

        let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
        s.reverse();
        assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);

        let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
        s.reverse();
        assert_eq!(
            s.as_slice(),
            &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
        );

        let mut s = Four([
            rutf8(0xA, 0xB),
            rutf8(0xB, 0xC),
            rutf8(0xC, 0xD),
            rutf8(0xD, 0xE),
        ]);
        s.reverse();
        assert_eq!(
            s.as_slice(),
            &[
                rutf8(0xD, 0xE),
                rutf8(0xC, 0xD),
                rutf8(0xB, 0xC),
                rutf8(0xA, 0xB)
            ]
        );
    }

    // Builds the three-byte, UTF-8-shaped encoding of a surrogate code
    // point. This is deliberately invalid UTF-8; it exists only to probe
    // the generated sequences.
    fn encode_surrogate(cp: u32) -> [u8; 3] {
        const TAG_CONT: u8 = 0b1000_0000;
        const TAG_THREE_B: u8 = 0b1110_0000;
        assert!(0xD800 <= cp && cp < 0xE000);
        let mut dst = [0; 3];
        dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B;
        dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT;
        dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT;
        dst
    }
}
regex-syntax-0.8.2/test000075500000000000000000000014171046102023000132010ustar 00000000000000#!/bin/bash
set -e

# cd to the directory containing this crate's Cargo.toml so that we don't need
# to pass --manifest-path to every `cargo` command.
cd "$(dirname "$0")"

# Convenience script that exercises a broad swath of the syntax tests:
# once with the default feature set, then once per individual feature.
echo "===== DEFAULT FEATURES ==="
cargo test

features=(
    std
    unicode
    unicode-age
    unicode-bool
    unicode-case
    unicode-gencat
    unicode-perl
    unicode-script
    unicode-segment
)
for f in "${features[@]}"; do
    echo "=== FEATURE: $f ==="
    # Library tests only: doc tests are hard to run in 'no_std' mode, since
    # without the Error trait, using '?' in doc tests seems tricky.
    cargo test --no-default-features --lib --features "$f"
done