regex-lite-0.1.6/.cargo_vcs_info.json0000644000000001500000000000100131010ustar {
  "git": {
    "sha1": "1288b83af3d8b441efb264ed6651b0dfb9c2df78"
  },
  "path_in_vcs": "regex-lite"
}
regex-lite-0.1.6/Cargo.toml0000644000000024510000000000100111050ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies.
#
# If you are reading this file be aware that the original Cargo.toml
# will likely look very different (and much more reasonable).
# See Cargo.toml.orig for the original contents.

[package]
edition = "2021"
rust-version = "1.65"
name = "regex-lite"
version = "0.1.6"
authors = [
    "The Rust Project Developers",
    "Andrew Gallant <jamslam@gmail.com>",
]
build = false
autobins = false
autoexamples = false
autotests = false
autobenches = false
description = """
A lightweight regex engine that optimizes for binary size and compilation
time.
"""
documentation = "https://docs.rs/regex-lite"
readme = "README.md"
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-lang/regex/tree/master/regex-lite"

[package.metadata.docs.rs]
all-features = true

[lib]
name = "regex_lite"
path = "src/lib.rs"

[[test]]
name = "integration"
path = "tests/lib.rs"

[dev-dependencies.anyhow]
version = "1.0.69"

[dev-dependencies.regex-test]
version = "0.1.0"

[features]
default = [
    "std",
    "string",
]
std = []
string = []
regex-lite-0.1.6/Cargo.toml.orig000064400000000000000000000017351046102023000145700ustar 00000000000000[package]
name = "regex-lite"
version = "0.1.6"  #:version
authors = ["The Rust Project Developers", "Andrew Gallant <jamslam@gmail.com>"]
license = "MIT OR Apache-2.0"
repository = "https://github.com/rust-lang/regex/tree/master/regex-lite"
documentation = "https://docs.rs/regex-lite"
description = """
A lightweight regex engine that optimizes for binary size and compilation
time.
"""
workspace = ".."
edition = "2021"
rust-version = "1.65"
autotests = false

# Features are documented in the "Crate features" section of the crate docs:
# https://docs.rs/regex-lite/*/#crate-features
#
# (Currently there are no supported features. 'std' is technically one, but it
# is currently required.)
[features]
default = ["std", "string"]
std = []
string = []

[dev-dependencies]
anyhow = "1.0.69"
regex-test = { path = "../regex-test", version = "0.1.0" }

[[test]]
path = "tests/lib.rs"
name = "integration"

[package.metadata.docs.rs]
# We want to document all features.
all-features = true
regex-lite-0.1.6/LICENSE-APACHE000064400000000000000000000251371046102023000136301ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. regex-lite-0.1.6/LICENSE-MIT000064400000000000000000000020571046102023000133350ustar 00000000000000Copyright (c) 2014 The Rust Project Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. regex-lite-0.1.6/README.md000064400000000000000000000103061046102023000131540ustar 00000000000000regex-lite ========== This crate provides a **lightweight** regex engine for searching strings. The regex syntax supported by this crate is nearly identical to what is found in the `regex` crate. Like the `regex` crate, all regex searches in this crate have worst case `O(m * n)` time complexity, where `m` is proportional to the size of the regex and `n` is proportional to the size of the string being searched. [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![Crates.io](https://img.shields.io/crates/v/regex-lite.svg)](https://crates.io/crates/regex-lite) ### Documentation https://docs.rs/regex-lite ### Usage To bring this crate into your repository, either add `regex-lite` to your `Cargo.toml`, or run `cargo add regex-lite`. Here's a simple example that matches a date in YYYY-MM-DD format and prints the year, month and day: ```rust use regex_lite::Regex; fn main() { let re = Regex::new(r"(?x) (?P\d{4}) # the year - (?P\d{2}) # the month - (?P\d{2}) # the day ").unwrap(); let caps = re.captures("2010-03-14").unwrap(); assert_eq!("2010", &caps["year"]); assert_eq!("03", &caps["month"]); assert_eq!("14", &caps["day"]); } ``` If you have lots of dates in text that you'd like to iterate over, then it's easy to adapt the above example with an iterator: ```rust use regex::Regex; const TO_SEARCH: &'static str = " On 2010-03-14, foo happened. On 2014-10-14, bar happened. "; fn main() { let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); for caps in re.captures_iter(TO_SEARCH) { // Note that all of the unwraps are actually OK for this regex // because the only way for the regex to match is if all of the // capture groups match. This is not true in general though! println!("year: {}, month: {}, day: {}", caps.get(1).unwrap().as_str(), caps.get(2).unwrap().as_str(), caps.get(3).unwrap().as_str()); } } ``` This example outputs: ```text year: 2010, month: 03, day: 14 year: 2014, month: 10, day: 14 ``` ### Minimum Rust version policy This crate's minimum supported `rustc` version is `1.65.0`. The policy is that the minimum Rust version required to use this crate can be increased in semver compatible updates. ### Motivation The primary purpose of this crate is to provide an alternative regex engine for folks that are unhappy with the binary size and compilation time of the primary `regex` crate. The `regex-lite` crate does the absolute minimum possible to act as a drop-in replacement to the `regex` crate's `Regex` type. It avoids a lot of complexity by choosing not to optimize searches and to opt out of functionality such as robust Unicode support. By keeping the code simpler and smaller, we get binary sizes and compile times that are substantially better than even the `regex` crate with all of its features disabled. To make the benefits a bit more concrete, here are the results of one experiment I did. 
For `regex`, I disabled all features except for `std`:

* `regex 1.7.3`: 1.41s compile time, 373KB relative size increase
* `regex 1.8.1`: 1.46s compile time, 410KB relative size increase
* `regex 1.9.0`: 1.93s compile time, 565KB relative size increase
* `regex-lite 0.1.0`: 0.73s compile time, 94KB relative size increase

The main reason why `regex-lite` does so much better than `regex` when all of
`regex`'s features are disabled is because of irreducible complexity. There
are certain parts of the code in `regex` that can't be arbitrarily divided
based on binary size and compile time goals. It's instead more sustainable to
just maintain an entirely separate crate.

Ideas for improving the binary size and compile times of this crate even more
are most welcome.


### License

This project is licensed under either of

 * Apache License, Version 2.0, ([LICENSE-APACHE](LICENSE-APACHE) or
   https://www.apache.org/licenses/LICENSE-2.0)
 * MIT license ([LICENSE-MIT](LICENSE-MIT) or
   https://opensource.org/licenses/MIT)

at your option.

The data in `regex-syntax/src/unicode_tables/` is licensed under the Unicode
License Agreement
([LICENSE-UNICODE](https://www.unicode.org/copyright.html#License)).
regex-lite-0.1.6/src/error.rs000064400000000000000000000016101046102023000141610ustar 00000000000000/// An error that occurred during parsing or compiling a regular expression.
///
/// A parse error occurs when the syntax of the regex pattern is not
/// valid. Otherwise, a regex can still fail to build if it would
/// result in a machine that exceeds the configured size limit, via
/// [`RegexBuilder::size_limit`](crate::RegexBuilder::size_limit).
///
/// This error type provides no introspection capabilities. The only thing you
/// can do with it is convert it to a string as a human readable error message.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Error {
    msg: &'static str,
}

impl Error {
    pub(crate) fn new(msg: &'static str) -> Error {
        Error { msg }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for Error {}

impl core::fmt::Display for Error {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "{}", self.msg)
    }
}
regex-lite-0.1.6/src/hir/mod.rs000064400000000000000000000747021046102023000144030ustar 00000000000000use alloc::{boxed::Box, string::String, vec, vec::Vec};

use crate::{error::Error, utf8};

mod parse;

/// Escapes all regular expression meta characters in `pattern`.
///
/// The string returned may be safely used as a literal in a regular
/// expression.
pub fn escape(pattern: &str) -> String {
    let mut buf = String::new();
    buf.reserve(pattern.len());
    for ch in pattern.chars() {
        if is_meta_character(ch) {
            buf.push('\\');
        }
        buf.push(ch);
    }
    buf
}

/// Returns true if the given character has significance in a regex.
///
/// Generally speaking, these are the only characters which _must_ be escaped
/// in order to match their literal meaning. For example, to match a literal
/// `|`, one could write `\|`. Sometimes escaping isn't always necessary. For
/// example, `-` is treated as a meta character because of its significance
/// for writing ranges inside of character classes, but the regex `-` will
/// match a literal `-` because `-` has no special meaning outside of
/// character classes.
///
/// In order to determine whether a character may be escaped at all, the
The difference between /// `is_meta_character` and `is_escapeable_character` is that the latter will /// return true for some characters that are _not_ meta characters. For /// example, `%` and `\%` both match a literal `%` in all contexts. In other /// words, `is_escapeable_character` includes "superfluous" escapes. /// /// Note that the set of characters for which this function returns `true` or /// `false` is fixed and won't change in a semver compatible release. (In this /// case, "semver compatible release" actually refers to the `regex` crate /// itself, since reducing or expanding the set of meta characters would be a /// breaking change for not just `regex-syntax` but also `regex` itself.) fn is_meta_character(c: char) -> bool { match c { '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true, _ => false, } } /// Returns true if the given character can be escaped in a regex. /// /// This returns true in all cases that `is_meta_character` returns true, but /// also returns true in some cases where `is_meta_character` returns false. /// For example, `%` is not a meta character, but it is escapeable. That is, /// `%` and `\%` both match a literal `%` in all contexts. /// /// The purpose of this routine is to provide knowledge about what characters /// may be escaped. Namely, most regex engines permit "superfluous" escapes /// where characters without any special significance may be escaped even /// though there is no actual _need_ to do so. /// /// This will return false for some characters. For example, `e` is not /// escapeable. Therefore, `\e` will either result in a parse error (which is /// true today), or it could backwards compatibly evolve into a new construct /// with its own meaning. Indeed, that is the purpose of banning _some_ /// superfluous escapes: it provides a way to evolve the syntax in a compatible /// manner. fn is_escapeable_character(c: char) -> bool { // Certainly escapeable if it's a meta character. if is_meta_character(c) { return true; } // Any character that isn't ASCII is definitely not escapeable. There's // no real need to allow things like \☃ right? if !c.is_ascii() { return false; } // Otherwise, we basically say that everything is escapeable unless it's a // letter or digit. Things like \3 are either octal (when enabled) or an // error, and we should keep it that way. Otherwise, letters are reserved // for adding new syntax in a backwards compatible way. match c { '0'..='9' | 'A'..='Z' | 'a'..='z' => false, // While not currently supported, we keep these as not escapeable to // give us some flexibility with respect to supporting the \< and // \> word boundary assertions in the future. By rejecting them as // escapeable, \< and \> will result in a parse error. Thus, we can // turn them into something else in the future without it being a // backwards incompatible change. '<' | '>' => false, _ => true, } } /// The configuration for a regex parser. #[derive(Clone, Copy, Debug)] pub(crate) struct Config { /// The maximum number of times we're allowed to recurse. /// /// Note that unlike the regex-syntax parser, we actually use recursion in /// this parser for simplicity. My hope is that by setting a conservative /// default call limit and providing a way to configure it, that we can /// keep this simplification. But if we must, we can re-work the parser to /// put the call stack on the heap like regex-syntax does. 
    pub(crate) nest_limit: u32,
    /// Various flags that control how a pattern is interpreted.
    pub(crate) flags: Flags,
}

impl Default for Config {
    fn default() -> Config {
        Config { nest_limit: 50, flags: Flags::default() }
    }
}

/// Various flags that control the interpretation of the pattern.
///
/// These can be set via explicit configuration in code, or change dynamically
/// during parsing via inline flags. For example, `foo(?i:bar)baz` will match
/// `foo` and `baz` case sensitively and `bar` case insensitively (assuming a
/// default configuration).
#[derive(Clone, Copy, Debug, Default)]
pub(crate) struct Flags {
    /// Whether to match case insensitively.
    ///
    /// This is the `i` flag.
    pub(crate) case_insensitive: bool,
    /// Whether `^` and `$` should be treated as line anchors or not.
    ///
    /// This is the `m` flag.
    pub(crate) multi_line: bool,
    /// Whether `.` should match line terminators or not.
    ///
    /// This is the `s` flag.
    pub(crate) dot_matches_new_line: bool,
    /// Whether to swap the meaning of greedy and non-greedy operators.
    ///
    /// This is the `U` flag.
    pub(crate) swap_greed: bool,
    /// Whether to enable CRLF mode.
    ///
    /// This is the `R` flag.
    pub(crate) crlf: bool,
    /// Whether to ignore whitespace. i.e., verbose mode.
    ///
    /// This is the `x` flag.
    pub(crate) ignore_whitespace: bool,
}
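// What follows is an editor-added sketch (not part of the upstream crate)
// showing how `Config` and `Flags` feed into parsing. It assumes that `^`
// compiles to `Look::Start` by default and to `Look::StartLF` when
// `multi_line` is set, which is why only the former counts as
// start-anchored.
#[cfg(test)]
mod example_flags_tests {
    use super::*;

    #[test]
    fn multi_line_changes_the_anchor() {
        // Default flags: `^` is a start-of-haystack anchor.
        let hir = Hir::parse(Config::default(), "^a").unwrap();
        assert!(hir.is_start_anchored());

        // With the `m` flag set up-front, `^` becomes a line anchor, which
        // can match in the middle of a haystack.
        let mut config = Config::default();
        config.flags.multi_line = true;
        let hir = Hir::parse(config, "^a").unwrap();
        assert!(!hir.is_start_anchored());
    }
}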
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Hir {
    kind: HirKind,
    is_start_anchored: bool,
    is_match_empty: bool,
    static_explicit_captures_len: Option<usize>,
}

#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) enum HirKind {
    Empty,
    Char(char),
    Class(Class),
    Look(Look),
    Repetition(Repetition),
    Capture(Capture),
    Concat(Vec<Hir>),
    Alternation(Vec<Hir>),
}

impl Hir {
    /// Parses the given pattern string with the given configuration into a
    /// structured representation. If the pattern is invalid, then an error
    /// is returned.
    pub(crate) fn parse(config: Config, pattern: &str) -> Result<Hir, Error> {
        self::parse::Parser::new(config, pattern).parse()
    }

    /// Returns the underlying kind of this high-level intermediate
    /// representation.
    ///
    /// Note that there is explicitly no way to build an `Hir` directly from
    /// an `HirKind`. If you need to do that, then you must do case analysis
    /// on the `HirKind` and call the appropriate smart constructor on `Hir`.
    pub(crate) fn kind(&self) -> &HirKind {
        &self.kind
    }

    /// Returns true if and only if this Hir expression can only match at the
    /// beginning of a haystack.
    pub(crate) fn is_start_anchored(&self) -> bool {
        self.is_start_anchored
    }

    /// Returns true if and only if this Hir expression can match the empty
    /// string.
    pub(crate) fn is_match_empty(&self) -> bool {
        self.is_match_empty
    }

    /// If the pattern always reports the same number of matching capture
    /// groups for every match, then this returns the number of those groups.
    /// This doesn't include the implicit group found in every pattern.
    pub(crate) fn static_explicit_captures_len(&self) -> Option<usize> {
        self.static_explicit_captures_len
    }

    fn fail() -> Hir {
        let kind = HirKind::Class(Class { ranges: vec![] });
        Hir {
            kind,
            is_start_anchored: false,
            is_match_empty: false,
            static_explicit_captures_len: Some(0),
        }
    }

    fn empty() -> Hir {
        let kind = HirKind::Empty;
        Hir {
            kind,
            is_start_anchored: false,
            is_match_empty: true,
            static_explicit_captures_len: Some(0),
        }
    }

    fn char(ch: char) -> Hir {
        let kind = HirKind::Char(ch);
        Hir {
            kind,
            is_start_anchored: false,
            is_match_empty: false,
            static_explicit_captures_len: Some(0),
        }
    }

    fn class(class: Class) -> Hir {
        let kind = HirKind::Class(class);
        Hir {
            kind,
            is_start_anchored: false,
            is_match_empty: false,
            static_explicit_captures_len: Some(0),
        }
    }

    fn look(look: Look) -> Hir {
        let kind = HirKind::Look(look);
        Hir {
            kind,
            is_start_anchored: matches!(look, Look::Start),
            is_match_empty: true,
            static_explicit_captures_len: Some(0),
        }
    }

    fn repetition(rep: Repetition) -> Hir {
        if rep.min == 0 && rep.max == Some(0) {
            return Hir::empty();
        } else if rep.min == 1 && rep.max == Some(1) {
            return *rep.sub;
        }
        let is_start_anchored = rep.min > 0 && rep.sub.is_start_anchored;
        let is_match_empty = rep.min == 0 || rep.sub.is_match_empty;
        let mut static_explicit_captures_len =
            rep.sub.static_explicit_captures_len;
        // If the static captures len of the sub-expression is not known or
        // is greater than zero, then it automatically propagates to the
        // repetition, regardless of the repetition. Otherwise, it might
        // change, but only when the repetition can match 0 times.
        if rep.min == 0
            && static_explicit_captures_len.map_or(false, |len| len > 0)
        {
            // If we require a match 0 times, then our captures len is
            // guaranteed to be zero. Otherwise, if we *can* match the empty
            // string, then it's impossible to know how many captures will be
            // in the resulting match.
            if rep.max == Some(0) {
                static_explicit_captures_len = Some(0);
            } else {
                static_explicit_captures_len = None;
            }
        }
        Hir {
            kind: HirKind::Repetition(rep),
            is_start_anchored,
            is_match_empty,
            static_explicit_captures_len,
        }
    }

    fn capture(cap: Capture) -> Hir {
        let is_start_anchored = cap.sub.is_start_anchored;
        let is_match_empty = cap.sub.is_match_empty;
        let static_explicit_captures_len = cap
            .sub
            .static_explicit_captures_len
            .map(|len| len.saturating_add(1));
        let kind = HirKind::Capture(cap);
        Hir {
            kind,
            is_start_anchored,
            is_match_empty,
            static_explicit_captures_len,
        }
    }

    fn concat(mut subs: Vec<Hir>) -> Hir {
        if subs.is_empty() {
            Hir::empty()
        } else if subs.len() == 1 {
            subs.pop().unwrap()
        } else {
            let is_start_anchored = subs[0].is_start_anchored;
            let mut is_match_empty = true;
            let mut static_explicit_captures_len = Some(0usize);
            for sub in subs.iter() {
                is_match_empty = is_match_empty && sub.is_match_empty;
                static_explicit_captures_len = static_explicit_captures_len
                    .and_then(|len1| {
                        Some((len1, sub.static_explicit_captures_len?))
                    })
                    .and_then(|(len1, len2)| Some(len1.saturating_add(len2)));
            }
            Hir {
                kind: HirKind::Concat(subs),
                is_start_anchored,
                is_match_empty,
                static_explicit_captures_len,
            }
        }
    }

    fn alternation(mut subs: Vec<Hir>) -> Hir {
        if subs.is_empty() {
            Hir::fail()
        } else if subs.len() == 1 {
            subs.pop().unwrap()
        } else {
            let mut it = subs.iter().peekable();
            let mut is_start_anchored =
                it.peek().map_or(false, |sub| sub.is_start_anchored);
            let mut is_match_empty =
                it.peek().map_or(false, |sub| sub.is_match_empty);
            let mut static_explicit_captures_len =
                it.peek().and_then(|sub| sub.static_explicit_captures_len);
            for sub in it {
                is_start_anchored = is_start_anchored && sub.is_start_anchored;
                is_match_empty = is_match_empty || sub.is_match_empty;
                if static_explicit_captures_len
                    != sub.static_explicit_captures_len
                {
                    static_explicit_captures_len = None;
                }
            }
            Hir {
                kind: HirKind::Alternation(subs),
                is_start_anchored,
                is_match_empty,
                static_explicit_captures_len,
            }
        }
    }
}
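// Editor-added illustration (not part of the upstream crate) of the
// properties pre-computed by the smart constructors above, assuming, as in
// the sketch earlier, that `^` compiles to `Look::Start` by default.
#[cfg(test)]
mod example_hir_property_tests {
    use super::*;

    #[test]
    fn derived_properties() {
        let hir = Hir::parse(Config::default(), "^a(b)?").unwrap();
        assert!(hir.is_start_anchored());
        assert!(!hir.is_match_empty());
        // `(b)?` matches with either zero or one capture group, so the
        // static count is unknown.
        assert_eq!(None, hir.static_explicit_captures_len());

        // Every match of `(a)(b)` involves exactly two explicit groups.
        let hir = Hir::parse(Config::default(), "(a)(b)").unwrap();
        assert_eq!(Some(2), hir.static_explicit_captures_len());
    }
}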
impl HirKind {
    /// Returns a slice of this kind's sub-expressions, if any.
    fn subs(&self) -> &[Hir] {
        use core::slice::from_ref;
        match *self {
            HirKind::Empty
            | HirKind::Char(_)
            | HirKind::Class(_)
            | HirKind::Look(_) => &[],
            HirKind::Repetition(Repetition { ref sub, .. }) => from_ref(sub),
            HirKind::Capture(Capture { ref sub, .. }) => from_ref(sub),
            HirKind::Concat(ref subs) => subs,
            HirKind::Alternation(ref subs) => subs,
        }
    }
}

#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Class {
    pub(crate) ranges: Vec<ClassRange>,
}

impl Class {
    /// Create a new class from the given ranges. The ranges may be provided
    /// in any order or may even overlap. They will be automatically
    /// canonicalized.
    fn new<I: IntoIterator<Item = ClassRange>>(ranges: I) -> Class {
        let mut class = Class { ranges: ranges.into_iter().collect() };
        class.canonicalize();
        class
    }

    /// Expand this class such that it matches the ASCII codepoints in this
    /// set case insensitively.
    fn ascii_case_fold(&mut self) {
        let len = self.ranges.len();
        for i in 0..len {
            if let Some(folded) = self.ranges[i].ascii_case_fold() {
                self.ranges.push(folded);
            }
        }
        self.canonicalize();
    }

    /// Negate this set.
    ///
    /// For all `x` where `x` is any element, if `x` was in this set, then it
    /// will not be in this set after negation.
    fn negate(&mut self) {
        const MIN: char = '\x00';
        const MAX: char = char::MAX;

        if self.ranges.is_empty() {
            self.ranges.push(ClassRange { start: MIN, end: MAX });
            return;
        }

        // There should be a way to do this in-place with constant memory,
        // but I couldn't figure out a simple way to do it. So just append
        // the negation to the end of this range, and then drain it before
        // we're done.
        let drain_end = self.ranges.len();

        // If our class doesn't start at the minimum possible char, then
        // negation needs to include all codepoints up to the minimum in
        // this set.
        if self.ranges[0].start > MIN {
            self.ranges.push(ClassRange {
                start: MIN,
                // OK because we know it's bigger than MIN.
                end: prev_char(self.ranges[0].start).unwrap(),
            });
        }
        for i in 1..drain_end {
            // let lower = self.ranges[i - 1].upper().increment();
            // let upper = self.ranges[i].lower().decrement();
            // self.ranges.push(I::create(lower, upper));
            self.ranges.push(ClassRange {
                // OK because we know i-1 is never the last range and
                // therefore there must be a range greater than it. It
                // therefore follows that 'end' can never be char::MAX, and
                // thus there must be a next char.
                start: next_char(self.ranges[i - 1].end).unwrap(),
                // Since 'i' is guaranteed to never be the first range, it
                // follows that there is always a range before this and thus
                // 'start' can never be '\x00'. Thus, there must be a
                // previous char.
                end: prev_char(self.ranges[i].start).unwrap(),
            });
        }
        if self.ranges[drain_end - 1].end < MAX {
            // let lower = self.ranges[drain_end - 1].upper().increment();
            // self.ranges.push(I::create(lower, I::Bound::max_value()));
            self.ranges.push(ClassRange {
                // OK because we know 'end' is less than char::MAX, and thus
                // there is a next char.
                start: next_char(self.ranges[drain_end - 1].end).unwrap(),
                end: MAX,
            });
        }
        self.ranges.drain(..drain_end);
        // We don't need to canonicalize because we processed the ranges
        // above in canonical order and the new ranges we added based on
        // those are also necessarily in canonical order.
    }
    /// Converts this set into a canonical ordering.
    fn canonicalize(&mut self) {
        if self.is_canonical() {
            return;
        }
        self.ranges.sort();
        assert!(!self.ranges.is_empty());

        // Is there a way to do this in-place with constant memory? I
        // couldn't figure out a way to do it. So just append the
        // canonicalization to the end of this range, and then drain it
        // before we're done.
        let drain_end = self.ranges.len();
        for oldi in 0..drain_end {
            // If we've added at least one new range, then check if we can
            // merge this range in the previously added range.
            if self.ranges.len() > drain_end {
                let (last, rest) = self.ranges.split_last_mut().unwrap();
                if let Some(union) = last.union(&rest[oldi]) {
                    *last = union;
                    continue;
                }
            }
            self.ranges.push(self.ranges[oldi]);
        }
        self.ranges.drain(..drain_end);
    }

    /// Returns true if and only if this class is in a canonical ordering.
    fn is_canonical(&self) -> bool {
        for pair in self.ranges.windows(2) {
            if pair[0] >= pair[1] {
                return false;
            }
            if pair[0].is_contiguous(&pair[1]) {
                return false;
            }
        }
        true
    }
}
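// Editor-added illustration (not part of the upstream crate) of the
// canonicalization and negation routines above.
#[cfg(test)]
mod example_class_tests {
    use super::*;

    #[test]
    fn canonicalize_and_negate() {
        // Overlapping ranges, given out of order, are merged into one.
        let mut class = Class::new(vec![
            ClassRange { start: 'c', end: 'f' },
            ClassRange { start: 'a', end: 'd' },
        ]);
        assert_eq!(vec![ClassRange { start: 'a', end: 'f' }], class.ranges);

        // Negation flips the set over all of [\x00, char::MAX].
        class.negate();
        assert_eq!(
            vec![
                ClassRange { start: '\x00', end: '`' },
                ClassRange { start: 'g', end: char::MAX },
            ],
            class.ranges,
        );
    }
}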
#[derive(Clone, Copy, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) struct ClassRange {
    pub(crate) start: char,
    pub(crate) end: char,
}

impl ClassRange {
    /// Apply simple case folding to this character range. Only ASCII case
    /// mappings (for A-Za-z) are applied.
    ///
    /// If the overlap with `a-z` or `A-Z` is non-empty, then the additional
    /// range produced by case folding it is returned. The returned range is
    /// *not* necessarily in canonical order with respect to this one.
    fn ascii_case_fold(&self) -> Option<ClassRange> {
        if !(ClassRange { start: 'a', end: 'z' }).is_intersection_empty(self)
        {
            let start = core::cmp::max(self.start, 'a');
            let end = core::cmp::min(self.end, 'z');
            return Some(ClassRange {
                start: char::try_from(u32::from(start) - 32).unwrap(),
                end: char::try_from(u32::from(end) - 32).unwrap(),
            });
        }
        if !(ClassRange { start: 'A', end: 'Z' }).is_intersection_empty(self)
        {
            let start = core::cmp::max(self.start, 'A');
            let end = core::cmp::min(self.end, 'Z');
            return Some(ClassRange {
                start: char::try_from(u32::from(start) + 32).unwrap(),
                end: char::try_from(u32::from(end) + 32).unwrap(),
            });
        }
        None
    }

    /// Union the given overlapping range into this range.
    ///
    /// If the two ranges aren't contiguous, then this returns `None`.
    fn union(&self, other: &ClassRange) -> Option<ClassRange> {
        if !self.is_contiguous(other) {
            return None;
        }
        let start = core::cmp::min(self.start, other.start);
        let end = core::cmp::max(self.end, other.end);
        Some(ClassRange { start, end })
    }

    /// Returns true if and only if the two ranges are contiguous. Two ranges
    /// are contiguous if and only if the ranges are either overlapping or
    /// adjacent.
    fn is_contiguous(&self, other: &ClassRange) -> bool {
        let (s1, e1) = (u32::from(self.start), u32::from(self.end));
        let (s2, e2) = (u32::from(other.start), u32::from(other.end));
        core::cmp::max(s1, s2) <= core::cmp::min(e1, e2).saturating_add(1)
    }

    /// Returns true if and only if the intersection of this range and the
    /// other range is empty.
    fn is_intersection_empty(&self, other: &ClassRange) -> bool {
        let (s1, e1) = (self.start, self.end);
        let (s2, e2) = (other.start, other.end);
        core::cmp::max(s1, s2) > core::cmp::min(e1, e2)
    }
}
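// Editor-added illustration (not part of the upstream crate) of the range
// arithmetic above: case folding maps the overlap with [a-z] or [A-Z] by an
// offset of 32, and adjacent ranges count as contiguous.
#[cfg(test)]
mod example_class_range_tests {
    use super::*;

    #[test]
    fn fold_union_contiguity() {
        let range = ClassRange { start: 'x', end: 'z' };
        assert_eq!(
            Some(ClassRange { start: 'X', end: 'Z' }),
            range.ascii_case_fold(),
        );

        // 'c' and 'd' are adjacent, so the two ranges union into one.
        let r1 = ClassRange { start: 'a', end: 'c' };
        let r2 = ClassRange { start: 'd', end: 'f' };
        assert!(r1.is_contiguous(&r2));
        assert_eq!(Some(ClassRange { start: 'a', end: 'f' }), r1.union(&r2));
    }
}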
/// The high-level intermediate representation for a look-around assertion.
///
/// An assertion match is always zero-length. Also called an "empty match."
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub(crate) enum Look {
    /// Match the beginning of text. Specifically, this matches at the
    /// starting position of the input.
    Start = 1 << 0,
    /// Match the end of text. Specifically, this matches at the ending
    /// position of the input.
    End = 1 << 1,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the
    /// position immediately following a `\n` character.
    StartLF = 1 << 2,
    /// Match the end of a line or the end of text. Specifically, this
    /// matches at the end position of the input, or at the position
    /// immediately preceding a `\n` character.
    EndLF = 1 << 3,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the
    /// position immediately following either a `\r` or `\n` character, but
    /// never after a `\r` when a `\n` follows.
    StartCRLF = 1 << 4,
    /// Match the end of a line or the end of text. Specifically, this
    /// matches at the end position of the input, or at the position
    /// immediately preceding a `\r` or `\n` character, but never before a
    /// `\n` when a `\r` precedes it.
    EndCRLF = 1 << 5,
    /// Match an ASCII-only word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    Word = 1 << 6,
    /// Match an ASCII-only negation of a word boundary.
    WordNegate = 1 << 7,
    /// Match the start of an ASCII-only word boundary. That is, this matches
    /// a position at either the beginning of the haystack or where the
    /// previous character is not a word character and the following
    /// character is a word character.
    WordStart = 1 << 8,
    /// Match the end of an ASCII-only word boundary. That is, this matches
    /// a position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a
    /// word character.
    WordEnd = 1 << 9,
    /// Match the start half of an ASCII-only word boundary. That is, this
    /// matches a position at either the beginning of the haystack or where
    /// the previous character is not a word character.
    WordStartHalf = 1 << 10,
    /// Match the end half of an ASCII-only word boundary. That is, this
    /// matches a position at either the end of the haystack or where the
    /// following character is not a word character.
    WordEndHalf = 1 << 11,
}

impl Look {
    /// Returns true if the given position in the given haystack matches this
    /// look-around assertion.
    pub(crate) fn is_match(&self, haystack: &[u8], at: usize) -> bool {
        use self::Look::*;
        match *self {
            Start => at == 0,
            End => at == haystack.len(),
            StartLF => at == 0 || haystack[at - 1] == b'\n',
            EndLF => at == haystack.len() || haystack[at] == b'\n',
            StartCRLF => {
                at == 0
                    || haystack[at - 1] == b'\n'
                    || (haystack[at - 1] == b'\r'
                        && (at >= haystack.len() || haystack[at] != b'\n'))
            }
            EndCRLF => {
                at == haystack.len()
                    || haystack[at] == b'\r'
                    || (haystack[at] == b'\n'
                        && (at == 0 || haystack[at - 1] != b'\r'))
            }
            Word => {
                let word_before =
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
                let word_after =
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
                word_before != word_after
            }
            WordNegate => {
                let word_before =
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
                let word_after =
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
                word_before == word_after
            }
            WordStart => {
                let word_before =
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
                let word_after =
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
                !word_before && word_after
            }
            WordEnd => {
                let word_before =
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
                let word_after =
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
                word_before && !word_after
            }
            WordStartHalf => {
                let word_before =
                    at > 0 && utf8::is_word_byte(haystack[at - 1]);
                !word_before
            }
            WordEndHalf => {
                let word_after =
                    at < haystack.len() && utf8::is_word_byte(haystack[at]);
                !word_after
            }
        }
    }
}
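// Editor-added illustration (not part of the upstream crate) of the ASCII
// word boundary assertions above, evaluated at explicit positions.
#[cfg(test)]
mod example_look_tests {
    use super::*;

    #[test]
    fn word_boundaries() {
        let hay = b"ab cd";
        assert!(Look::Word.is_match(hay, 0)); // start, before 'a'
        assert!(Look::Word.is_match(hay, 2)); // between 'b' and ' '
        assert!(!Look::Word.is_match(hay, 1)); // between 'a' and 'b'
        assert!(Look::WordStart.is_match(hay, 3)); // before 'c'
        assert!(Look::WordEnd.is_match(hay, 5)); // end, after 'd'
    }
}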
/// The high-level intermediate representation of a repetition operator.
///
/// A repetition operator permits the repetition of an arbitrary
/// sub-expression.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Repetition {
    /// The minimum range of the repetition.
    ///
    /// Note that special cases like `?`, `+` and `*` all get translated into
    /// the ranges `{0,1}`, `{1,}` and `{0,}`, respectively.
    ///
    /// When `min` is zero, this expression can match the empty string
    /// regardless of what its sub-expression is.
    pub(crate) min: u32,
    /// The maximum range of the repetition.
    ///
    /// Note that when `max` is `None`, `min` acts as a lower bound but where
    /// there is no upper bound. For something like `x{5}` where the min and
    /// max are equivalent, `min` will be set to `5` and `max` will be set to
    /// `Some(5)`.
    pub(crate) max: Option<u32>,
    /// Whether this repetition operator is greedy or not. A greedy operator
    /// will match as much as it can. A non-greedy operator will match as
    /// little as it can.
    ///
    /// Typically, operators are greedy by default and are only non-greedy
    /// when a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?`
    /// is not. However, this can be inverted via the `U` "ungreedy" flag.
    pub(crate) greedy: bool,
    /// The expression being repeated.
    pub(crate) sub: Box<Hir>,
}

/// The high-level intermediate representation for a capturing group.
///
/// A capturing group always has an index and a child expression. It may
/// also have a name associated with it (e.g., `(?P<foo>\w)`), but it's not
/// necessary.
///
/// Note that there is no explicit representation of a non-capturing group
/// in a `Hir`. Instead, non-capturing grouping is handled automatically by
/// the recursive structure of the `Hir` itself.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct Capture {
    /// The capture index of the capture.
    pub(crate) index: u32,
    /// The name of the capture, if it exists.
    pub(crate) name: Option<Box<str>>,
    /// The expression inside the capturing group, which may be empty.
    pub(crate) sub: Box<Hir>,
}

fn next_char(ch: char) -> Option<char> {
    // Skip over the surrogate range.
    if ch == '\u{D7FF}' {
        return Some('\u{E000}');
    }
    // OK because char::MAX < u32::MAX and we handle U+D7FF above.
    char::from_u32(u32::from(ch).checked_add(1).unwrap())
}

fn prev_char(ch: char) -> Option<char> {
    // Skip over the surrogate range.
    if ch == '\u{E000}' {
        return Some('\u{D7FF}');
    }
    // OK because subtracting 1 from any valid scalar value other than 0
    // and U+E000 yields a valid scalar value.
    Some(char::from_u32(u32::from(ch).checked_sub(1)?).unwrap())
}

impl Drop for Hir {
    fn drop(&mut self) {
        use core::mem;

        match *self.kind() {
            HirKind::Empty
            | HirKind::Char(_)
            | HirKind::Class(_)
            | HirKind::Look(_) => return,
            HirKind::Capture(ref x) if x.sub.kind.subs().is_empty() => return,
            HirKind::Repetition(ref x) if x.sub.kind.subs().is_empty() => {
                return
            }
            HirKind::Concat(ref x) if x.is_empty() => return,
            HirKind::Alternation(ref x) if x.is_empty() => return,
            _ => {}
        }

        let mut stack = vec![mem::replace(self, Hir::empty())];
        while let Some(mut expr) = stack.pop() {
            match expr.kind {
                HirKind::Empty
                | HirKind::Char(_)
                | HirKind::Class(_)
                | HirKind::Look(_) => {}
                HirKind::Capture(ref mut x) => {
                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
                }
                HirKind::Repetition(ref mut x) => {
                    stack.push(mem::replace(&mut x.sub, Hir::empty()));
                }
                HirKind::Concat(ref mut x) => {
                    stack.extend(x.drain(..));
                }
                HirKind::Alternation(ref mut x) => {
                    stack.extend(x.drain(..));
                }
            }
        }
    }
}
regex-lite-0.1.6/src/hir/parse.rs000064400000000000000000002412551046102023000147360ustar 00000000000000use core::cell::{Cell, RefCell};

use alloc::{
    boxed::Box,
    string::{String, ToString},
    vec,
    vec::Vec,
};

use crate::{
    error::Error,
    hir::{self, Config, Flags, Hir, HirKind},
};

// These are all of the errors that can occur while parsing a regex. Unlike
// regex-syntax, our errors are not particularly great. They are just enough
// to get a general sense of what went wrong. But in exchange, the error
// reporting mechanism is *much* simpler than what's in regex-syntax.
//
// By convention, we use each of these messages in exactly one place. That
// way, every branch that leads to an error has a unique message. This in turn
// means that given a message, one can precisely identify which part of the
// parser reported it.
//
// Finally, we give names to each message so that we can reference them in
// tests.
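// Editor-added illustration (not part of the upstream crate) of the
// convention described above: since each message is used in exactly one
// branch, a test can pin down precisely which branch fired by comparing
// against the named constant (defined below).
#[cfg(test)]
mod example_error_message_tests {
    use super::*;

    #[test]
    fn unclosed_group_reports_its_unique_message() {
        let err = Parser::new(Config::default(), "(a").parse().unwrap_err();
        assert_eq!(Error::new(ERR_UNCLOSED_GROUP), err);
    }
}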
const ERR_TOO_MUCH_NESTING: &str = "pattern has too much nesting";
const ERR_TOO_MANY_CAPTURES: &str = "too many capture groups";
const ERR_DUPLICATE_CAPTURE_NAME: &str = "duplicate capture group name";
const ERR_UNCLOSED_GROUP: &str = "found open group without closing ')'";
const ERR_UNCLOSED_GROUP_QUESTION: &str =
    "expected closing ')', but got end of pattern";
const ERR_UNOPENED_GROUP: &str = "found closing ')' without matching '('";
const ERR_LOOK_UNSUPPORTED: &str = "look-around is not supported";
const ERR_EMPTY_FLAGS: &str = "empty flag directive '(?)' is not allowed";
const ERR_MISSING_GROUP_NAME: &str =
    "expected capture group name, but got end of pattern";
const ERR_INVALID_GROUP_NAME: &str = "invalid group name";
const ERR_UNCLOSED_GROUP_NAME: &str =
    "expected end of capture group name, but got end of pattern";
const ERR_EMPTY_GROUP_NAME: &str = "empty capture group names are not allowed";
const ERR_FLAG_UNRECOGNIZED: &str = "unrecognized inline flag";
const ERR_FLAG_REPEATED_NEGATION: &str =
    "inline flag negation cannot be repeated";
const ERR_FLAG_DUPLICATE: &str = "duplicate inline flag is not allowed";
const ERR_FLAG_UNEXPECTED_EOF: &str =
    "expected ':' or ')' to end inline flags, but got end of pattern";
const ERR_FLAG_DANGLING_NEGATION: &str =
    "inline flags cannot end with negation directive";
const ERR_DECIMAL_NO_DIGITS: &str =
    "expected decimal number, but found no digits";
const ERR_DECIMAL_INVALID: &str = "got invalid decimal number";
const ERR_HEX_BRACE_INVALID_DIGIT: &str =
    "expected hexadecimal number in braces, but got non-hex digit";
const ERR_HEX_BRACE_UNEXPECTED_EOF: &str =
    "expected hexadecimal number, but saw end of pattern before closing brace";
const ERR_HEX_BRACE_EMPTY: &str =
    "expected hexadecimal number in braces, but got no digits";
const ERR_HEX_BRACE_INVALID: &str =
    "got invalid hexadecimal number in braces";
const ERR_HEX_FIXED_UNEXPECTED_EOF: &str =
    "expected fixed length hexadecimal number, but saw end of pattern first";
const ERR_HEX_FIXED_INVALID_DIGIT: &str =
    "expected fixed length hexadecimal number, but got non-hex digit";
const ERR_HEX_FIXED_INVALID: &str =
    "got invalid fixed length hexadecimal number";
const ERR_HEX_UNEXPECTED_EOF: &str =
    "expected hexadecimal number, but saw end of pattern first";
const ERR_ESCAPE_UNEXPECTED_EOF: &str =
    "saw start of escape sequence, but saw end of pattern before it finished";
const ERR_BACKREF_UNSUPPORTED: &str = "backreferences are not supported";
const ERR_UNICODE_CLASS_UNSUPPORTED: &str =
    "Unicode character classes are not supported";
const ERR_ESCAPE_UNRECOGNIZED: &str = "unrecognized escape sequence";
const ERR_POSIX_CLASS_UNRECOGNIZED: &str =
    "unrecognized POSIX character class";
const ERR_UNCOUNTED_REP_SUB_MISSING: &str =
    "uncounted repetition operator must be applied to a sub-expression";
const ERR_COUNTED_REP_SUB_MISSING: &str =
    "counted repetition operator must be applied to a sub-expression";
const ERR_COUNTED_REP_UNCLOSED: &str =
    "found unclosed counted repetition operator";
const ERR_COUNTED_REP_MIN_UNCLOSED: &str =
    "found incomplete and unclosed counted repetition operator";
const ERR_COUNTED_REP_COMMA_UNCLOSED: &str =
    "found counted repetition operator with a comma that is unclosed";
const ERR_COUNTED_REP_MIN_MAX_UNCLOSED: &str =
    "found counted repetition with min and max that is unclosed";
const ERR_COUNTED_REP_INVALID: &str =
    "expected closing brace for counted repetition, but got something else";
const ERR_COUNTED_REP_INVALID_RANGE: &str =
    "found counted repetition with a min bigger than its max";
its max"; const ERR_CLASS_UNCLOSED_AFTER_ITEM: &str = "non-empty character class has no closing bracket"; const ERR_CLASS_INVALID_RANGE_ITEM: &str = "character class ranges must start and end with a single character"; const ERR_CLASS_INVALID_ITEM: &str = "invalid escape sequence in character class"; const ERR_CLASS_UNCLOSED_AFTER_DASH: &str = "non-empty character class has no closing bracket after dash"; const ERR_CLASS_UNCLOSED_AFTER_NEGATION: &str = "negated character class has no closing bracket"; const ERR_CLASS_UNCLOSED_AFTER_CLOSING: &str = "character class begins with literal ']' but has no closing bracket"; const ERR_CLASS_INVALID_RANGE: &str = "invalid range in character class"; const ERR_CLASS_UNCLOSED: &str = "found unclosed character class"; const ERR_CLASS_NEST_UNSUPPORTED: &str = "nested character classes are not supported"; const ERR_CLASS_INTERSECTION_UNSUPPORTED: &str = "character class intersection is not supported"; const ERR_CLASS_DIFFERENCE_UNSUPPORTED: &str = "character class difference is not supported"; const ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED: &str = "character class symmetric difference is not supported"; const ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED: &str = "special word boundary assertion is unclosed or has an invalid character"; const ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED: &str = "special word boundary assertion is unrecognized"; const ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF: &str = "found start of special word boundary or repetition without an end"; /// A regular expression parser. /// /// This parses a string representation of a regular expression into an /// abstract syntax tree. The size of the tree is proportional to the length /// of the regular expression pattern. /// /// A `Parser` can be configured in more detail via a [`ParserBuilder`]. #[derive(Clone, Debug)] pub(super) struct Parser<'a> { /// The configuration of the parser as given by the caller. config: Config, /// The pattern we're parsing as given by the caller. pattern: &'a str, /// The call depth of the parser. This is incremented for each /// sub-expression parsed. Its peak value is the maximum nesting of the /// pattern. depth: Cell, /// The current position of the parser. pos: Cell, /// The current codepoint of the parser. The codepoint corresponds to the /// codepoint encoded in `pattern` beginning at `pos`. /// /// This is `None` if and only if `pos == pattern.len()`. char: Cell>, /// The current capture index. capture_index: Cell, /// The flags that are currently set. flags: RefCell, /// A sorted sequence of capture names. This is used to detect duplicate /// capture names and report an error if one is detected. capture_names: RefCell>, } /// The constructor and a variety of helper routines. impl<'a> Parser<'a> { /// Build a parser from this configuration with the given pattern. pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> { Parser { config, pattern, depth: Cell::new(0), pos: Cell::new(0), char: Cell::new(pattern.chars().next()), capture_index: Cell::new(0), flags: RefCell::new(config.flags), capture_names: RefCell::new(vec![]), } } /// Returns the full pattern string that we're parsing. fn pattern(&self) -> &str { self.pattern } /// Return the current byte offset of the parser. /// /// The offset starts at `0` from the beginning of the regular expression /// pattern string. fn pos(&self) -> usize { self.pos.get() } /// Increments the call depth of the parser. /// /// If the call depth would exceed the configured nest limit, then this /// returns an error. 
/// The constructor and a variety of helper routines.
impl<'a> Parser<'a> {
    /// Build a parser from this configuration with the given pattern.
    pub(super) fn new(config: Config, pattern: &'a str) -> Parser<'a> {
        Parser {
            config,
            pattern,
            depth: Cell::new(0),
            pos: Cell::new(0),
            char: Cell::new(pattern.chars().next()),
            capture_index: Cell::new(0),
            flags: RefCell::new(config.flags),
            capture_names: RefCell::new(vec![]),
        }
    }

    /// Returns the full pattern string that we're parsing.
    fn pattern(&self) -> &str {
        self.pattern
    }

    /// Return the current byte offset of the parser.
    ///
    /// The offset starts at `0` from the beginning of the regular expression
    /// pattern string.
    fn pos(&self) -> usize {
        self.pos.get()
    }

    /// Increments the call depth of the parser.
    ///
    /// If the call depth would exceed the configured nest limit, then this
    /// returns an error.
    ///
    /// This returns the old depth.
    fn increment_depth(&self) -> Result<u32, Error> {
        let old = self.depth.get();
        if old > self.config.nest_limit {
            return Err(Error::new(ERR_TOO_MUCH_NESTING));
        }
        // OK because our depth starts at 0, and we return an error if it
        // ever reaches the limit. So the call depth can never exceed
        // u32::MAX.
        let new = old.checked_add(1).unwrap();
        self.depth.set(new);
        Ok(old)
    }

    /// Decrements the call depth of the parser.
    ///
    /// This panics if the current depth is 0.
    fn decrement_depth(&self) {
        let old = self.depth.get();
        // If this fails then the caller has a bug in how they're
        // incrementing and decrementing the depth of the parser's call
        // stack.
        let new = old.checked_sub(1).unwrap();
        self.depth.set(new);
    }

    /// Return the codepoint at the current position of the parser.
    ///
    /// This panics if the parser is positioned at the end of the pattern.
    fn char(&self) -> char {
        self.char.get().expect("codepoint, but parser is done")
    }

    /// Returns true if the next call to `bump` would return false.
    fn is_done(&self) -> bool {
        self.pos() == self.pattern.len()
    }

    /// Returns the flags that are currently set for this regex.
    fn flags(&self) -> Flags {
        *self.flags.borrow()
    }

    /// Bump the parser to the next Unicode scalar value.
    ///
    /// If the end of the input has been reached, then `false` is returned.
    fn bump(&self) -> bool {
        if self.is_done() {
            return false;
        }
        self.pos.set(self.pos() + self.char().len_utf8());
        self.char.set(self.pattern()[self.pos()..].chars().next());
        self.char.get().is_some()
    }

    /// If the substring starting at the current position of the parser has
    /// the given prefix, then bump the parser to the character immediately
    /// following the prefix and return true. Otherwise, don't bump the
    /// parser and return false.
    fn bump_if(&self, prefix: &str) -> bool {
        if self.pattern()[self.pos()..].starts_with(prefix) {
            for _ in 0..prefix.chars().count() {
                self.bump();
            }
            true
        } else {
            false
        }
    }

    /// Bump the parser, and if the `x` flag is enabled, bump through any
    /// subsequent spaces. Return true if and only if the parser is not done.
    fn bump_and_bump_space(&self) -> bool {
        if !self.bump() {
            return false;
        }
        self.bump_space();
        !self.is_done()
    }

    /// If the `x` flag is enabled (i.e., whitespace insensitivity with
    /// comments), then this will advance the parser through all whitespace
    /// and comments to the next non-whitespace non-comment byte.
    ///
    /// If the `x` flag is disabled, then this is a no-op.
    ///
    /// This should be used selectively throughout the parser where
    /// arbitrary whitespace is permitted when the `x` flag is enabled. For
    /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
    fn bump_space(&self) {
        if !self.flags().ignore_whitespace {
            return;
        }
        while !self.is_done() {
            if self.char().is_whitespace() {
                self.bump();
            } else if self.char() == '#' {
                self.bump();
                while !self.is_done() {
                    let c = self.char();
                    self.bump();
                    if c == '\n' {
                        break;
                    }
                }
            } else {
                break;
            }
        }
    }

    /// Peek at the next character in the input without advancing the parser.
    ///
    /// If the input has been exhausted, then this returns `None`.
    fn peek(&self) -> Option<char> {
        if self.is_done() {
            return None;
        }
        self.pattern()[self.pos() + self.char().len_utf8()..].chars().next()
    }
    /// Peeks at the next character in the pattern from the current offset,
    /// and will ignore spaces when the parser is in whitespace insensitive
    /// mode.
    fn peek_space(&self) -> Option<char> {
        if !self.flags().ignore_whitespace {
            return self.peek();
        }
        if self.is_done() {
            return None;
        }
        let mut start = self.pos() + self.char().len_utf8();
        let mut in_comment = false;
        for (i, ch) in self.pattern()[start..].char_indices() {
            if ch.is_whitespace() {
                continue;
            } else if !in_comment && ch == '#' {
                in_comment = true;
            } else if in_comment && ch == '\n' {
                in_comment = false;
            } else {
                start += i;
                break;
            }
        }
        self.pattern()[start..].chars().next()
    }

    /// Return the next capturing index. Each subsequent call increments the
    /// internal index. Since the way capture indices are computed is a
    /// public API guarantee, use of this routine depends on the parser being
    /// depth first and left-to-right.
    ///
    /// If the capture limit is exceeded, then an error is returned.
    fn next_capture_index(&self) -> Result<u32, Error> {
        let current = self.capture_index.get();
        let next = current
            .checked_add(1)
            .ok_or_else(|| Error::new(ERR_TOO_MANY_CAPTURES))?;
        self.capture_index.set(next);
        Ok(next)
    }

    /// Adds the given capture name to this parser. If this capture name has
    /// already been used, then an error is returned.
    fn add_capture_name(&self, name: &str) -> Result<(), Error> {
        let mut names = self.capture_names.borrow_mut();
        match names.binary_search_by(|n| name.cmp(n)) {
            Ok(_) => Err(Error::new(ERR_DUPLICATE_CAPTURE_NAME)),
            Err(i) => {
                names.insert(i, name.to_string());
                Ok(())
            }
        }
    }

    /// Returns true if and only if the parser is positioned at a look-around
    /// prefix. The conditions under which this returns true must always
    /// correspond to a regular expression that would otherwise be considered
    /// invalid.
    ///
    /// This should only be called immediately after parsing the opening of
    /// a group or a set of flags.
    fn is_lookaround_prefix(&self) -> bool {
        self.bump_if("?=")
            || self.bump_if("?!")
            || self.bump_if("?<=")
            || self.bump_if("?<!")
    }
}
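// Editor-added illustration (not part of the upstream crate). It assumes
// that `parse_group` uses `is_lookaround_prefix` above to reject all four
// look-around prefixes with ERR_LOOK_UNSUPPORTED.
#[cfg(test)]
mod example_lookaround_tests {
    use super::*;

    #[test]
    fn lookaround_is_rejected() {
        for pattern in ["(?=a)", "(?!a)", "(?<=a)", "(?<!a)"] {
            let err =
                Parser::new(Config::default(), pattern).parse().unwrap_err();
            assert_eq!(Error::new(ERR_LOOK_UNSUPPORTED), err);
        }
    }
}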
                        *self.flags.borrow_mut() = oldflags;
                    }
                    if self.char.get() != Some(')') {
                        return Err(Error::new(ERR_UNCLOSED_GROUP));
                    }
                    self.bump();
                }
                ')' => {
                    if depth == 0 {
                        return Err(Error::new(ERR_UNOPENED_GROUP));
                    }
                    break;
                }
                '|' => {
                    alternates.push(Hir::concat(core::mem::take(&mut concat)));
                    self.bump();
                }
                '[' => concat.push(self.parse_class()?),
                '?' | '*' | '+' => {
                    concat = self.parse_uncounted_repetition(concat)?;
                }
                '{' => {
                    concat = self.parse_counted_repetition(concat)?;
                }
                _ => concat.push(self.parse_primitive()?),
            }
        }
        self.decrement_depth();
        alternates.push(Hir::concat(concat));
        // N.B. This strips off the "alternation" if there's only one branch.
        Ok(Hir::alternation(alternates))
    }

    /// Parses a "primitive" pattern. A primitive is any expression that does
    /// not contain any sub-expressions.
    ///
    /// This assumes the parser is pointing at the beginning of the primitive.
    fn parse_primitive(&self) -> Result<Hir, Error> {
        let ch = self.char();
        self.bump();
        match ch {
            '\\' => self.parse_escape(),
            '.' => Ok(self.hir_dot()),
            '^' => Ok(self.hir_anchor_start()),
            '$' => Ok(self.hir_anchor_end()),
            ch => Ok(self.hir_char(ch)),
        }
    }

    /// Parse an escape sequence. This always results in a "primitive" HIR,
    /// that is, an HIR with no sub-expressions.
    ///
    /// This assumes the parser is positioned at the start of the sequence,
    /// immediately *after* the `\`. It advances the parser to the first
    /// position immediately following the escape sequence.
    fn parse_escape(&self) -> Result<Hir, Error> {
        if self.is_done() {
            return Err(Error::new(ERR_ESCAPE_UNEXPECTED_EOF));
        }
        let ch = self.char();
        // Put some of the more complicated routines into helpers.
        match ch {
            '0'..='9' => return Err(Error::new(ERR_BACKREF_UNSUPPORTED)),
            'p' | 'P' => {
                return Err(Error::new(ERR_UNICODE_CLASS_UNSUPPORTED))
            }
            'x' | 'u' | 'U' => return self.parse_hex(),
            'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
                return Ok(self.parse_perl_class());
            }
            _ => {}
        }
        // Handle all of the one letter sequences inline.
        self.bump();
        if hir::is_meta_character(ch) || hir::is_escapeable_character(ch) {
            return Ok(self.hir_char(ch));
        }
        let special = |ch| Ok(self.hir_char(ch));
        match ch {
            'a' => special('\x07'),
            'f' => special('\x0C'),
            't' => special('\t'),
            'n' => special('\n'),
            'r' => special('\r'),
            'v' => special('\x0B'),
            'A' => Ok(Hir::look(hir::Look::Start)),
            'z' => Ok(Hir::look(hir::Look::End)),
            'b' => {
                let mut hir = Hir::look(hir::Look::Word);
                if !self.is_done() && self.char() == '{' {
                    if let Some(special) =
                        self.maybe_parse_special_word_boundary()?
                    {
                        hir = special;
                    }
                }
                Ok(hir)
            }
            'B' => Ok(Hir::look(hir::Look::WordNegate)),
            '<' => Ok(Hir::look(hir::Look::WordStart)),
            '>' => Ok(Hir::look(hir::Look::WordEnd)),
            _ => Err(Error::new(ERR_ESCAPE_UNRECOGNIZED)),
        }
    }
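    // For example, in the style of the tests at the bottom of this file
    // (`p` parses successfully, `perr` returns the error message):
    //
    //     assert_eq!(p(r"\t"), Hir::char('\t'));             // special escape
    //     assert_eq!(p(r"\A"), Hir::look(hir::Look::Start)); // anchor
    //     assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1"));  // backreference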
    /// Attempt to parse a specialty word boundary. That is, `\b{start}`,
    /// `\b{end}`, `\b{start-half}` or `\b{end-half}`.
    ///
    /// This is similar to `maybe_parse_ascii_class` in that, in most cases,
    /// if it fails it will just return `None` with no error. This is done
    /// because `\b{5}` is a valid expression and we want to let that be
    /// parsed by the existing counted repetition parsing code. (I thought
    /// about just invoking the counted repetition code from here, but it
    /// seemed a little ham-fisted.)
    ///
    /// Unlike `maybe_parse_ascii_class` though, this can return an error.
    /// Namely, if we definitely know it isn't a counted repetition, then we
    /// return an error specific to the specialty word boundaries.
    ///
    /// This assumes the parser is positioned at a `{` immediately following
    /// a `\b`. When `None` is returned, the parser is returned to the
    /// position at which it started: pointing at a `{`.
    ///
    /// The position given should correspond to the start of the `\b`.
    fn maybe_parse_special_word_boundary(
        &self,
    ) -> Result<Option<Hir>, Error> {
        assert_eq!(self.char(), '{');

        let is_valid_char = |c| match c {
            'A'..='Z' | 'a'..='z' | '-' => true,
            _ => false,
        };
        let start = self.pos();
        if !self.bump_and_bump_space() {
            return Err(Error::new(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF));
        }
        // This is one of the critical bits: if the first non-whitespace
        // character isn't in [-A-Za-z] (i.e., this can't be a special word
        // boundary), then we bail and let the counted repetition parser deal
        // with this.
        if !is_valid_char(self.char()) {
            self.pos.set(start);
            self.char.set(Some('{'));
            return Ok(None);
        }

        // Now collect up our chars until we see a '}'.
        let mut scratch = String::new();
        while !self.is_done() && is_valid_char(self.char()) {
            scratch.push(self.char());
            self.bump_and_bump_space();
        }
        if self.is_done() || self.char() != '}' {
            return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED));
        }
        self.bump();
        let kind = match scratch.as_str() {
            "start" => hir::Look::WordStart,
            "end" => hir::Look::WordEnd,
            "start-half" => hir::Look::WordStartHalf,
            "end-half" => hir::Look::WordEndHalf,
            _ => {
                return Err(Error::new(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED))
            }
        };
        Ok(Some(Hir::look(kind)))
    }

    /// Parse a hex representation of a Unicode codepoint. This handles both
    /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
    /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
    /// the first character immediately following the hexadecimal literal.
    fn parse_hex(&self) -> Result<Hir, Error> {
        let digit_len = match self.char() {
            'x' => 2,
            'u' => 4,
            'U' => 8,
            unk => unreachable!(
                "invalid start of fixed length hexadecimal number {}",
                unk
            ),
        };
        if !self.bump_and_bump_space() {
            return Err(Error::new(ERR_HEX_UNEXPECTED_EOF));
        }
        if self.char() == '{' {
            self.parse_hex_brace()
        } else {
            self.parse_hex_digits(digit_len)
        }
    }

    /// Parse an N-digit hex representation of a Unicode codepoint. This
    /// expects the parser to be positioned at the first digit and will
    /// advance the parser to the first character immediately following the
    /// escape sequence.
    ///
    /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
    /// or 8 (for `\UNNNNNNNN`).
    fn parse_hex_digits(&self, digit_len: usize) -> Result<Hir, Error> {
        let mut scratch = String::new();
        for i in 0..digit_len {
            if i > 0 && !self.bump_and_bump_space() {
                return Err(Error::new(ERR_HEX_FIXED_UNEXPECTED_EOF));
            }
            if !is_hex(self.char()) {
                return Err(Error::new(ERR_HEX_FIXED_INVALID_DIGIT));
            }
            scratch.push(self.char());
        }
        // The final bump just moves the parser past the literal, which may
        // be EOF.
        self.bump_and_bump_space();
        match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32)
        {
            None => Err(Error::new(ERR_HEX_FIXED_INVALID)),
            Some(ch) => Ok(self.hir_char(ch)),
        }
    }
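    // For example, both hex notations produce plain character HIRs
    // (mirroring the `ok_hex` test below):
    //
    //     assert_eq!(p(r"\x41"), Hir::char('A'));       // 2 fixed digits
    //     assert_eq!(p(r"\u2603"), Hir::char('☃'));     // 4 fixed digits
    //     assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩')); // braced, any length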
    /// Parse a hex representation of any Unicode scalar value. This expects
    /// the parser to be positioned at the opening brace `{` and will advance
    /// the parser to the first character following the closing brace `}`.
    fn parse_hex_brace(&self) -> Result<Hir, Error> {
        let mut scratch = String::new();
        while self.bump_and_bump_space() && self.char() != '}' {
            if !is_hex(self.char()) {
                return Err(Error::new(ERR_HEX_BRACE_INVALID_DIGIT));
            }
            scratch.push(self.char());
        }
        if self.is_done() {
            return Err(Error::new(ERR_HEX_BRACE_UNEXPECTED_EOF));
        }
        assert_eq!(self.char(), '}');
        self.bump_and_bump_space();

        if scratch.is_empty() {
            return Err(Error::new(ERR_HEX_BRACE_EMPTY));
        }
        match u32::from_str_radix(&scratch, 16).ok().and_then(char::from_u32)
        {
            None => Err(Error::new(ERR_HEX_BRACE_INVALID)),
            Some(ch) => Ok(self.hir_char(ch)),
        }
    }

    /// Parse a decimal number into a u32 while trimming leading and trailing
    /// whitespace.
    ///
    /// This expects the parser to be positioned at the first position where
    /// a decimal digit could occur. This will advance the parser to the byte
    /// immediately following the last contiguous decimal digit.
    ///
    /// If no decimal digit could be found or if there was a problem parsing
    /// the complete set of digits into a u32, then an error is returned.
    fn parse_decimal(&self) -> Result<u32, Error> {
        let mut scratch = String::new();
        while !self.is_done() && self.char().is_whitespace() {
            self.bump();
        }
        while !self.is_done() && '0' <= self.char() && self.char() <= '9' {
            scratch.push(self.char());
            self.bump_and_bump_space();
        }
        while !self.is_done() && self.char().is_whitespace() {
            self.bump_and_bump_space();
        }
        let digits = scratch.as_str();
        if digits.is_empty() {
            return Err(Error::new(ERR_DECIMAL_NO_DIGITS));
        }
        match u32::from_str_radix(digits, 10).ok() {
            Some(n) => Ok(n),
            None => Err(Error::new(ERR_DECIMAL_INVALID)),
        }
    }

    /// Parses an uncounted repetition operator. An uncounted repetition
    /// operator includes `?`, `*` and `+`, but does not include the `{m,n}`
    /// syntax. The current character should be one of `?`, `*` or `+`. Any
    /// other character will result in a panic.
    ///
    /// This assumes that the parser is currently positioned at the repetition
    /// operator and advances the parser to the first character after the
    /// operator. (Note that the operator may include a single additional `?`,
    /// which makes the operator ungreedy.)
    ///
    /// The caller should include the concatenation that is being built. The
    /// concatenation returned includes the repetition operator applied to the
    /// last expression in the given concatenation.
    ///
    /// If the concatenation is empty, then this returns an error.
    fn parse_uncounted_repetition(
        &self,
        mut concat: Vec<Hir>,
    ) -> Result<Vec<Hir>, Error> {
        let sub = match concat.pop() {
            Some(hir) => Box::new(hir),
            None => {
                return Err(Error::new(ERR_UNCOUNTED_REP_SUB_MISSING));
            }
        };
        let (min, max) = match self.char() {
            '?' => (0, Some(1)),
            '*' => (0, None),
            '+' => (1, None),
            unk => unreachable!("unrecognized repetition operator '{}'", unk),
        };
        let mut greedy = true;
        if self.bump() && self.char() == '?' {
            greedy = false;
            self.bump();
        }
        if self.flags().swap_greed {
            greedy = !greedy;
        }
        concat.push(Hir::repetition(hir::Repetition {
            min,
            max,
            greedy,
            sub,
        }));
        Ok(concat)
    }
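    // For example, `a+?` pops `Hir::char('a')` off the concatenation and
    // pushes back this repetition (mirroring `ok_uncounted_repetition`
    // below):
    //
    //     Hir::repetition(hir::Repetition {
    //         min: 1,
    //         max: None,
    //         greedy: false, // the trailing '?' makes the operator lazy
    //         sub: Box::new(Hir::char('a')),
    //     })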
    /// Parses a counted repetition operation. A counted repetition operator
    /// corresponds to the `{m,n}` syntax, and does not include the `?`, `*`
    /// or `+` operators.
    ///
    /// This assumes that the parser is currently at the opening `{` and
    /// advances the parser to the first character after the operator. (Note
    /// that the operator may include a single additional `?`, which makes the
    /// operator ungreedy.)
    ///
    /// The caller should include the concatenation that is being built. The
    /// concatenation returned includes the repetition operator applied to the
    /// last expression in the given concatenation.
    ///
    /// If the concatenation is empty, then this returns an error.
    fn parse_counted_repetition(
        &self,
        mut concat: Vec<Hir>,
    ) -> Result<Vec<Hir>, Error> {
        assert_eq!(self.char(), '{', "expected opening brace");
        let sub = match concat.pop() {
            Some(hir) => Box::new(hir),
            None => {
                return Err(Error::new(ERR_COUNTED_REP_SUB_MISSING));
            }
        };
        if !self.bump_and_bump_space() {
            return Err(Error::new(ERR_COUNTED_REP_UNCLOSED));
        }
        let min = self.parse_decimal()?;
        let mut max = Some(min);
        if self.is_done() {
            return Err(Error::new(ERR_COUNTED_REP_MIN_UNCLOSED));
        }
        if self.char() == ',' {
            if !self.bump_and_bump_space() {
                return Err(Error::new(ERR_COUNTED_REP_COMMA_UNCLOSED));
            }
            if self.char() != '}' {
                max = Some(self.parse_decimal()?);
            } else {
                max = None;
            }
            if self.is_done() {
                return Err(Error::new(ERR_COUNTED_REP_MIN_MAX_UNCLOSED));
            }
        }
        if self.char() != '}' {
            return Err(Error::new(ERR_COUNTED_REP_INVALID));
        }
        let mut greedy = true;
        if self.bump_and_bump_space() && self.char() == '?' {
            greedy = false;
            self.bump();
        }
        if self.flags().swap_greed {
            greedy = !greedy;
        }
        if max.map_or(false, |max| min > max) {
            return Err(Error::new(ERR_COUNTED_REP_INVALID_RANGE));
        }
        concat.push(Hir::repetition(hir::Repetition {
            min,
            max,
            greedy,
            sub,
        }));
        Ok(concat)
    }

    /// Parses the part of a pattern that starts with a `(`. This is usually
    /// a group sub-expression, but might just be a directive that enables
    /// (or disables) certain flags.
    ///
    /// This assumes the parser is pointing at the opening `(`.
    fn parse_group(&self) -> Result<Option<Hir>, Error> {
        assert_eq!(self.char(), '(');
        self.bump_and_bump_space();
        if self.is_lookaround_prefix() {
            return Err(Error::new(ERR_LOOK_UNSUPPORTED));
        }
        if self.bump_if("?P<") || self.bump_if("?<") {
            let index = self.next_capture_index()?;
            let name = Some(Box::from(self.parse_capture_name()?));
            let sub = Box::new(self.parse_inner()?);
            let cap = hir::Capture { index, name, sub };
            Ok(Some(Hir::capture(cap)))
        } else if self.bump_if("?") {
            if self.is_done() {
                return Err(Error::new(ERR_UNCLOSED_GROUP_QUESTION));
            }
            let start = self.pos();
            // The flags get reset in the top-level 'parse' routine.
            *self.flags.borrow_mut() = self.parse_flags()?;
            let consumed = self.pos() - start;
            if self.char() == ')' {
                // We don't allow empty flags, e.g., `(?)`.
                if consumed == 0 {
                    return Err(Error::new(ERR_EMPTY_FLAGS));
                }
                Ok(None)
            } else {
                assert_eq!(':', self.char());
                self.bump();
                self.parse_inner().map(Some)
            }
        } else {
            let index = self.next_capture_index()?;
            let sub = Box::new(self.parse_inner()?);
            let cap = hir::Capture { index, name: None, sub };
            Ok(Some(Hir::capture(cap)))
        }
    }
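    // For example (see `ok_group_named` and `ok_flag_directive` below):
    //
    //     (?<foo>a)  returns Some(..): capture group 1 named "foo"
    //     (?:a)      returns Some(..): a group, but no capture is allocated
    //     (?i)       returns None: a flag directive, not a sub-expression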
    /// Parses a capture group name. Assumes that the parser is positioned at
    /// the first character in the name following the opening `<` (and may
    /// possibly be EOF). This advances the parser to the first character
    /// following the closing `>`.
    fn parse_capture_name(&self) -> Result<&str, Error> {
        if self.is_done() {
            return Err(Error::new(ERR_MISSING_GROUP_NAME));
        }
        let start = self.pos();
        loop {
            if self.char() == '>' {
                break;
            }
            if !is_capture_char(self.char(), self.pos() == start) {
                return Err(Error::new(ERR_INVALID_GROUP_NAME));
            }
            if !self.bump() {
                break;
            }
        }
        let end = self.pos();
        if self.is_done() {
            return Err(Error::new(ERR_UNCLOSED_GROUP_NAME));
        }
        assert_eq!(self.char(), '>');
        self.bump();
        let name = &self.pattern()[start..end];
        if name.is_empty() {
            return Err(Error::new(ERR_EMPTY_GROUP_NAME));
        }
        self.add_capture_name(name)?;
        Ok(name)
    }

    /// Parse a sequence of flags starting at the current character.
    ///
    /// This advances the parser to the character immediately following the
    /// flags, which is guaranteed to be either `:` or `)`.
    ///
    /// # Errors
    ///
    /// If any flags are duplicated, then an error is returned.
    ///
    /// If the negation operator is used more than once, then an error is
    /// returned.
    ///
    /// If no flags could be found or if the negation operation is not
    /// followed by any flags, then an error is returned.
    fn parse_flags(&self) -> Result<Flags, Error> {
        let mut flags = *self.flags.borrow();
        let mut negate = false;
        // Keeps track of whether the previous flag item was a '-'. We use
        // this to detect whether there is a dangling '-', which is invalid.
        let mut last_was_negation = false;
        // A set to keep track of the flags we've seen. Since all flags are
        // ASCII, we only need 128 bytes.
        let mut seen = [false; 128];
        while self.char() != ':' && self.char() != ')' {
            if self.char() == '-' {
                last_was_negation = true;
                if negate {
                    return Err(Error::new(ERR_FLAG_REPEATED_NEGATION));
                }
                negate = true;
            } else {
                last_was_negation = false;
                self.parse_flag(&mut flags, negate)?;
                // OK because every valid flag is ASCII, and we're only here
                // if the flag is valid.
                let flag_byte = u8::try_from(self.char()).unwrap();
                if seen[usize::from(flag_byte)] {
                    return Err(Error::new(ERR_FLAG_DUPLICATE));
                }
                seen[usize::from(flag_byte)] = true;
            }
            if !self.bump() {
                return Err(Error::new(ERR_FLAG_UNEXPECTED_EOF));
            }
        }
        if last_was_negation {
            return Err(Error::new(ERR_FLAG_DANGLING_NEGATION));
        }
        Ok(flags)
    }

    /// Parse the current character as a flag. Do not advance the parser.
    ///
    /// This sets the appropriate boolean value in place on the set of flags
    /// given. The boolean is inverted when `negate` is true.
    ///
    /// # Errors
    ///
    /// If the flag is not recognized, then an error is returned.
    fn parse_flag(
        &self,
        flags: &mut Flags,
        negate: bool,
    ) -> Result<(), Error> {
        let enabled = !negate;
        match self.char() {
            'i' => flags.case_insensitive = enabled,
            'm' => flags.multi_line = enabled,
            's' => flags.dot_matches_new_line = enabled,
            'U' => flags.swap_greed = enabled,
            'R' => flags.crlf = enabled,
            'x' => flags.ignore_whitespace = enabled,
            // We make a special exception for this flag where we let it
            // through as a recognized flag, but treat it as a no-op. This in
            // practice retains some compatibility with the regex crate. It is
            // a little suspect to do this, but for example, '(?-u:\b).+' in
            // the regex crate is equivalent to '\b.+' in regex-lite.
            'u' => {}
            _ => return Err(Error::new(ERR_FLAG_UNRECOGNIZED)),
        }
        Ok(())
    }
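    // For example, in `(?im-s:a)` the parser enables `i` and `m`, flips to
    // negation at `-` and then disables `s`. Malformed flag sequences error
    // out (mirroring `err_standard` below):
    //
    //     assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)"));
    //     assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)"));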
    /// Parse a standard character class consisting primarily of characters
    /// or character ranges.
    ///
    /// This assumes the parser is positioned at the opening `[`. If parsing
    /// is successful, then the parser is advanced to the position immediately
    /// following the closing `]`.
    fn parse_class(&self) -> Result<Hir, Error> {
        assert_eq!(self.char(), '[');

        let mut union = vec![];
        if !self.bump_and_bump_space() {
            return Err(Error::new(ERR_CLASS_UNCLOSED));
        }
        // Determine whether the class is negated or not.
        let negate = if self.char() != '^' {
            false
        } else {
            if !self.bump_and_bump_space() {
                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_NEGATION));
            }
            true
        };
        // Accept any number of `-` as literal `-`.
        while self.char() == '-' {
            union.push(hir::ClassRange { start: '-', end: '-' });
            if !self.bump_and_bump_space() {
                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
            }
        }
        // If `]` is the *first* char in a set, then interpret it as a literal
        // `]`. That is, an empty class is impossible to write.
        if union.is_empty() && self.char() == ']' {
            union.push(hir::ClassRange { start: ']', end: ']' });
            if !self.bump_and_bump_space() {
                return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_CLOSING));
            }
        }
        loop {
            self.bump_space();
            if self.is_done() {
                return Err(Error::new(ERR_CLASS_UNCLOSED));
            }
            match self.char() {
                '[' => {
                    // Attempt to treat this as the beginning of a POSIX
                    // class. If POSIX class parsing fails, then the parser
                    // backs up to `[`.
                    if let Some(class) = self.maybe_parse_posix_class() {
                        union.extend_from_slice(&class.ranges);
                        continue;
                    }
                    // ... otherwise we don't support nested classes.
                    return Err(Error::new(ERR_CLASS_NEST_UNSUPPORTED));
                }
                ']' => {
                    self.bump();
                    let mut class = hir::Class::new(union);
                    // Note that we must apply case folding before negation!
                    // Consider `(?i)[^x]`. If we applied negation first, then
                    // the result would be the character class that matched
                    // any Unicode scalar value.
                    if self.flags().case_insensitive {
                        class.ascii_case_fold();
                    }
                    if negate {
                        class.negate();
                    }
                    return Ok(Hir::class(class));
                }
                '&' if self.peek() == Some('&') => {
                    return Err(Error::new(
                        ERR_CLASS_INTERSECTION_UNSUPPORTED,
                    ));
                }
                '-' if self.peek() == Some('-') => {
                    return Err(Error::new(ERR_CLASS_DIFFERENCE_UNSUPPORTED));
                }
                '~' if self.peek() == Some('~') => {
                    return Err(Error::new(
                        ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED,
                    ));
                }
                _ => self.parse_class_range(&mut union)?,
            }
        }
    }

    /// Parse a single primitive item in a character class set. The item to
    /// be parsed can either be one of a simple literal character, a range
    /// between two simple literal characters or a "primitive" character
    /// class like `\w`.
    ///
    /// If an invalid escape is found, or if a character class is found where
    /// a simple literal is expected (e.g., in a range), then an error is
    /// returned.
    ///
    /// Otherwise, the range (or ranges) are appended to the given union of
    /// ranges.
    fn parse_class_range(
        &self,
        union: &mut Vec<hir::ClassRange>,
    ) -> Result<(), Error> {
        let prim1 = self.parse_class_item()?;
        self.bump_space();
        if self.is_done() {
            return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_ITEM));
        }
        // If the next char isn't a `-`, then we don't have a range.
        // There are two exceptions. If the char after a `-` is a `]`, then
        // `-` is interpreted as a literal `-`. Alternatively, if the char
        // after a `-` is a `-`, then `--` corresponds to a "difference"
        // operation. (Which we don't support in regex-lite, but error about
        // specifically in an effort to be loud about differences between the
        // main regex crate where possible.)
        if self.char() != '-'
            || self.peek_space() == Some(']')
            || self.peek_space() == Some('-')
        {
            union.extend_from_slice(&into_class_item_ranges(prim1)?);
            return Ok(());
        }
        // OK, now we're parsing a range, so bump past the `-` and parse the
        // second half of the range.
        if !self.bump_and_bump_space() {
            return Err(Error::new(ERR_CLASS_UNCLOSED_AFTER_DASH));
        }
        let prim2 = self.parse_class_item()?;
        let range = hir::ClassRange {
            start: into_class_item_range(prim1)?,
            end: into_class_item_range(prim2)?,
        };
        if range.start > range.end {
            return Err(Error::new(ERR_CLASS_INVALID_RANGE));
        }
        union.push(range);
        Ok(())
    }

    /// Parse a single item in a character class as a primitive, where the
    /// primitive either consists of a verbatim literal or a single escape
    /// sequence.
    ///
    /// This assumes the parser is positioned at the beginning of a primitive,
    /// and advances the parser to the first position after the primitive if
    /// successful.
    ///
    /// Note that it is the caller's responsibility to report an error if an
    /// illegal primitive was parsed.
    fn parse_class_item(&self) -> Result<Hir, Error> {
        let ch = self.char();
        self.bump();
        if ch == '\\' {
            self.parse_escape()
        } else {
            Ok(Hir::char(ch))
        }
    }

    /// Attempt to parse a POSIX character class, e.g., `[:alnum:]`.
    ///
    /// This assumes the parser is positioned at the opening `[`.
    ///
    /// If no valid POSIX character class could be found, then this does not
    /// advance the parser and `None` is returned. Otherwise, the parser is
    /// advanced to the first byte following the closing `]` and the
    /// corresponding POSIX class is returned.
    fn maybe_parse_posix_class(&self) -> Option<hir::Class> {
        // POSIX character classes are interesting from a parsing perspective
        // because parsing cannot fail with any interesting error. For
        // example, in order to use a POSIX character class, it must be
        // enclosed in double brackets, e.g., `[[:alnum:]]`. Alternatively,
        // you might think of it as "POSIX character classes have the syntax
        // `[:NAME:]` which can only appear within character brackets." This
        // means that things like `[[:lower:]A]` are legal constructs.
        //
        // However, if one types an incorrect POSIX character class, e.g.,
        // `[[:loower:]]`, then we treat that as if it were a normal nested
        // character class containing the characters `:elorw`. (Which isn't
        // supported and results in an error in regex-lite.) One might argue
        // that we should return an error instead since the repeated colons
        // give away the intent to write a POSIX class. But what if the user
        // typed `[[:lower]]` instead? How can we tell that was intended to
        // be a POSIX class and not just a normal nested class?
        //
        // Reasonable people can probably disagree over this, but for better
        // or worse, we implement semantics that never fail at the expense of
        // better failure modes.
        assert_eq!(self.char(), '[');

        // If parsing fails, then we back up the parser to this starting
        // point.
        let start_pos = self.pos();
        let start_char = self.char.get();
        let reset = || {
            self.pos.set(start_pos);
            self.char.set(start_char);
        };

        let mut negated = false;
        if !self.bump() || self.char() != ':' {
            reset();
            return None;
        }
        if !self.bump() {
            reset();
            return None;
        }
        if self.char() == '^' {
            negated = true;
            if !self.bump() {
                reset();
                return None;
            }
        }
        let name_start = self.pos();
        while self.char() != ':' && self.bump() {}
        if self.is_done() {
            reset();
            return None;
        }
        let name = &self.pattern()[name_start..self.pos()];
        if !self.bump_if(":]") {
            reset();
            return None;
        }
        if let Ok(ranges) = posix_class(name) {
            let mut class = hir::Class::new(ranges);
            if negated {
                class.negate();
            }
            return Some(class);
        }
        reset();
        None
    }
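    // For example (mirroring `ok_class` and `err_standard` below):
    //
    //     assert_eq!(p(r"[[:alnum:]]"), posix("alnum"));
    //     // Missing the second ':', so it isn't a POSIX class. The parser
    //     // backs up and the inner '[' is reported as an unsupported
    //     // nested class instead.
    //     assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]"));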
    /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
    /// parser is currently at a valid character class name and will be
    /// advanced to the character immediately following the class.
    fn parse_perl_class(&self) -> Hir {
        let ch = self.char();
        self.bump();
        let mut class = hir::Class::new(match ch {
            'd' | 'D' => posix_class("digit").unwrap(),
            's' | 'S' => posix_class("space").unwrap(),
            'w' | 'W' => posix_class("word").unwrap(),
            unk => unreachable!("invalid Perl class \\{}", unk),
        });
        if ch.is_ascii_uppercase() {
            class.negate();
        }
        Hir::class(class)
    }

    fn hir_dot(&self) -> Hir {
        if self.flags().dot_matches_new_line {
            Hir::class(hir::Class::new([hir::ClassRange {
                start: '\x00',
                end: '\u{10FFFF}',
            }]))
        } else if self.flags().crlf {
            Hir::class(hir::Class::new([
                hir::ClassRange { start: '\x00', end: '\x09' },
                hir::ClassRange { start: '\x0B', end: '\x0C' },
                hir::ClassRange { start: '\x0E', end: '\u{10FFFF}' },
            ]))
        } else {
            Hir::class(hir::Class::new([
                hir::ClassRange { start: '\x00', end: '\x09' },
                hir::ClassRange { start: '\x0B', end: '\u{10FFFF}' },
            ]))
        }
    }

    fn hir_anchor_start(&self) -> Hir {
        let look = if self.flags().multi_line {
            if self.flags().crlf {
                hir::Look::StartCRLF
            } else {
                hir::Look::StartLF
            }
        } else {
            hir::Look::Start
        };
        Hir::look(look)
    }

    fn hir_anchor_end(&self) -> Hir {
        let look = if self.flags().multi_line {
            if self.flags().crlf {
                hir::Look::EndCRLF
            } else {
                hir::Look::EndLF
            }
        } else {
            hir::Look::End
        };
        Hir::look(look)
    }

    fn hir_char(&self, ch: char) -> Hir {
        if self.flags().case_insensitive {
            let this = hir::ClassRange { start: ch, end: ch };
            if let Some(folded) = this.ascii_case_fold() {
                return Hir::class(hir::Class::new([this, folded]));
            }
        }
        Hir::char(ch)
    }
}

/// This checks the depth of the given `Hir` value, and if it exceeds the
/// given limit, then an error is returned.
fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> {
    fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> {
        if depth > limit {
            return Err(Error::new(ERR_TOO_MUCH_NESTING));
        }
        let Some(next_depth) = depth.checked_add(1) else {
            return Err(Error::new(ERR_TOO_MUCH_NESTING));
        };
        match *hir.kind() {
            HirKind::Empty
            | HirKind::Char(_)
            | HirKind::Class(_)
            | HirKind::Look(_) => Ok(()),
            HirKind::Repetition(hir::Repetition { ref sub, .. }) => {
                recurse(sub, limit, next_depth)
            }
            HirKind::Capture(hir::Capture { ref sub, .. }) => {
                recurse(sub, limit, next_depth)
            }
            HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
                for sub in subs.iter() {
                    recurse(sub, limit, next_depth)?;
                }
                Ok(())
            }
        }
    }
    recurse(hir, limit, 0)
}

/// Converts the given Hir to a literal char if the Hir is just a single
/// character. Otherwise this returns an error.
///
/// This is useful in contexts where you can only accept a single character,
/// but where it is convenient to parse something more general. For example,
/// parsing a single part of a character class range. It's useful to reuse
/// the literal parsing code, but that code can itself return entire classes
/// which can't be used as the start/end of a class range.
fn into_class_item_range(hir: Hir) -> Result<char, Error> {
    match hir.kind {
        HirKind::Char(ch) => Ok(ch),
        _ => Err(Error::new(ERR_CLASS_INVALID_RANGE_ITEM)),
    }
}

fn into_class_item_ranges(
    mut hir: Hir,
) -> Result<Vec<hir::ClassRange>, Error> {
    match core::mem::replace(&mut hir.kind, HirKind::Empty) {
        HirKind::Char(ch) => Ok(vec![hir::ClassRange { start: ch, end: ch }]),
        HirKind::Class(hir::Class { ranges }) => Ok(ranges),
        _ => Err(Error::new(ERR_CLASS_INVALID_ITEM)),
    }
}
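// For example, when parsing the class `[a\d]`, the item `a` converts to the
// single range 'a'..='a' via `into_class_item_ranges`, while `\d` expands to
// the ranges of the POSIX `digit` class. Items that can't serve as one end
// of a range are rejected (mirroring `err_standard` below):
//
//     assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]"));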
/// Returns an iterator of character class ranges for the given named POSIX
/// character class. If no such character class exists for the name given,
/// then an error is returned.
fn posix_class(
    kind: &str,
) -> Result<impl Iterator<Item = hir::ClassRange>, Error> {
    let slice: &'static [(u8, u8)] = match kind {
        "alnum" => &[(b'0', b'9'), (b'A', b'Z'), (b'a', b'z')],
        "alpha" => &[(b'A', b'Z'), (b'a', b'z')],
        "ascii" => &[(b'\x00', b'\x7F')],
        "blank" => &[(b'\t', b'\t'), (b' ', b' ')],
        "cntrl" => &[(b'\x00', b'\x1F'), (b'\x7F', b'\x7F')],
        "digit" => &[(b'0', b'9')],
        "graph" => &[(b'!', b'~')],
        "lower" => &[(b'a', b'z')],
        "print" => &[(b' ', b'~')],
        "punct" => &[(b'!', b'/'), (b':', b'@'), (b'[', b'`'), (b'{', b'~')],
        "space" => &[
            (b'\t', b'\t'),
            (b'\n', b'\n'),
            (b'\x0B', b'\x0B'),
            (b'\x0C', b'\x0C'),
            (b'\r', b'\r'),
            (b' ', b' '),
        ],
        "upper" => &[(b'A', b'Z')],
        "word" => &[(b'0', b'9'), (b'A', b'Z'), (b'_', b'_'), (b'a', b'z')],
        "xdigit" => &[(b'0', b'9'), (b'A', b'F'), (b'a', b'f')],
        _ => return Err(Error::new(ERR_POSIX_CLASS_UNRECOGNIZED)),
    };
    Ok(slice.iter().map(|&(start, end)| hir::ClassRange {
        start: char::from(start),
        end: char::from(end),
    }))
}

/// Returns true if the given character is a hexadecimal digit.
fn is_hex(c: char) -> bool {
    ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
}

/// Returns true if the given character is valid in a capture group name.
///
/// If `first` is true, then `c` is treated as the first character in the
/// group name (which must be alphabetic or underscore).
fn is_capture_char(c: char, first: bool) -> bool {
    if first {
        c == '_' || c.is_alphabetic()
    } else {
        c == '_' || c == '.' || c == '[' || c == ']' || c.is_alphanumeric()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn p(pattern: &str) -> Hir {
        Parser::new(Config::default(), pattern).parse_inner().unwrap()
    }

    fn perr(pattern: &str) -> String {
        Parser::new(Config::default(), pattern)
            .parse_inner()
            .unwrap_err()
            .to_string()
    }

    fn class<I: IntoIterator<Item = (char, char)>>(it: I) -> Hir {
        Hir::class(hir::Class::new(
            it.into_iter().map(|(start, end)| hir::ClassRange { start, end }),
        ))
    }

    fn singles<I: IntoIterator<Item = char>>(it: I) -> Hir {
        Hir::class(hir::Class::new(
            it.into_iter().map(|ch| hir::ClassRange { start: ch, end: ch }),
        ))
    }

    fn posix(name: &str) -> Hir {
        Hir::class(hir::Class::new(posix_class(name).unwrap()))
    }

    fn cap(index: u32, sub: Hir) -> Hir {
        Hir::capture(hir::Capture { index, name: None, sub: Box::new(sub) })
    }

    fn named_cap(index: u32, name: &str, sub: Hir) -> Hir {
        Hir::capture(hir::Capture {
            index,
            name: Some(Box::from(name)),
            sub: Box::new(sub),
        })
    }

    #[test]
    fn ok_literal() {
        assert_eq!(p("a"), Hir::char('a'));
        assert_eq!(p("ab"), Hir::concat(vec![Hir::char('a'), Hir::char('b')]));
        assert_eq!(p("💩"), Hir::char('💩'));
    }

    #[test]
    fn ok_meta_escapes() {
        assert_eq!(p(r"\*"), Hir::char('*'));
        assert_eq!(p(r"\+"), Hir::char('+'));
        assert_eq!(p(r"\?"), Hir::char('?'));
        assert_eq!(p(r"\|"), Hir::char('|'));
        assert_eq!(p(r"\("), Hir::char('('));
        assert_eq!(p(r"\)"), Hir::char(')'));
        assert_eq!(p(r"\^"), Hir::char('^'));
        assert_eq!(p(r"\$"), Hir::char('$'));
        assert_eq!(p(r"\["), Hir::char('['));
        assert_eq!(p(r"\]"), Hir::char(']'));
    }

    #[test]
    fn ok_special_escapes() {
        assert_eq!(p(r"\a"), Hir::char('\x07'));
        assert_eq!(p(r"\f"), Hir::char('\x0C'));
        assert_eq!(p(r"\t"), Hir::char('\t'));
        assert_eq!(p(r"\n"), Hir::char('\n'));
        assert_eq!(p(r"\r"), Hir::char('\r'));
        assert_eq!(p(r"\v"), Hir::char('\x0B'));
        assert_eq!(p(r"\A"), Hir::look(hir::Look::Start));
        assert_eq!(p(r"\z"), Hir::look(hir::Look::End));
        assert_eq!(p(r"\b"), Hir::look(hir::Look::Word));
        assert_eq!(p(r"\B"), Hir::look(hir::Look::WordNegate));
    }

    #[test]
    fn ok_hex() {
        // fixed length
        assert_eq!(p(r"\x41"), Hir::char('A'));
        assert_eq!(p(r"\u2603"), Hir::char('☃'));
assert_eq!(p(r"\U0001F4A9"), Hir::char('💩')); // braces assert_eq!(p(r"\x{1F4A9}"), Hir::char('💩')); assert_eq!(p(r"\u{1F4A9}"), Hir::char('💩')); assert_eq!(p(r"\U{1F4A9}"), Hir::char('💩')); } #[test] fn ok_perl() { assert_eq!(p(r"\d"), posix("digit")); assert_eq!(p(r"\s"), posix("space")); assert_eq!(p(r"\w"), posix("word")); let negated = |name| { let mut class = hir::Class::new(posix_class(name).unwrap()); class.negate(); Hir::class(class) }; assert_eq!(p(r"\D"), negated("digit")); assert_eq!(p(r"\S"), negated("space")); assert_eq!(p(r"\W"), negated("word")); } #[test] fn ok_flags_and_primitives() { assert_eq!(p(r"a"), Hir::char('a')); assert_eq!(p(r"(?i:a)"), singles(['A', 'a'])); assert_eq!(p(r"^"), Hir::look(hir::Look::Start)); assert_eq!(p(r"(?m:^)"), Hir::look(hir::Look::StartLF)); assert_eq!(p(r"(?mR:^)"), Hir::look(hir::Look::StartCRLF)); assert_eq!(p(r"$"), Hir::look(hir::Look::End)); assert_eq!(p(r"(?m:$)"), Hir::look(hir::Look::EndLF)); assert_eq!(p(r"(?mR:$)"), Hir::look(hir::Look::EndCRLF)); assert_eq!(p(r"."), class([('\x00', '\x09'), ('\x0B', '\u{10FFFF}')])); assert_eq!( p(r"(?R:.)"), class([ ('\x00', '\x09'), ('\x0B', '\x0C'), ('\x0E', '\u{10FFFF}'), ]) ); assert_eq!(p(r"(?s:.)"), class([('\x00', '\u{10FFFF}')])); assert_eq!(p(r"(?sR:.)"), class([('\x00', '\u{10FFFF}')])); } #[test] fn ok_alternate() { assert_eq!( p(r"a|b"), Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) ); assert_eq!( p(r"(?:a|b)"), Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) ); assert_eq!( p(r"(a|b)"), cap(1, Hir::alternation(vec![Hir::char('a'), Hir::char('b')])) ); assert_eq!( p(r"(?a|b)"), named_cap( 1, "foo", Hir::alternation(vec![Hir::char('a'), Hir::char('b')]) ) ); assert_eq!( p(r"a|b|c"), Hir::alternation(vec![ Hir::char('a'), Hir::char('b'), Hir::char('c') ]) ); assert_eq!( p(r"ax|by|cz"), Hir::alternation(vec![ Hir::concat(vec![Hir::char('a'), Hir::char('x')]), Hir::concat(vec![Hir::char('b'), Hir::char('y')]), Hir::concat(vec![Hir::char('c'), Hir::char('z')]), ]) ); assert_eq!( p(r"(ax|(by|(cz)))"), cap( 1, Hir::alternation(vec![ Hir::concat(vec![Hir::char('a'), Hir::char('x')]), cap( 2, Hir::alternation(vec![ Hir::concat(vec![Hir::char('b'), Hir::char('y')]), cap( 3, Hir::concat(vec![ Hir::char('c'), Hir::char('z') ]) ), ]) ), ]) ) ); assert_eq!( p(r"|"), Hir::alternation(vec![Hir::empty(), Hir::empty()]) ); assert_eq!( p(r"||"), Hir::alternation(vec![Hir::empty(), Hir::empty(), Hir::empty()]) ); assert_eq!( p(r"a|"), Hir::alternation(vec![Hir::char('a'), Hir::empty()]) ); assert_eq!( p(r"|a"), Hir::alternation(vec![Hir::empty(), Hir::char('a')]) ); assert_eq!( p(r"(|)"), cap(1, Hir::alternation(vec![Hir::empty(), Hir::empty()])) ); assert_eq!( p(r"(a|)"), cap(1, Hir::alternation(vec![Hir::char('a'), Hir::empty()])) ); assert_eq!( p(r"(|a)"), cap(1, Hir::alternation(vec![Hir::empty(), Hir::char('a')])) ); } #[test] fn ok_flag_group() { assert_eq!( p("a(?i:b)"), Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])]) ); } #[test] fn ok_flag_directive() { assert_eq!(p("(?i)a"), singles(['A', 'a'])); assert_eq!(p("a(?i)"), Hir::char('a')); assert_eq!( p("a(?i)b"), Hir::concat(vec![Hir::char('a'), singles(['B', 'b'])]) ); assert_eq!( p("a(?i)a(?-i)a"), Hir::concat(vec![ Hir::char('a'), singles(['A', 'a']), Hir::char('a'), ]) ); assert_eq!( p("a(?:(?i)a)a"), Hir::concat(vec![ Hir::char('a'), singles(['A', 'a']), Hir::char('a'), ]) ); assert_eq!( p("a((?i)a)a"), Hir::concat(vec![ Hir::char('a'), cap(1, singles(['A', 'a'])), Hir::char('a'), ]) ); } #[test] fn 
    fn ok_uncounted_repetition() {
        assert_eq!(
            p(r"a?"),
            Hir::repetition(hir::Repetition {
                min: 0,
                max: Some(1),
                greedy: true,
                sub: Box::new(Hir::char('a')),
            }),
        );
        assert_eq!(
            p(r"a*"),
            Hir::repetition(hir::Repetition {
                min: 0,
                max: None,
                greedy: true,
                sub: Box::new(Hir::char('a')),
            }),
        );
        assert_eq!(
            p(r"a+"),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: true,
                sub: Box::new(Hir::char('a')),
            }),
        );

        assert_eq!(
            p(r"a??"),
            Hir::repetition(hir::Repetition {
                min: 0,
                max: Some(1),
                greedy: false,
                sub: Box::new(Hir::char('a')),
            }),
        );
        assert_eq!(
            p(r"a*?"),
            Hir::repetition(hir::Repetition {
                min: 0,
                max: None,
                greedy: false,
                sub: Box::new(Hir::char('a')),
            }),
        );
        assert_eq!(
            p(r"a+?"),
            Hir::repetition(hir::Repetition {
                min: 1,
                max: None,
                greedy: false,
                sub: Box::new(Hir::char('a')),
            }),
        );

        assert_eq!(
            p(r"a?b"),
            Hir::concat(vec![
                Hir::repetition(hir::Repetition {
                    min: 0,
                    max: Some(1),
                    greedy: true,
                    sub: Box::new(Hir::char('a')),
                }),
                Hir::char('b'),
            ]),
        );
        assert_eq!(
            p(r"ab?"),
            Hir::concat(vec![
                Hir::char('a'),
                Hir::repetition(hir::Repetition {
                    min: 0,
                    max: Some(1),
                    greedy: true,
                    sub: Box::new(Hir::char('b')),
                }),
            ]),
        );

        assert_eq!(
            p(r"(?:ab)?"),
            Hir::repetition(hir::Repetition {
                min: 0,
                max: Some(1),
                greedy: true,
                sub: Box::new(Hir::concat(vec![
                    Hir::char('a'),
                    Hir::char('b')
                ])),
            }),
        );
        assert_eq!(
            p(r"(ab)?"),
            Hir::repetition(hir::Repetition {
                min: 0,
                max: Some(1),
                greedy: true,
                sub: Box::new(cap(
                    1,
                    Hir::concat(vec![Hir::char('a'), Hir::char('b')])
                )),
            }),
        );

        assert_eq!(
            p(r"|a?"),
            Hir::alternation(vec![
                Hir::empty(),
                Hir::repetition(hir::Repetition {
                    min: 0,
                    max: Some(1),
                    greedy: true,
                    sub: Box::new(Hir::char('a')),
                })
            ]),
        );
    }

    #[test]
    fn ok_counted_repetition() {
        assert_eq!(
            p(r"a{5}"),
            Hir::repetition(hir::Repetition {
                min: 5,
                max: Some(5),
                greedy: true,
                sub: Box::new(Hir::char('a')),
            }),
        );
        assert_eq!(
            p(r"a{5}?"),
            Hir::repetition(hir::Repetition {
                min: 5,
                max: Some(5),
                greedy: false,
                sub: Box::new(Hir::char('a')),
            }),
        );

        assert_eq!(
            p(r"a{5,}"),
            Hir::repetition(hir::Repetition {
                min: 5,
                max: None,
                greedy: true,
                sub: Box::new(Hir::char('a')),
            }),
        );

        assert_eq!(
            p(r"a{5,9}"),
            Hir::repetition(hir::Repetition {
                min: 5,
                max: Some(9),
                greedy: true,
                sub: Box::new(Hir::char('a')),
            }),
        );

        assert_eq!(
            p(r"ab{5}c"),
            Hir::concat(vec![
                Hir::char('a'),
                Hir::repetition(hir::Repetition {
                    min: 5,
                    max: Some(5),
                    greedy: true,
                    sub: Box::new(Hir::char('b')),
                }),
                Hir::char('c'),
            ]),
        );

        assert_eq!(
            p(r"a{ 5 }"),
            Hir::repetition(hir::Repetition {
                min: 5,
                max: Some(5),
                greedy: true,
                sub: Box::new(Hir::char('a')),
            }),
        );
        assert_eq!(
            p(r"a{ 5 , 9 }"),
            Hir::repetition(hir::Repetition {
                min: 5,
                max: Some(9),
                greedy: true,
                sub: Box::new(Hir::char('a')),
            }),
        );
    }

    #[test]
    fn ok_group_unnamed() {
        assert_eq!(p("(a)"), cap(1, Hir::char('a')));
        assert_eq!(
            p("(ab)"),
            cap(1, Hir::concat(vec![Hir::char('a'), Hir::char('b')]))
        );
    }

    #[test]
    fn ok_group_named() {
        assert_eq!(p("(?P<foo>a)"), named_cap(1, "foo", Hir::char('a')));
        assert_eq!(p("(?<foo>a)"), named_cap(1, "foo", Hir::char('a')));
        assert_eq!(
            p("(?P<foo>ab)"),
            named_cap(
                1,
                "foo",
                Hir::concat(vec![Hir::char('a'), Hir::char('b')])
            )
        );
        assert_eq!(
            p("(?<foo>ab)"),
            named_cap(
                1,
                "foo",
                Hir::concat(vec![Hir::char('a'), Hir::char('b')])
            )
        );

        assert_eq!(p(r"(?<a>z)"), named_cap(1, "a", Hir::char('z')));
        assert_eq!(p(r"(?P<a>z)"), named_cap(1, "a", Hir::char('z')));

        assert_eq!(p(r"(?<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));
        assert_eq!(p(r"(?P<a_1>z)"), named_cap(1, "a_1", Hir::char('z')));

        assert_eq!(p(r"(?<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
        assert_eq!(p(r"(?P<a.1>z)"), named_cap(1, "a.1", Hir::char('z')));
        assert_eq!(p(r"(?<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));
        assert_eq!(p(r"(?P<a[1]>z)"), named_cap(1, "a[1]", Hir::char('z')));

        assert_eq!(p(r"(?<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
        assert_eq!(p(r"(?P<a¾>z)"), named_cap(1, "a¾", Hir::char('z')));
        assert_eq!(p(r"(?<名字>z)"), named_cap(1, "名字", Hir::char('z')));
        assert_eq!(p(r"(?P<名字>z)"), named_cap(1, "名字", Hir::char('z')));
    }

    #[test]
    fn ok_class() {
        assert_eq!(p(r"[a]"), singles(['a']));
        assert_eq!(p(r"[a\]]"), singles(['a', ']']));
        assert_eq!(p(r"[a\-z]"), singles(['a', '-', 'z']));
        assert_eq!(p(r"[ab]"), class([('a', 'b')]));
        assert_eq!(p(r"[a-]"), singles(['a', '-']));
        assert_eq!(p(r"[-a]"), singles(['a', '-']));
        assert_eq!(p(r"[--a]"), singles(['a', '-']));
        assert_eq!(p(r"[---a]"), singles(['a', '-']));
        assert_eq!(p(r"[[:alnum:]]"), posix("alnum"));
        assert_eq!(p(r"[\w]"), posix("word"));
        assert_eq!(p(r"[a\wz]"), posix("word"));
        assert_eq!(p(r"[\s\S]"), class([('\x00', '\u{10FFFF}')]));
        assert_eq!(p(r"[^\s\S]"), Hir::fail());
        assert_eq!(p(r"[a-cx-z]"), class([('a', 'c'), ('x', 'z')]));
        assert_eq!(p(r"[☃-⛄]"), class([('☃', '⛄')]));
        assert_eq!(p(r"[]]"), singles([']']));
        assert_eq!(p(r"[]a]"), singles([']', 'a']));
        assert_eq!(p(r"[]\[]"), singles(['[', ']']));
        assert_eq!(p(r"[\[]"), singles(['[']));

        assert_eq!(p(r"(?i)[a]"), singles(['A', 'a']));
        assert_eq!(p(r"(?i)[A]"), singles(['A', 'a']));
        assert_eq!(p(r"(?i)[k]"), singles(['K', 'k']));
        assert_eq!(p(r"(?i)[s]"), singles(['S', 's']));
        assert_eq!(p(r"(?i)[β]"), singles(['β']));

        assert_eq!(p(r"[^^]"), class([('\x00', ']'), ('_', '\u{10FFFF}')]));
        assert_eq!(
            p(r"[^-a]"),
            class([('\x00', ','), ('.', '`'), ('b', '\u{10FFFF}')])
        );

        assert_eq!(
            p(r"[-]a]"),
            Hir::concat(vec![singles(['-']), Hir::char('a'), Hir::char(']')])
        );
    }

    #[test]
    fn ok_verbatim() {
        assert_eq!(
            p(r"(?x)a{5,9} ?"),
            Hir::repetition(hir::Repetition {
                min: 5,
                max: Some(9),
                greedy: false,
                sub: Box::new(Hir::char('a')),
            })
        );
        assert_eq!(p(r"(?x)[ a]"), singles(['a']));
        assert_eq!(
            p(r"(?x)[ ^ a]"),
            class([('\x00', '`'), ('b', '\u{10FFFF}')])
        );
        assert_eq!(p(r"(?x)[ - a]"), singles(['a', '-']));
        assert_eq!(p(r"(?x)[ ] a]"), singles([']', 'a']));

        assert_eq!(
            p(r"(?x)a b"),
            Hir::concat(vec![Hir::char('a'), Hir::char('b')])
        );
        assert_eq!(
            p(r"(?x)a b(?-x)a b"),
            Hir::concat(vec![
                Hir::char('a'),
                Hir::char('b'),
                Hir::char('a'),
                Hir::char(' '),
                Hir::char('b'),
            ])
        );
        assert_eq!(
            p(r"a (?x:a )a "),
            Hir::concat(vec![
                Hir::char('a'),
                Hir::char(' '),
                Hir::char('a'),
                Hir::char('a'),
                Hir::char(' '),
            ])
        );
        assert_eq!(
            p(r"(?x)( ?P<foo> a )"),
            named_cap(1, "foo", Hir::char('a')),
        );
        assert_eq!(p(r"(?x)( a )"), cap(1, Hir::char('a')));
        assert_eq!(p(r"(?x)( ?: a )"), Hir::char('a'));
        assert_eq!(p(r"(?x)\x { 53 }"), Hir::char('\x53'));
        assert_eq!(p(r"(?x)\ "), Hir::char(' '));
    }

    #[test]
    fn ok_comments() {
        let pat = "(?x)
# This is comment 1.
foo # This is comment 2.
# This is comment 3.
bar
# This is comment 4.";
        assert_eq!(
            p(pat),
            Hir::concat(vec![
                Hir::char('f'),
                Hir::char('o'),
                Hir::char('o'),
                Hir::char('b'),
                Hir::char('a'),
                Hir::char('r'),
            ])
        );
    }

    #[test]
    fn err_standard() {
        assert_eq!(
            ERR_TOO_MUCH_NESTING,
            perr("(((((((((((((((((((((((((((((((((((((((((((((((((((a)))))))))))))))))))))))))))))))))))))))))))))))))))"),
        );
        // This one is tricky, because the only way it can happen is if the
        // number of captures overflows u32. Perhaps we should allow setting a
        // lower limit?
        // assert_eq!(ERR_TOO_MANY_CAPTURES, perr(""));
        assert_eq!(ERR_DUPLICATE_CAPTURE_NAME, perr(r"(?P<a>y)(?P<a>z)"));
        assert_eq!(ERR_UNCLOSED_GROUP, perr("("));
        assert_eq!(ERR_UNCLOSED_GROUP_QUESTION, perr("(?"));
        assert_eq!(ERR_UNOPENED_GROUP, perr(")"));
        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?=a)"));
        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?!a)"));
        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<=a)"));
        assert_eq!(ERR_LOOK_UNSUPPORTED, perr(r"(?<!a)"));
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<1abc>z)"));
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾>z)"));
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<¾a>z)"));
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<☃>z)"));
        assert_eq!(ERR_INVALID_GROUP_NAME, perr(r"(?<a☃>z)"));
        assert_eq!(ERR_UNCLOSED_GROUP_NAME, perr(r"(?P<foo"));
        assert_eq!(ERR_EMPTY_GROUP_NAME, perr(r"(?<>z)"));
        assert_eq!(ERR_FLAG_UNRECOGNIZED, perr(r"(?z:foo)"));
        assert_eq!(ERR_FLAG_REPEATED_NEGATION, perr(r"(?s-i-R)"));
        assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?isi)"));
        assert_eq!(ERR_FLAG_DUPLICATE, perr(r"(?is-i)"));
        assert_eq!(ERR_FLAG_UNEXPECTED_EOF, perr(r"(?is"));
        assert_eq!(ERR_FLAG_DANGLING_NEGATION, perr(r"(?is-:foo)"));
        assert_eq!(ERR_HEX_BRACE_INVALID_DIGIT, perr(r"\x{Z}"));
        assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{"));
        assert_eq!(ERR_HEX_BRACE_UNEXPECTED_EOF, perr(r"\x{A"));
        assert_eq!(ERR_HEX_BRACE_EMPTY, perr(r"\x{}"));
        assert_eq!(ERR_HEX_BRACE_INVALID, perr(r"\x{FFFFFFFFFFFFFFFFF}"));
        assert_eq!(ERR_HEX_FIXED_UNEXPECTED_EOF, perr(r"\xA"));
        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZ"));
        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xZA"));
        assert_eq!(ERR_HEX_FIXED_INVALID_DIGIT, perr(r"\xAZ"));
        assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\uD800"));
        assert_eq!(ERR_HEX_FIXED_INVALID, perr(r"\UFFFFFFFF"));
        assert_eq!(ERR_HEX_UNEXPECTED_EOF, perr(r"\x"));
        assert_eq!(ERR_ESCAPE_UNEXPECTED_EOF, perr(r"\"));
        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\0"));
        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\1"));
        assert_eq!(ERR_BACKREF_UNSUPPORTED, perr(r"\8"));
        assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\pL"));
        assert_eq!(ERR_UNICODE_CLASS_UNSUPPORTED, perr(r"\p{L}"));
        assert_eq!(ERR_ESCAPE_UNRECOGNIZED, perr(r"\i"));
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"?"));
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"*"));
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"+"));
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(+)"));
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"|?"));
        assert_eq!(ERR_UNCOUNTED_REP_SUB_MISSING, perr(r"(?i)?"));
        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"{5}"));
        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"({5})"));
        assert_eq!(ERR_COUNTED_REP_SUB_MISSING, perr(r"(?i){5}"));
        assert_eq!(ERR_COUNTED_REP_UNCLOSED, perr(r"a{"));
        assert_eq!(ERR_COUNTED_REP_MIN_UNCLOSED, perr(r"a{5"));
        assert_eq!(ERR_COUNTED_REP_COMMA_UNCLOSED, perr(r"a{5,"));
        assert_eq!(ERR_COUNTED_REP_MIN_MAX_UNCLOSED, perr(r"a{5,6"));
        assert_eq!(ERR_COUNTED_REP_INVALID, perr(r"a{5,6Z"));
        assert_eq!(ERR_COUNTED_REP_INVALID_RANGE, perr(r"a{6,5}"));
        assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{}"));
        assert_eq!(ERR_DECIMAL_NO_DIGITS, perr(r"a{]}"));
        assert_eq!(ERR_DECIMAL_INVALID, perr(r"a{999999999999999}"));
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"[a"));
        assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[\w-a]"));
        assert_eq!(ERR_CLASS_INVALID_RANGE_ITEM, perr(r"[a-\w]"));
        assert_eq!(ERR_CLASS_INVALID_ITEM, perr(r"[\b]"));
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"[a-"));
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_NEGATION, perr(r"[^"));
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_CLOSING, perr(r"[]"));
        assert_eq!(ERR_CLASS_INVALID_RANGE, perr(r"[z-a]"));
        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"["));
        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"[a-z"));
        assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[a-z[A-Z]]"));
        assert_eq!(ERR_CLASS_NEST_UNSUPPORTED, perr(r"[[:alnum]]"));
        assert_eq!(ERR_CLASS_INTERSECTION_UNSUPPORTED, perr(r"[a&&b]"));
        assert_eq!(ERR_CLASS_DIFFERENCE_UNSUPPORTED, perr(r"[a--b]"));
        assert_eq!(ERR_CLASS_SYMDIFFERENCE_UNSUPPORTED, perr(r"[a~~b]"));
        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo"));
        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNCLOSED, perr(r"\b{foo!}"));
        assert_eq!(ERR_SPECIAL_WORD_BOUNDARY_UNRECOGNIZED, perr(r"\b{foo}"));
        assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"\b{"));
        assert_eq!(ERR_SPECIAL_WORD_OR_REP_UNEXPECTED_EOF, perr(r"(?x)\b{ "));
    }

    #[test]
    fn err_verbatim() {
        // See: https://github.com/rust-lang/regex/issues/792
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[-#]"));
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_ITEM, perr(r"(?x)[a "));
        assert_eq!(ERR_CLASS_UNCLOSED_AFTER_DASH, perr(r"(?x)[a- "));
        assert_eq!(ERR_CLASS_UNCLOSED, perr(r"(?x)[ "));
    }

    // This tests a bug fix where the nest limit checker wasn't decrementing
    // its depth during post-traversal, which causes long regexes to trip
    // the default limit too aggressively.
    #[test]
    fn regression_454_nest_too_big() {
        let pattern = r#"
        2(?:
          [45]\d{3}|
          7(?:
            1[0-267]|
            2[0-289]|
            3[0-29]|
            4[01]|
            5[1-3]|
            6[013]|
            7[0178]|
            91
          )|
          8(?:
            0[125]|
            [139][1-6]|
            2[0157-9]|
            41|
            6[1-35]|
            7[1-5]|
            8[1-8]|
            90
          )|
          9(?:
            0[0-2]|
            1[0-4]|
            2[568]|
            3[3-6]|
            5[5-7]|
            6[0167]|
            7[15]|
            8[0146-9]
          )
        )\d{4}
        "#;
        p(pattern);
    }

    // This tests that we treat a trailing `-` in a character class as a
    // literal `-` even when whitespace mode is enabled and there is
    // whitespace after the trailing `-`.
    #[test]
    fn regression_455_trailing_dash_ignore_whitespace() {
        p("(?x)[ / - ]");
        p("(?x)[ a - ]");
        p("(?x)[
            a
            - ]
        ");
        p("(?x)[
            a # wat
            - ]
        ");

        perr("(?x)[ / -");
        perr("(?x)[ / - ");
        perr(
            "(?x)[
            / -
        ",
        );
        perr(
            "(?x)[
            / - # wat
        ",
        );
    }

    #[test]
    fn regression_capture_indices() {
        let got = p(r"(a|ab|c|bcd){4,10}(d*)");
        assert_eq!(
            got,
            Hir::concat(vec![
                Hir::repetition(hir::Repetition {
                    min: 4,
                    max: Some(10),
                    greedy: true,
                    sub: Box::new(cap(
                        1,
                        Hir::alternation(vec![
                            Hir::char('a'),
                            Hir::concat(vec![Hir::char('a'), Hir::char('b')]),
                            Hir::char('c'),
                            Hir::concat(vec![
                                Hir::char('b'),
                                Hir::char('c'),
                                Hir::char('d')
                            ]),
                        ])
                    ))
                }),
                cap(
                    2,
                    Hir::repetition(hir::Repetition {
                        min: 0,
                        max: None,
                        greedy: true,
                        sub: Box::new(Hir::char('d')),
                    })
                ),
            ])
        );
    }
}

regex-lite-0.1.6/src/int.rs

use core::num::NonZeroUsize;

/// An extension trait that adds routines to the `u32` primitive type.
pub(crate) trait U32 {
    fn as_usize(self) -> usize;
}

impl U32 for u32 {
    fn as_usize(self) -> usize {
        // OK because we require 32 or 64 bit targets. Therefore, every u32
        // necessarily fits into a usize.
        self as usize
    }
}
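// For illustration, a sketch of the intended use (`state_ids` is a
// hypothetical stand-in for any table indexed by a u32 identifier):
//
//     let i: u32 = 5;
//     let v = state_ids[i.as_usize()]; // no `as usize` casts at call sites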
/// A `usize` that can never be `usize::MAX`.
///
/// This is similar to `core::num::NonZeroUsize`, but instead of not
/// permitting a zero value, this does not permit a max value.
///
/// This is useful in certain contexts where one wants to optimize the memory
/// usage of things that contain match offsets. Namely, since Rust slices
/// are guaranteed to never have a length exceeding `isize::MAX`, we can use
/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed,
/// types like `Option<NonMaxUsize>` have exactly the same size in memory as
/// a `usize`.
///
/// This type is defined to be `repr(transparent)` for
/// `core::num::NonZeroUsize`, which is in turn defined to be
/// `repr(transparent)` for `usize`.
#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub(crate) struct NonMaxUsize(NonZeroUsize);

impl NonMaxUsize {
    /// Create a new `NonMaxUsize` from the given value.
    ///
    /// This returns `None` only when the given value is equal to
    /// `usize::MAX`.
    pub(crate) fn new(value: usize) -> Option<NonMaxUsize> {
        NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize)
    }

    /// Return the underlying `usize` value. The returned value is guaranteed
    /// to not equal `usize::MAX`.
    pub(crate) fn get(self) -> usize {
        self.0.get().wrapping_sub(1)
    }
}

// We provide our own Debug impl because seeing the internal repr can be
// quite surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs
// just '5'.
impl core::fmt::Debug for NonMaxUsize {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "{:?}", self.get())
    }
}

regex-lite-0.1.6/src/interpolate.rs

/*!
Provides routines for interpolating capture group references.

That is, if a replacement string contains references like `$foo` or
`${foo1}`, then they are replaced with the corresponding capture values for
the groups named `foo` and `foo1`, respectively. Similarly, syntax like `$1`
and `${1}` is supported as well, with `1` corresponding to a capture group
index and not a name.

This module provides the free functions [`string`] and [`bytes`], which
interpolate Rust Unicode strings and byte strings, respectively.

# Format

These routines support two different kinds of capture references: unbraced
and braced.

For the unbraced format, the format supported is `$ref` where `ref` can be
any character in the class `[0-9A-Za-z_]`. `ref` is always the longest
possible parse. So for example, `$1a` corresponds to the capture group named
`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`,
then it is treated as a capture group index itself and not a name.

For the braced format, the format supported is `${ref}` where `ref` can be
any sequence of bytes except for `}`. If no closing brace occurs, then it is
not considered a capture reference. As with the unbraced format, if `ref`
matches `^[0-9]+$`, then it is treated as a capture group index and not a
name.

The braced format is useful for exerting precise control over the name of
the capture reference. For example, `${1}a` corresponds to the capture group
reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
corresponds to the capture group reference `1a`. The braced format is also
useful for expressing capture group names that use characters not supported
by the unbraced format. For example, `${foo[bar].baz}` refers to the capture
group named `foo[bar].baz`.

If a capture group reference is found and it does not refer to a valid
capture group, then it will be replaced with the empty string.

To write a literal `$`, use `$$`.

To be clear, and as exhibited via the type signatures in the routines in
this module, it is impossible for a replacement string to be invalid. A
replacement string may not have the intended semantics, but the
interpolation procedure itself can never fail.
*/

use alloc::string::String;
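// For example, given a capture group 1 that matched "Bruce" and a group
// named "last" that matched "Springsteen", the replacement string
//
//     "$1 ${last} costs $$5"
//
// interpolates to "Bruce Springsteen costs $5".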
/// Accepts a replacement string and interpolates capture references with
/// their corresponding values.
///
/// `append` should be a function that appends the string value of a capture
/// group at a particular index to the string given. If the capture group
/// index is invalid, then nothing should be appended.
///
/// `name_to_index` should be a function that maps a capture group name to a
/// capture group index. If the given name doesn't exist, then `None` should
/// be returned.
///
/// Finally, `dst` is where the final interpolated contents should be
/// written. If `replacement` contains no capture group references, then
/// `dst` will be equivalent to `replacement`.
///
/// See the [module documentation](self) for details about the format
/// supported.
pub fn string(
    mut replacement: &str,
    mut append: impl FnMut(usize, &mut String),
    mut name_to_index: impl FnMut(&str) -> Option<usize>,
    dst: &mut String,
) {
    while !replacement.is_empty() {
        match replacement.find('$') {
            None => break,
            Some(i) => {
                dst.push_str(&replacement[..i]);
                replacement = &replacement[i..];
            }
        }
        // Handle escaping of '$'.
        if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
            dst.push_str("$");
            replacement = &replacement[2..];
            continue;
        }
        debug_assert!(!replacement.is_empty());
        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
            Some(cap_ref) => cap_ref,
            None => {
                dst.push_str("$");
                replacement = &replacement[1..];
                continue;
            }
        };
        replacement = &replacement[cap_ref.end..];
        match cap_ref.cap {
            Ref::Number(i) => append(i, dst),
            Ref::Named(name) => {
                if let Some(i) = name_to_index(name) {
                    append(i, dst);
                }
            }
        }
    }
    dst.push_str(replacement);
}

/*
This should be uncommented and used if we ever provide public APIs for
searching `&[u8]`.

/// Accepts a replacement byte string and interpolates capture references
/// with their corresponding values.
///
/// `append` should be a function that appends the byte string value of a
/// capture group at a particular index to the byte string given. If the
/// capture group index is invalid, then nothing should be appended.
///
/// `name_to_index` should be a function that maps a capture group name to a
/// capture group index. If the given name doesn't exist, then `None` should
/// be returned.
///
/// Finally, `dst` is where the final interpolated contents should be
/// written. If `replacement` contains no capture group references, then
/// `dst` will be equivalent to `replacement`.
///
/// See the [module documentation](self) for details about the format
/// supported.
pub fn bytes(
    mut replacement: &[u8],
    mut append: impl FnMut(usize, &mut Vec<u8>),
    mut name_to_index: impl FnMut(&str) -> Option<usize>,
    dst: &mut Vec<u8>,
) {
    while !replacement.is_empty() {
        match replacement.iter().position(|&b| b == b'$') {
            None => break,
            Some(i) => {
                dst.extend_from_slice(&replacement[..i]);
                replacement = &replacement[i..];
            }
        }
        // Handle escaping of '$'.
        if replacement.get(1).map_or(false, |&b| b == b'$') {
            dst.push(b'$');
            replacement = &replacement[2..];
            continue;
        }
        debug_assert!(!replacement.is_empty());
        let cap_ref = match find_cap_ref(replacement) {
            Some(cap_ref) => cap_ref,
            None => {
                dst.push(b'$');
                replacement = &replacement[1..];
                continue;
            }
        };
        replacement = &replacement[cap_ref.end..];
        match cap_ref.cap {
            Ref::Number(i) => append(i, dst),
            Ref::Named(name) => {
                if let Some(i) = name_to_index(name) {
                    append(i, dst);
                }
            }
        }
    }
    dst.extend_from_slice(replacement);
}
*/
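// A minimal usage sketch of `string` (the closures here are stand-ins for a
// real capture lookup):
//
//     let mut dst = String::new();
//     string(
//         "$1-$2",
//         |i, dst| {
//             if let Some(s) = ["", "foo", "bar"].get(i) {
//                 dst.push_str(s);
//             }
//         },
//         |_name| None,
//         &mut dst,
//     );
//     assert_eq!(dst, "foo-bar");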
/// `CaptureRef` represents a reference to a capture group inside some text.
/// The reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text following the capture
/// reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
    cap: Ref<'a>,
    end: usize,
}

/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    Named(&'a str),
    Number(usize),
}

impl<'a> From<&'a str> for Ref<'a> {
    fn from(x: &'a str) -> Ref<'a> {
        Ref::Named(x)
    }
}

impl From<usize> for Ref<'static> {
    fn from(x: usize) -> Ref<'static> {
        Ref::Number(x)
    }
}

/// Parses a possible reference to a capture group name in the given text,
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
///
/// Note that this returns a "possible" reference because this routine
/// doesn't know whether the reference is to a valid group or not. If it
/// winds up not being a valid reference, then it should be replaced with
/// the empty string.
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
    let mut i = 0;
    let rep: &[u8] = replacement;
    if rep.len() <= 1 || rep[0] != b'$' {
        return None;
    }
    i += 1;
    if rep[i] == b'{' {
        return find_cap_ref_braced(rep, i + 1);
    }
    let mut cap_end = i;
    while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
        cap_end += 1;
    }
    if cap_end == i {
        return None;
    }
    // We just verified that the range 0..cap_end is valid ASCII, so it must
    // therefore be valid UTF-8. If we really cared, we could avoid this
    // UTF-8 check via an unchecked conversion or by parsing the number
    // straight from &[u8].
    let cap = core::str::from_utf8(&rep[i..cap_end])
        .expect("valid UTF-8 capture name");
    Some(CaptureRef {
        cap: match cap.parse::<usize>() {
            Ok(i) => Ref::Number(i),
            Err(_) => Ref::Named(cap),
        },
        end: cap_end,
    })
}

/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an
/// opening brace has been found at `i-1` in `rep`. This then looks for a
/// closing brace and returns the capture reference within the brace.
fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
    assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]);
    let start = i;
    while rep.get(i).map_or(false, |&b| b != b'}') {
        i += 1;
    }
    if !rep.get(i).map_or(false, |&b| b == b'}') {
        return None;
    }
    // When looking at braced names, we don't put any restrictions on the
    // name, so it's possible it could be invalid UTF-8. But a capture group
    // name can never be invalid UTF-8, so if we have invalid UTF-8, then we
    // can safely return None.
    let cap = match core::str::from_utf8(&rep[start..i]) {
        Err(_) => return None,
        Ok(cap) => cap,
    };
    Some(CaptureRef {
        cap: match cap.parse::<usize>() {
            Ok(i) => Ref::Number(i),
            Err(_) => Ref::Named(cap),
        },
        end: i + 1,
    })
}

/// Returns true if and only if the given byte is allowed in a capture name
/// written in non-brace form.
fn is_valid_cap_letter(b: u8) -> bool {
    match b {
        b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true,
        _ => false,
    }
}

#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        name_to_index.sort_by_key(|x| x.0);
        let mut dst = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.push_str(s);
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        dst
    }

    /*
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        name_to_index.sort_by_key(|x| x.0);
        let mut dst = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend_from_slice(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }
    */
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                /*
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
                */
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // This is a funny case where a braced reference is never closed, but
    // within the unclosed braced reference, there is an unbraced reference.
    // In this case, the braced reference is just treated literally and the
    // unbraced reference is found.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}
regex-lite-0.1.6/src/lib.rs000064400000000000000000001065111046102023000136040ustar 00000000000000/*!
This crate provides a **lightweight** regex engine for searching strings. The
regex syntax supported by this crate is nearly identical to what is found in
the [`regex`](https://docs.rs/regex) crate. Like the `regex` crate, all regex
searches in this crate have worst case `O(m * n)` time complexity, where `m`
is proportional to the size of the regex and `n` is proportional to the size
of the string being searched.

The principal difference between the `regex` and `regex-lite` crates is that
the latter prioritizes smaller binary sizes and shorter Rust compile times
over performance and functionality. As a result, regex searches in this crate
are typically substantially slower than what is provided by the `regex`
crate. Moreover, this crate only has the most basic level of Unicode support:
it matches codepoint by codepoint but otherwise doesn't support Unicode case
insensitivity or things like `\p{Letter}`. In exchange, this crate contributes
far less to binary size and compiles much more quickly.

If you just want API documentation, then skip to the [`Regex`] type.
Otherwise, here's a quick example showing one way of parsing the output of a
grep-like program:

```rust
use regex_lite::Regex;

let re = Regex::new(r"(?m)^([^:]+):([0-9]+):(.+)$").unwrap();
let hay = "\
path/to/foo:54:Blue Harvest
path/to/bar:90:Something, Something, Something, Dark Side
path/to/baz:3:It's a Trap!
"; let mut results = vec![]; for (_, [path, lineno, line]) in re.captures_iter(hay).map(|c| c.extract()) { results.push((path, lineno.parse::()?, line)); } assert_eq!(results, vec![ ("path/to/foo", 54, "Blue Harvest"), ("path/to/bar", 90, "Something, Something, Something, Dark Side"), ("path/to/baz", 3, "It's a Trap!"), ]); # Ok::<(), Box>(()) ``` # Overview The primary type in this crate is a [`Regex`]. Its most important methods are as follows: * [`Regex::new`] compiles a regex using the default configuration. A [`RegexBuilder`] permits setting a non-default configuration. (For example, case insensitive matching, verbose mode and others.) * [`Regex::is_match`] reports whether a match exists in a particular haystack. * [`Regex::find`] reports the byte offsets of a match in a haystack, if one exists. [`Regex::find_iter`] returns an iterator over all such matches. * [`Regex::captures`] returns a [`Captures`], which reports both the byte offsets of a match in a haystack and the byte offsets of each matching capture group from the regex in the haystack. [`Regex::captures_iter`] returns an iterator over all such matches. Otherwise, this top-level crate documentation is organized as follows: * [Usage](#usage) shows how to add the `regex` crate to your Rust project. * [Examples](#examples) provides a limited selection of regex search examples. * [Differences with the regex crate](#differences-with-the-regex-crate) provides a precise description of how `regex-lite` differs from `regex`. * [Syntax](#syntax) enumerates the specific regex syntax supported by this crate. * [Untrusted input](#untrusted-input) discusses how this crate deals with regex patterns or haystacks that are untrusted. # Usage The `regex-lite` crate is [on crates.io](https://crates.io/crates/regex-lite) and can be used by adding `regex-lite` to your dependencies in your project's `Cargo.toml`. Or more simply, just run `cargo add regex-lite`. Here is a complete example that creates a new Rust project, adds a dependency on `regex-lite`, creates the source code for a regex search and then runs the program. First, create the project in a new directory: ```text $ mkdir regex-example $ cd regex-example $ cargo init ``` Second, add a dependency on `regex`: ```text $ cargo add regex-lite ``` Third, edit `src/main.rs`. Delete what's there and replace it with this: ``` use regex_lite::Regex; fn main() { let re = Regex::new(r"Hello (?\w+)!").unwrap(); let Some(caps) = re.captures("Hello Murphy!") else { println!("no match!"); return; }; println!("The name is: {}", &caps["name"]); } ``` Fourth, run it with `cargo run`: ```text $ cargo run Compiling regex-lite v0.1.0 Compiling regex-example v0.1.0 (/tmp/regex-example) Finished dev [unoptimized + debuginfo] target(s) in 4.22s Running `target/debug/regex-example` The name is: Murphy ``` The first time you run the program will show more output like above. But subsequent runs shouldn't have to re-compile the dependencies. # Examples This section provides a few examples, in tutorial style, showing how to search a haystack with a regex. There are more examples throughout the API documentation. Before starting though, it's worth defining a few terms: * A **regex** is a Rust value whose type is `Regex`. We use `re` as a variable name for a regex. * A **pattern** is the string that is used to build a regex. We use `pat` as a variable name for a pattern. * A **haystack** is the string that is searched by a regex. We use `hay` as a variable name for a haystack. 
Sometimes the words "regex" and "pattern" are used interchangeably.

General use of regular expressions in this crate proceeds by compiling a
**pattern** into a **regex**, and then using that regex to search, split or
replace parts of a **haystack**.

### Example: find a middle initial

We'll start off with a very simple example: a regex that looks for a specific
name but uses a wildcard to match a middle initial. Our pattern serves as
something like a template that will match a particular name with *any* middle
initial.

```rust
use regex_lite::Regex;

// We use 'unwrap()' here because it would be a bug in our program if the
// pattern failed to compile to a regex. Panicking in the presence of a bug
// is okay.
let re = Regex::new(r"Homer (.)\. Simpson").unwrap();
let hay = "Homer J. Simpson";
let Some(caps) = re.captures(hay) else { return };
assert_eq!("J", &caps[1]);
```

There are a few things worth noticing here in our first example:

* The `.` is a special pattern meta character that means "match any single
character except for new lines." (More precisely, in this crate, it means
"match any UTF-8 encoding of any Unicode scalar value other than `\n`.")
* We can match an actual `.` literally by escaping it, i.e., `\.`.
* We use Rust's [raw strings] to avoid needing to deal with escape sequences
in both the regex pattern syntax and in Rust's string literal syntax. If we
didn't use raw strings here, we would have had to use `\\.` to match a
literal `.` character. That is, `r"\."` and `"\\."` are equivalent patterns.
* We put our wildcard `.` instruction in parentheses. These parentheses have a
special meaning that says, "make whatever part of the haystack matches within
these parentheses available as a capturing group." After finding a match, we
access this capture group with `&caps[1]`.

[raw strings]: https://doc.rust-lang.org/stable/reference/tokens.html#raw-string-literals

Otherwise, we execute a search using `re.captures(hay)` and return from our
function if no match occurred. We then reference the middle initial by asking
for the part of the haystack that matched the capture group indexed at `1`.
(The capture group at index 0 is implicit and always corresponds to the
entire match. In this case, that's `Homer J. Simpson`.)

### Example: named capture groups

Continuing from our middle initial example above, we can tweak the pattern
slightly to give a name to the group that matches the middle initial:

```rust
use regex_lite::Regex;

// Note that (?P<middle>.) is a different way to spell the same thing.
let re = Regex::new(r"Homer (?<middle>.)\. Simpson").unwrap();
let hay = "Homer J. Simpson";
let Some(caps) = re.captures(hay) else { return };
assert_eq!("J", &caps["middle"]);
```

Giving a name to a group can be useful when there are multiple groups in a
pattern. It makes the code referring to those groups a bit easier to
understand.

### Example: validating a particular date format

This example shows how to confirm whether a haystack, in its entirety,
matches a particular date format:

```rust
use regex_lite::Regex;

let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
assert!(re.is_match("2010-03-14"));
```

Notice the use of the `^` and `$` anchors. In this crate, every regex search
is run with an implicit `(?s:.)*?` at the beginning of its pattern, which
allows the regex to match anywhere in a haystack. Anchors, as above, can be
used to ensure that the full haystack matches a pattern.
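For example, a short sketch of what the anchors buy us (the haystacks here
are made up):

```rust
use regex_lite::Regex;

let re = Regex::new(r"^\d{4}-\d{2}-\d{2}$").unwrap();
// With both anchors, a date embedded in a larger haystack is rejected...
assert!(!re.is_match("the date 2010-03-14 was a Sunday"));
// ...while without them, a match may appear anywhere in the haystack.
let re = Regex::new(r"\d{4}-\d{2}-\d{2}").unwrap();
assert!(re.is_match("the date 2010-03-14 was a Sunday"));
```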
### Example: finding dates in a haystack

In the previous example, we showed how one might validate that a haystack, in
its entirety, corresponded to a particular date format. But what if we wanted
to extract all things that look like dates in a specific format from a
haystack? To do this, we can use an iterator API to find all matches (notice
that we've removed the anchors):

```rust
use regex_lite::Regex;

let re = Regex::new(r"\d{4}-\d{2}-\d{2}").unwrap();
let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?";
// 'm' is a 'Match', and 'as_str()' returns the matching part of the haystack.
let dates: Vec<&str> = re.find_iter(hay).map(|m| m.as_str()).collect();
assert_eq!(dates, vec![
    "1865-04-14",
    "1881-07-02",
    "1901-09-06",
    "1963-11-22",
]);
```

We can also iterate over [`Captures`] values instead of [`Match`] values, and
that in turn permits accessing each component of the date via capturing
groups:

```rust
use regex_lite::Regex;

let re = Regex::new(r"(?<y>\d{4})-(?<m>\d{2})-(?<d>\d{2})").unwrap();
let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?";
let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| {
    // The unwraps are okay because every capture group must match if the
    // whole regex matches, and in this context, we know we have a match.
    //
    // Note that we use `caps.name("y").unwrap().as_str()` instead of
    // `&caps["y"]` because the lifetime of the former is the same as the
    // lifetime of `hay` above, but the lifetime of the latter is tied to the
    // lifetime of `caps` due to how the `Index` trait is defined.
    let year = caps.name("y").unwrap().as_str();
    let month = caps.name("m").unwrap().as_str();
    let day = caps.name("d").unwrap().as_str();
    (year, month, day)
}).collect();
assert_eq!(dates, vec![
    ("1865", "04", "14"),
    ("1881", "07", "02"),
    ("1901", "09", "06"),
    ("1963", "11", "22"),
]);
```

### Example: simpler capture group extraction

One can use [`Captures::extract`] to make the code from the previous example
a bit simpler in this case:

```rust
use regex_lite::Regex;

let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
let hay = "What do 1865-04-14, 1881-07-02, 1901-09-06 and 1963-11-22 have in common?";
let dates: Vec<(&str, &str, &str)> = re.captures_iter(hay).map(|caps| {
    let (_, [year, month, day]) = caps.extract();
    (year, month, day)
}).collect();
assert_eq!(dates, vec![
    ("1865", "04", "14"),
    ("1881", "07", "02"),
    ("1901", "09", "06"),
    ("1963", "11", "22"),
]);
```

`Captures::extract` works by ensuring that the number of matching groups
match the number of groups requested via the `[year, month, day]` syntax. If
they do, then the substrings for each corresponding capture group are
automatically returned in an appropriately sized array. Rust's syntax for
pattern matching arrays does the rest.

### Example: replacement with named capture groups

Building on the previous example, perhaps we'd like to rearrange the date
formats. This can be done by finding each match and replacing it with
something different.
The [`Regex::replace_all`] routine provides a convenient way to do this,
including by supporting references to named groups in the replacement string:

```rust
use regex_lite::Regex;

let re = Regex::new(r"(?<y>\d{4})-(?<m>\d{2})-(?<d>\d{2})").unwrap();
let before = "1973-01-05, 1975-08-25 and 1980-10-18";
let after = re.replace_all(before, "$m/$d/$y");
assert_eq!(after, "01/05/1973, 08/25/1975 and 10/18/1980");
```

The replace methods are actually polymorphic in the replacement, which
provides more flexibility than is seen here. (See the documentation for
[`Regex::replace`] for more details.)

### Example: verbose mode

When your regex gets complicated, you might consider using something other
than regex. But if you stick with regex, you can use the `x` flag to enable
insignificant whitespace mode or "verbose mode." In this mode, whitespace
is treated as insignificant and one may write comments. This may make your
patterns easier to comprehend.

```rust
use regex_lite::Regex;

let re = Regex::new(r"(?x)
  (?P<y>\d{4}) # the year
  -
  (?P<m>\d{2}) # the month
  -
  (?P<d>\d{2}) # the day
").unwrap();

let before = "1973-01-05, 1975-08-25 and 1980-10-18";
let after = re.replace_all(before, "$m/$d/$y");
assert_eq!(after, "01/05/1973, 08/25/1975 and 10/18/1980");
```

If you wish to match against whitespace in this mode, you can still use `\s`,
`\n`, `\t`, etc. For escaping a single space character, you can escape it
directly with `\ `, use its hex character code `\x20` or temporarily disable
the `x` flag, e.g., `(?-x: )`.

# Differences with the `regex` crate

As mentioned in the introduction above, the purpose of this crate is to
prioritize small binary sizes and shorter Rust compilation times as much as
possible. Namely, while the `regex` crate tends to eschew both binary size
and compilation time in favor of faster searches and features, the
`regex-lite` crate gives up faster searches and some functionality in
exchange for smaller binary sizes and faster compilation times.

The precise set of differences at the syntax level:

* The Perl character classes are limited to ASCII codepoints. That is,
`\d` is `[0-9]`, `\s` is `[\t\n\v\f\r ]` and `\w` is `[0-9A-Za-z_]`.
* Unicode character classes of the form `\p{...}` and `\P{...}` are not
supported at all. Note though that things like `[^β]` are still supported and
will match any Unicode scalar value except for `β`.
* Case insensitive searching is limited to ASCII case insensitivity.
* Character class set operations other than union are not supported. That is,
difference (`--`), intersection (`&&`) and symmetric difference (`~~`) are
not available. These tend to be most useful with Unicode classes, which also
aren't available.
* Opt-in octal support is not available in this crate.

And now at the API level:

* Currently, this crate only supports searching `&str`. It does not have APIs
for searching `&[u8]` haystacks, although it is planned to add these in the
future if there's demand.
* There is no `RegexSet` in this crate and there are no plans to add it.
* The `Error` type in this crate is completely opaque.

Other than these things, the `regex-lite` crate is intended to be a drop-in
replacement for the `regex` crate. In most cases, you can just replace `use
regex::Regex;` with `use regex_lite::Regex;` and everything will work.
(Unless you're depending on Unicode support in your regexes.)

# Syntax

The syntax supported in this crate is documented below.

### Matching one character
.             any character except new line (includes new line with s flag)
[0-9]         any ASCII digit
\d            digit ([0-9])
\D            not digit
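A brief sketch of the table above (the haystacks are arbitrary):

```rust
use regex_lite::Regex;

// '.' matches any character except a new line by default...
assert!(Regex::new(r"^.$").unwrap().is_match("β"));
assert!(!Regex::new(r"^.$").unwrap().is_match("\n"));
// ...while '\d' is limited to ASCII digits in this crate.
assert!(Regex::new(r"^\d$").unwrap().is_match("7"));
```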
### Character classes
[xyz]         A character class matching either x, y or z (union).
[^xyz]        A character class matching any character except x, y and z.
[a-z]         A character class matching any character in range a-z.
[[:alpha:]]   ASCII character class ([A-Za-z])
[[:^alpha:]]  Negated ASCII character class ([^A-Za-z])
[\[\]]        Escaping in character classes (matching [ or ])
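A quick sketch of a few of these forms (the haystacks are arbitrary):

```rust
use regex_lite::Regex;

let re = Regex::new(r"^[^xyz]$").unwrap();
assert!(re.is_match("a"));
assert!(!re.is_match("x"));
// A Perl class and an ASCII class unioned inside one bracketed class.
let re = Regex::new(r"^[\s[:digit:]]+$").unwrap();
assert!(re.is_match(" 123 "));
```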
Any ASCII or Perl character class may appear inside a bracketed `[...]`
character class. For example, `[\s[:digit:]]` matches any digit or space
character.

Precedence in character classes, from most binding to least:

1. Ranges: `[a-cd]` == `[[a-c]d]`
2. Union: `[ab&&bc]` == `[[ab]&&[bc]]`
3. Negation: `[^a-z&&b]` == `[^[a-z&&b]]`.

### Composites
xy    concatenation (x followed by y)
x|y   alternation (x or y, prefer x)
This example shows how an alternation works, and what it means to prefer a
branch in the alternation over subsequent branches.

```
use regex_lite::Regex;

let haystack = "samwise";
// If 'samwise' comes first in our alternation, then it is
// preferred as a match, even if the regex engine could
// technically detect that 'sam' led to a match earlier.
let re = Regex::new(r"samwise|sam").unwrap();
assert_eq!("samwise", re.find(haystack).unwrap().as_str());
// But if 'sam' comes first, then it will match instead.
// In this case, it is impossible for 'samwise' to match
// because 'sam' is a prefix of it.
let re = Regex::new(r"sam|samwise").unwrap();
assert_eq!("sam", re.find(haystack).unwrap().as_str());
```

### Repetitions
x*        zero or more of x (greedy)
x+        one or more of x (greedy)
x?        zero or one of x (greedy)
x*?       zero or more of x (ungreedy/lazy)
x+?       one or more of x (ungreedy/lazy)
x??       zero or one of x (ungreedy/lazy)
x{n,m}    at least n x and at most m x (greedy)
x{n,}     at least n x (greedy)
x{n}      exactly n x
x{n,m}?   at least n x and at most m x (ungreedy/lazy)
x{n,}?    at least n x (ungreedy/lazy)
x{n}?     exactly n x
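A short sketch contrasting greedy and lazy repetition (the haystack is
arbitrary):

```rust
use regex_lite::Regex;

let hay = "aaa";
// Greedy: match as much as possible.
assert_eq!("aaa", Regex::new(r"a+").unwrap().find(hay).unwrap().as_str());
// Lazy: match as little as possible.
assert_eq!("a", Regex::new(r"a+?").unwrap().find(hay).unwrap().as_str());
```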
### Empty matches
^               the beginning of a haystack (or start-of-line with multi-line mode)
$               the end of a haystack (or end-of-line with multi-line mode)
\A              only the beginning of a haystack (even with multi-line mode enabled)
\z              only the end of a haystack (even with multi-line mode enabled)
\b              an ASCII word boundary (\w on one side and \W, \A, or \z on other)
\B              not an ASCII word boundary
\b{start}, \<   an ASCII start-of-word boundary (\W|\A on the left, \w on the right)
\b{end}, \>     an ASCII end-of-word boundary (\w on the left, \W|\z on the right)
\b{start-half}  half of an ASCII start-of-word boundary (\W|\A on the left)
\b{end-half}    half of an ASCII end-of-word boundary (\W|\z on the right)
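For example, a minimal sketch of ASCII word boundaries (the haystacks are
arbitrary):

```rust
use regex_lite::Regex;

let re = Regex::new(r"\bcat\b").unwrap();
assert!(re.is_match("a cat sat"));
assert!(!re.is_match("concatenate"));
```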
The empty regex is valid and matches the empty string. For example, the empty
regex matches `abc` at positions `0`, `1`, `2` and `3`. When using the
top-level [`Regex`] on `&str` haystacks, an empty match that splits a
codepoint is guaranteed to never be returned. For example:

```rust
let re = regex_lite::Regex::new(r"").unwrap();
let ranges: Vec<_> = re.find_iter("💩").map(|m| m.range()).collect();
assert_eq!(ranges, vec![0..0, 4..4]);
```

Note that an empty regex is distinct from a regex that can never match. For
example, the regex `[^\s\S]` is a character class that represents the
negation of `[\s\S]`, where the union of `\s` and `\S` corresponds to all
Unicode scalar values. The negation of everything is nothing, which means the
character class is empty. Since nothing is in the empty set, `[^\s\S]`
matches nothing, not even the empty string.

### Grouping and flags
(exp)          numbered capture group (indexed by opening parenthesis)
(?P<name>exp)  named (also numbered) capture group (names must be alpha-numeric)
(?<name>exp)   named (also numbered) capture group (names must be alpha-numeric)
(?:exp)        non-capturing group
(?flags)       set flags within current group
(?flags:exp)   set flags for exp (non-capturing)
Capture group names must be any sequence of alpha-numeric Unicode codepoints,
in addition to `.`, `_`, `[` and `]`. Names must start with either an `_` or
an alphabetic codepoint. Alphabetic codepoints correspond to the `Alphabetic`
Unicode property, while numeric codepoints correspond to the union of the
`Decimal_Number`, `Letter_Number` and `Other_Number` general categories.

Flags are each a single character. For example, `(?x)` sets the flag `x` and
`(?-x)` clears the flag `x`. Multiple flags can be set or cleared at the same
time: `(?xy)` sets both the `x` and `y` flags and `(?x-y)` sets the `x` flag
and clears the `y` flag. All flags are by default disabled unless stated
otherwise. They are:
i     case-insensitive: letters match both upper and lower case
m     multi-line mode: ^ and $ match begin/end of line
s     allow . to match \n
R     enables CRLF mode: when multi-line mode is enabled, \r\n is used
U     swap the meaning of x* and x*?
x     verbose mode, ignores whitespace and allows line comments (starting with `#`)
Note that in verbose mode, whitespace is ignored everywhere, including within
character classes. To insert whitespace, use its escaped form or a hex
literal. For example, `\ ` or `\x20` for an ASCII space.

Flags can be toggled within a pattern. Here's an example that matches
case-insensitively for the first part but case-sensitively for the second
part:

```rust
use regex_lite::Regex;

let re = Regex::new(r"(?i)a+(?-i)b+").unwrap();
let m = re.find("AaAaAbbBBBb").unwrap();
assert_eq!(m.as_str(), "AaAaAbb");
```

Notice that the `a+` matches either `a` or `A`, but the `b+` only matches
`b`.

Multi-line mode means `^` and `$` no longer match just at the beginning/end
of the input, but also at the beginning/end of lines:

```
use regex_lite::Regex;

let re = Regex::new(r"(?m)^line \d+").unwrap();
let m = re.find("line one\nline 2\n").unwrap();
assert_eq!(m.as_str(), "line 2");
```

Note that `^` matches after new lines, even at the end of input:

```
use regex_lite::Regex;

let re = Regex::new(r"(?m)^").unwrap();
let m = re.find_iter("test\n").last().unwrap();
assert_eq!((m.start(), m.end()), (5, 5));
```

When both CRLF mode and multi-line mode are enabled, then `^` and `$` will
match either `\r` and `\n`, but never in the middle of a `\r\n`:

```
use regex_lite::Regex;

let re = Regex::new(r"(?mR)^foo$").unwrap();
let m = re.find("\r\nfoo\r\n").unwrap();
assert_eq!(m.as_str(), "foo");
```

### Escape sequences

Note that this includes all possible escape sequences, even ones that are
documented elsewhere.
\*              literal *, applies to all ASCII except [0-9A-Za-z<>]
\a              bell (\x07)
\f              form feed (\x0C)
\t              horizontal tab
\n              new line
\r              carriage return
\v              vertical tab (\x0B)
\A              matches at the beginning of a haystack
\z              matches at the end of a haystack
\b              word boundary assertion
\B              negated word boundary assertion
\b{start}, \<   start-of-word boundary assertion
\b{end}, \>     end-of-word boundary assertion
\b{start-half}  half of a start-of-word boundary assertion
\b{end-half}    half of an end-of-word boundary assertion
\x7F            hex character code (exactly two digits)
\x{10FFFF}      any hex character code corresponding to a Unicode code point
\u007F          hex character code (exactly four digits)
\u{7F}          any hex character code corresponding to a Unicode code point
\U0000007F      hex character code (exactly eight digits)
\U{7F}          any hex character code corresponding to a Unicode code point
\d, \s, \w      Perl character class
\D, \S, \W      negated Perl character class
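For instance, a small sketch showing that the fixed-width and braced hex
forms denote the same codepoint (U+007F here is an arbitrary choice):

```rust
use regex_lite::Regex;

for pat in [r"^\x7F$", r"^\u007F$", r"^\x{7F}$", r"^\u{7F}$"] {
    assert!(Regex::new(pat).unwrap().is_match("\x7F"));
}
```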
### Perl character classes (ASCII only)

These character classes are short-hands for common groups of characters. In
this crate, `\d`, `\s` and `\w` only consist of ASCII codepoints.
\d     digit ([0-9])
\D     not digit
\s     whitespace ([\t\n\v\f\r ])
\S     not whitespace
\w     word character ([0-9A-Za-z_])
\W     not word character
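A quick sketch of the ASCII-only behavior (the non-ASCII digit `٣` is an
arbitrary choice):

```rust
use regex_lite::Regex;

let re = Regex::new(r"^\d+$").unwrap();
assert!(re.is_match("123"));
// '٣' is a Unicode digit, but '\d' in this crate is ASCII-only.
assert!(!re.is_match("٣"));
```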
### ASCII character classes

These reflect additional groups of characters taken from POSIX regex syntax
that are sometimes useful to have. In this crate, all of these classes only
consist of ASCII codepoints.
[[:alnum:]]    alphanumeric ([0-9A-Za-z])
[[:alpha:]]    alphabetic ([A-Za-z])
[[:ascii:]]    ASCII ([\x00-\x7F])
[[:blank:]]    blank ([\t ])
[[:cntrl:]]    control ([\x00-\x1F\x7F])
[[:digit:]]    digits ([0-9])
[[:graph:]]    graphical ([!-~])
[[:lower:]]    lower case ([a-z])
[[:print:]]    printable ([ -~])
[[:punct:]]    punctuation ([!-/:-@\[-`{-~])
[[:space:]]    whitespace ([\t\n\v\f\r ])
[[:upper:]]    upper case ([A-Z])
[[:word:]]     word characters ([0-9A-Za-z_])
[[:xdigit:]]   hex digit ([0-9A-Fa-f])
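For example, a minimal sketch using one of these classes (the haystacks are
arbitrary):

```rust
use regex_lite::Regex;

let re = Regex::new(r"^[[:xdigit:]]+$").unwrap();
assert!(re.is_match("DEADbeef123"));
assert!(!re.is_match("0xG1"));
```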
# Untrusted input

This crate is meant to be able to run regex searches on untrusted haystacks
without fear of [ReDoS]. This crate also, to a certain extent, supports
untrusted patterns.

[ReDoS]: https://en.wikipedia.org/wiki/ReDoS

This crate differs from most (but not all) other regex engines in that it
doesn't use unbounded backtracking to run a regex search. In those cases, one
generally cannot use untrusted patterns *or* untrusted haystacks because it
can be very difficult to know whether a particular pattern will result in
catastrophic backtracking or not.

We'll first discuss how this crate deals with untrusted inputs and then wrap
it up with a realistic discussion about what practice really looks like.

### Panics

Outside of clearly documented cases, most APIs in this crate are intended to
never panic regardless of the inputs given to them. For example, `Regex::new`,
`Regex::is_match`, `Regex::find` and `Regex::captures` should never panic.
That is, it is an API promise that those APIs will never panic no matter what
inputs are given to them.

With that said, regex engines are complicated beasts, and providing a rock
solid guarantee that these APIs literally never panic is essentially
equivalent to saying, "there are no bugs in this library." That is a bold
claim, and not really one that can be feasibly made with a straight face.

Don't get the wrong impression here. This crate is extensively tested, not
just with unit and integration tests, but also via fuzz testing. For example,
this crate is part of the [OSS-fuzz project]. Panics should be incredibly
rare, but it is possible for bugs to exist, and thus possible for a panic to
occur. If you need a rock solid guarantee against panics, then you should
wrap calls into this library with [`std::panic::catch_unwind`].

It's also worth pointing out that this library will generally panic when
other regex engines would commit undefined behavior. When undefined behavior
occurs, your program might continue as if nothing bad has happened, but it
also might mean your program is open to the worst kinds of exploits. In
contrast, the worst thing a panic can do is a denial of service.

[OSS-fuzz project]: https://android.googlesource.com/platform/external/oss-fuzz/+/refs/tags/android-t-preview-1/projects/rust-regex/
[`std::panic::catch_unwind`]: https://doc.rust-lang.org/std/panic/fn.catch_unwind.html

### Untrusted patterns

The principal way this crate deals with them is by limiting their size by
default. The size limit can be configured via [`RegexBuilder::size_limit`].
The idea of a size limit is that compiling a pattern into a `Regex` will fail
if it becomes "too big." Namely, while *most* resources consumed by compiling
a regex are approximately proportional to the length of the pattern itself,
there is one particular exception to this: counted repetitions. Namely, this
pattern:

```text
a{5}{5}{5}{5}{5}{5}
```

Is equivalent to this pattern:

```text
a{15625}
```

In both of these cases, the actual pattern string is quite small, but the
resulting `Regex` value is quite large. Indeed, as the first pattern shows,
it isn't enough to locally limit the size of each repetition because they can
be stacked in a way that results in exponential growth.

To provide a bit more context, a simplified view of regex compilation looks
like this:

* The pattern string is parsed into a structured representation called an
HIR (high-level intermediate representation). Counted repetitions are not
expanded in this stage.
That is, the size of the HIR is proportional to the
size of the pattern with "reasonable" constant factors. In other words, one
can reasonably limit the memory used by an HIR by limiting the length of the
pattern string.
* The HIR is compiled into a [Thompson NFA]. This is the stage at which
something like `\w{5}` is rewritten to `\w\w\w\w\w`. Thus, this is the stage
at which [`RegexBuilder::size_limit`] is enforced. If the NFA exceeds the
configured size, then this stage will fail.

[Thompson NFA]: https://en.wikipedia.org/wiki/Thompson%27s_construction

The size limit helps avoid two different kinds of exorbitant resource usage:

* It avoids permitting exponential memory usage based on the size of the
pattern string.
* It avoids long search times. This will be discussed in more detail in the
next section, but worst case search time *is* dependent on the size of the
regex. So keeping regexes limited to a reasonable size is also a way of
keeping search times reasonable.

Finally, it's worth pointing out that regex compilation is guaranteed to take
worst case `O(m)` time, where `m` is proportional to the size of regex. The
size of the regex here is *after* the counted repetitions have been expanded.

**Advice for those using untrusted regexes**: limit the pattern length to
something small and expand it as needed. Configure
[`RegexBuilder::size_limit`] to something small and then expand it as needed.

### Untrusted haystacks

The main way this crate guards against searches from taking a long time is by
using algorithms that guarantee a `O(m * n)` worst case time and space bound.
Namely:

* `m` is proportional to the size of the regex, where the size of the regex
includes the expansion of all counted repetitions. (See the previous section
on untrusted patterns.)
* `n` is proportional to the length, in bytes, of the haystack.

In other words, if you consider `m` to be a constant (for example, the regex
pattern is a literal in the source code), then the search can be said to run
in "linear time." Or equivalently, "linear time with respect to the size of
the haystack."

But the `m` factor here is important not to ignore. If a regex is
particularly big, the search times can get quite slow. This is why, in part,
[`RegexBuilder::size_limit`] exists.

**Advice for those searching untrusted haystacks**: As long as your regexes
are not enormous, you should expect to be able to search untrusted haystacks
without fear. If you aren't sure, you should benchmark it. Unlike backtracking
engines, if your regex is so big that it's likely to result in slow searches,
this is probably something you'll be able to observe regardless of what the
haystack is made up of.

### Iterating over matches

One thing that is perhaps easy to miss is that the worst case time complexity
bound of `O(m * n)` applies to methods like [`Regex::is_match`],
[`Regex::find`] and [`Regex::captures`]. It does **not** apply to
[`Regex::find_iter`] or [`Regex::captures_iter`]. Namely, since iterating
over all matches can execute many searches, and each search can scan the
entire haystack, the worst case time complexity for iterators is
`O(m * n^2)`.

One example of where this occurs is when a pattern consists of an
alternation, where an earlier branch of the alternation requires scanning the
entire haystack only to discover that there is no match. It also requires a
later branch of the alternation to have matched at the beginning of the
search. For example, consider the pattern `.*[^A-Z]|[A-Z]` and the haystack
`AAAAA`.
The first search will scan to the end looking for matches of `.*[^A-Z]`, even
though a finite automata engine (as in this crate) knows that `[A-Z]` has
already matched the first character of the haystack. This is due to the
greedy nature of regex searching. That first search will report a match at
the first `A` only after scanning to the end to discover that no other match
exists. The next search then begins at the second `A` and the behavior
repeats.

There is no way to avoid this. This means that if both patterns and haystacks
are untrusted and you're iterating over all matches, you're susceptible to
worst case quadratic time complexity. One possible way to mitigate this
is to switch to the lower level `regex-automata` crate and use its
`meta::Regex` iterator APIs. There, you can configure the search to operate
in "earliest" mode by passing a `Input::new(haystack).earliest(true)` to
`meta::Regex::find_iter` (for example). By enabling this mode, you give up
the normal greedy match semantics of regex searches and instead ask the regex
engine to immediately stop as soon as a match has been found. Enabling this
mode will thus restore the worst case `O(m * n)` time complexity bound, but
at the cost of different semantics.

### Untrusted inputs in practice

While providing a `O(m * n)` worst case time bound on all searches goes a
long way toward preventing [ReDoS], that doesn't mean every search you can
possibly run will complete without burning CPU time. In general, there are a
few ways for the `m * n` time bound to still bite you:

* You are searching an exceptionally long haystack. No matter how you slice
it, a longer haystack will take more time to search.
* Very large regexes can cause searches to be quite slow due to increasing
the size `m` in the worst case `O(m * n)` bound. This is especially true when
they are combined with counted repetitions. While the regex size limit above
will protect you from the most egregious cases, the default size limit still
permits pretty big regexes that can execute more slowly than one might
expect.
* While routines like [`Regex::find`] and [`Regex::captures`] guarantee
worst case `O(m * n)` search time, routines like [`Regex::find_iter`] and
[`Regex::captures_iter`] actually have worst case `O(m * n^2)` search time.
This is because `find_iter` runs many searches, and each search takes worst
case `O(m * n)` time. Thus, iteration of all matches in a haystack has
worst case `O(m * n^2)`. A good example of a pattern that exhibits this is
`(?:A+){1000}|` or even `.*[^A-Z]|[A-Z]`.

In general, untrusted haystacks are easier to stomach than untrusted
patterns. Untrusted patterns give a lot more control to the caller to impact
the performance of a search. Therefore, permitting untrusted patterns means
that your only line of defense is to put a limit on how big `m` (and perhaps
also `n`) can be in `O(m * n)`. `n` is limited by simply inspecting the
length of the haystack while `m` is limited by *both* applying a limit to the
length of the pattern *and* a limit on the compiled size of the regex via
[`RegexBuilder::size_limit`].

It bears repeating: if you're accepting untrusted patterns, it would be a
good idea to start with conservative limits on `m` and `n`, and then
carefully increase them as needed.
*/
#![no_std]
// I'm not ideologically opposed to allowing non-safe code in this crate, but
// IMO it needs really excellent justification. One likely place where this
// could show up is if and when we support a non-std alloc mode.
In that case, // we need some way to synchronize access to a PikeVM cache. That in turn will // likely require rolling our own primitive spin-lock or similar structure. #![forbid(unsafe_code)] #![deny(missing_docs, rustdoc::broken_intra_doc_links)] #![warn(missing_debug_implementations)] // When the main features are disabled, squash dead code warnings. The // alternative is to litter conditional compilation directives everywhere, // which is super annoying. #![cfg_attr(not(feature = "string"), allow(dead_code))] #[cfg(not(feature = "std"))] compile_error!("'std' is currently a required feature, please file an issue"); #[cfg(not(any(target_pointer_width = "32", target_pointer_width = "64")))] compile_error!("not supported on non-{32,64}, please file an issue"); extern crate alloc; #[cfg(any(test, feature = "std"))] extern crate std; #[cfg(feature = "string")] pub use self::string::*; pub use self::{error::Error, hir::escape}; mod error; mod hir; mod int; mod interpolate; mod nfa; mod pikevm; mod pool; #[cfg(feature = "string")] mod string; mod utf8; regex-lite-0.1.6/src/nfa.rs000064400000000000000000000640101046102023000135770ustar 00000000000000use core::{cell::RefCell, mem::size_of}; use alloc::{string::String, sync::Arc, vec, vec::Vec}; use crate::{ error::Error, hir::{self, Hir, HirKind}, int::U32, }; pub(crate) type StateID = u32; #[derive(Clone, Copy, Debug)] pub(crate) struct Config { pub(crate) size_limit: Option, } impl Default for Config { fn default() -> Config { Config { size_limit: Some(10 * (1 << 20)) } } } #[derive(Clone)] pub(crate) struct NFA { /// The pattern string this NFA was generated from. /// /// We put it here for lack of a better place to put it. ¯\_(ツ)_/¯ pattern: String, /// The states that make up this NFA. states: Vec, /// The ID of the start state. start: StateID, /// Whether this NFA can only match at the beginning of a haystack. is_start_anchored: bool, /// Whether this NFA can match the empty string. is_match_empty: bool, /// If every match has the same number of matching capture groups, then /// this corresponds to the number of groups. static_explicit_captures_len: Option, /// A map from capture group name to its corresponding index. cap_name_to_index: CaptureNameMap, /// A map from capture group index to the corresponding name, if one /// exists. cap_index_to_name: Vec>>, /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this /// incrementally. memory_extra: usize, } impl NFA { /// Creates a new NFA from the given configuration and HIR. pub(crate) fn new( config: Config, pattern: String, hir: &Hir, ) -> Result { Compiler::new(config, pattern).compile(hir) } /// Returns the pattern string used to construct this NFA. pub(crate) fn pattern(&self) -> &str { &self.pattern } /// Returns the state corresponding to the given ID. /// /// # Panics /// /// If the ID does not refer to a valid state, then this panics. pub(crate) fn state(&self, id: StateID) -> &State { &self.states[id.as_usize()] } /// Returns the total number of states in this NFA. pub(crate) fn len(&self) -> usize { self.states.len() } /// Returns the ID of the starting state for this NFA. pub(crate) fn start(&self) -> StateID { self.start } /// Returns the capture group index for the corresponding named group. /// If no such group with the given name exists, then `None` is returned. 
pub(crate) fn to_index(&self, name: &str) -> Option { self.cap_name_to_index.get(name).cloned().map(|i| i.as_usize()) } /* /// Returns the capture group name for the corresponding index. /// If no such group with the given index, then `None` is returned. pub(crate) fn to_name(&self, index: usize) -> Option<&str> { self.cap_index_to_name.get(index)?.as_deref() } */ /// Returns an iterator over all of the capture groups, along with their /// names if they exist, in this NFA. pub(crate) fn capture_names(&self) -> CaptureNames<'_> { CaptureNames { it: self.cap_index_to_name.iter() } } /// Returns the total number of capture groups, including the first and /// implicit group, in this NFA. pub(crate) fn group_len(&self) -> usize { self.cap_index_to_name.len() } /// Returns true if and only if this NFA can only match at the beginning of /// a haystack. pub(crate) fn is_start_anchored(&self) -> bool { self.is_start_anchored } /// If the pattern always reports the same number of matching capture groups /// for every match, then this returns the number of those groups. This /// doesn't include the implicit group found in every pattern. pub(crate) fn static_explicit_captures_len(&self) -> Option { self.static_explicit_captures_len } /// Returns the heap memory usage, in bytes, used by this NFA. fn memory_usage(&self) -> usize { (self.states.len() * size_of::()) + (self.cap_index_to_name.len() * size_of::>>()) + self.memory_extra } } impl core::fmt::Debug for NFA { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { writeln!(f, "NFA(")?; writeln!(f, "pattern: {}", self.pattern)?; for (sid, state) in self.states.iter().enumerate() { writeln!(f, "{:07?}: {:?}", sid, state)?; } writeln!(f, ")")?; Ok(()) } } /// An iterator over all capture groups in an NFA. /// /// If a particular group has a name, then it is yielded. Otherwise, `None` /// is yielded. #[derive(Clone, Debug)] pub(crate) struct CaptureNames<'a> { it: core::slice::Iter<'a, Option>>, } impl<'a> Iterator for CaptureNames<'a> { type Item = Option<&'a str>; fn next(&mut self) -> Option> { self.it.next().map(|n| n.as_deref()) } } #[derive(Clone, Eq, PartialEq)] pub(crate) enum State { Char { target: StateID, ch: char }, Ranges { target: StateID, ranges: Vec<(char, char)> }, Splits { targets: Vec, reverse: bool }, Goto { target: StateID, look: Option }, Capture { target: StateID, slot: u32 }, Fail, Match, } impl State { /// Returns the heap memory usage of this NFA state in bytes. fn memory_usage(&self) -> usize { match *self { State::Char { .. } | State::Goto { .. } | State::Capture { .. } | State::Fail { .. } | State::Match => 0, State::Splits { ref targets, .. } => { targets.len() * size_of::() } State::Ranges { ref ranges, .. } => { ranges.len() * size_of::<(char, char)>() } } } /// Returns an iterator over the given split targets. The order of the /// iterator yields elements in reverse when `reverse` is true. 
pub(crate) fn iter_splits<'a>( splits: &'a [StateID], reverse: bool, ) -> impl Iterator + 'a { let mut it = splits.iter(); core::iter::from_fn(move || { if reverse { it.next_back() } else { it.next() }.copied() }) } } impl core::fmt::Debug for State { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match *self { State::Char { target, ch } => { write!(f, "{:?} => {:?}", ch, target) } State::Ranges { target, ref ranges } => { for (i, &(start, end)) in ranges.iter().enumerate() { if i > 0 { write!(f, ", ")?; } write!(f, "{:?}-{:?} => {:?}", start, end, target)?; } Ok(()) } State::Splits { ref targets, reverse } => { write!(f, "splits(")?; for (i, sid) in State::iter_splits(targets, reverse).enumerate() { if i > 0 { write!(f, ", ")?; } write!(f, "{:?}", sid)?; } write!(f, ")") } State::Goto { target, look: None } => { write!(f, "goto({:?})", target) } State::Goto { target, look: Some(look) } => { write!(f, "{:?} => {:?}", look, target) } State::Capture { target, slot } => { write!(f, "capture(slot={:?}) => {:?}", slot, target,) } State::Fail => write!(f, "FAIL"), State::Match => { write!(f, "MATCH") } } } } /// A map from capture group name to its corresponding capture group index. /// /// We define a type alias here so that we can transparently use a `HashMap` /// whenever it's available. We do so presumably because it's faster, although /// there are no benchmarks verifying this. #[cfg(feature = "std")] type CaptureNameMap = std::collections::HashMap, u32>; #[cfg(not(feature = "std"))] type CaptureNameMap = alloc::collections::BTreeMap, u32>; #[derive(Debug)] struct Compiler { config: Config, nfa: RefCell, } impl Compiler { fn new(config: Config, pattern: String) -> Compiler { let nfa = RefCell::new(NFA { pattern, states: vec![], start: 0, is_start_anchored: false, is_match_empty: false, static_explicit_captures_len: None, cap_name_to_index: CaptureNameMap::default(), cap_index_to_name: vec![], memory_extra: 0, }); Compiler { config, nfa } } fn compile(self, hir: &Hir) -> Result { self.nfa.borrow_mut().is_start_anchored = hir.is_start_anchored(); self.nfa.borrow_mut().is_match_empty = hir.is_match_empty(); self.nfa.borrow_mut().static_explicit_captures_len = hir.static_explicit_captures_len(); let compiled = self.c_capture(0, None, hir)?; let mat = self.add(State::Match)?; self.patch(compiled.end, mat)?; self.nfa.borrow_mut().start = compiled.start; Ok(self.nfa.into_inner()) } fn c(&self, hir: &Hir) -> Result { match *hir.kind() { HirKind::Empty => self.c_empty(), HirKind::Char(ch) => self.c_char(ch), HirKind::Class(ref class) => self.c_class(class), HirKind::Look(ref look) => self.c_look(look), HirKind::Repetition(ref rep) => self.c_repetition(rep), HirKind::Capture(ref cap) => { self.c_capture(cap.index, cap.name.as_deref(), &cap.sub) } HirKind::Concat(ref subs) => { self.c_concat(subs.iter().map(|s| self.c(s))) } HirKind::Alternation(ref subs) => { self.c_alternation(subs.iter().map(|s| self.c(s))) } } } /// Compile a "fail" state that can never be transitioned out of. fn c_fail(&self) -> Result { let id = self.add(State::Fail)?; Ok(ThompsonRef { start: id, end: id }) } /// Compile an "empty" state with one unconditional epsilon transition. /// /// Both the `start` and `end` locations point to the state created. /// Callers will likely want to keep the `start`, but patch the `end` to /// point to some other state. fn c_empty(&self) -> Result { let id = self.add_empty()?; Ok(ThompsonRef { start: id, end: id }) } /// Compile the given literal char to an NFA. 
fn c_char(&self, ch: char) -> Result { let id = self.add(State::Char { target: 0, ch })?; Ok(ThompsonRef { start: id, end: id }) } /// Compile the given character class into an NFA. /// /// If the class is empty, then this compiles to a `Fail` state. fn c_class(&self, class: &hir::Class) -> Result { let id = if class.ranges.is_empty() { // Technically using an explicit fail state probably isn't // necessary. Because if you try to match against an empty Ranges, // then it should turn up with nothing regardless of input, and // thus "acts" like a Fail state. But it's better to be more // explicit, and there's no real cost to doing so. self.add(State::Fail) } else { let ranges = class.ranges.iter().map(|r| (r.start, r.end)).collect(); self.add(State::Ranges { target: 0, ranges }) }?; Ok(ThompsonRef { start: id, end: id }) } /// Compile the given HIR look-around assertion to an NFA look-around /// assertion. fn c_look(&self, look: &hir::Look) -> Result { let id = self.add(State::Goto { target: 0, look: Some(*look) })?; Ok(ThompsonRef { start: id, end: id }) } /// Compile the given repetition expression. This handles all types of /// repetitions and greediness. fn c_repetition( &self, rep: &hir::Repetition, ) -> Result { match (rep.min, rep.max) { (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), } } /// Compile the given expression such that it matches at least `min` times, /// but no more than `max` times. /// /// When `greedy` is true, then the preference is for the expression to /// match as much as possible. Otherwise, it will match as little as /// possible. fn c_bounded( &self, hir: &Hir, greedy: bool, min: u32, max: u32, ) -> Result { let prefix = self.c_exactly(hir, min)?; if min == max { return Ok(prefix); } // It is tempting here to compile the rest here as a concatenation // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it // were `aaa?a?a?`. The problem here is that it leads to this program: // // >000000: 61 => 01 // 000001: 61 => 02 // 000002: union(03, 04) // 000003: 61 => 04 // 000004: union(05, 06) // 000005: 61 => 06 // 000006: union(07, 08) // 000007: 61 => 08 // 000008: MATCH // // And effectively, once you hit state 2, the epsilon closure will // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better // to instead compile it like so: // // >000000: 61 => 01 // 000001: 61 => 02 // 000002: union(03, 08) // 000003: 61 => 04 // 000004: union(05, 08) // 000005: 61 => 06 // 000006: union(07, 08) // 000007: 61 => 08 // 000008: MATCH // // So that the epsilon closure of state 2 is now just 3 and 8. let empty = self.add_empty()?; let mut prev_end = prefix.end; for _ in min..max { let splits = self.add(State::Splits { targets: vec![], reverse: !greedy })?; let compiled = self.c(hir)?; self.patch(prev_end, splits)?; self.patch(splits, compiled.start)?; self.patch(splits, empty)?; prev_end = compiled.end; } self.patch(prev_end, empty)?; Ok(ThompsonRef { start: prefix.start, end: empty }) } /// Compile the given expression such that it may be matched `n` or more /// times, where `n` can be any integer. (Although a particularly large /// integer is likely to run afoul of any configured size limits.) /// /// When `greedy` is true, then the preference is for the expression to /// match as much as possible. Otherwise, it will match as little as /// possible. 
fn c_at_least( &self, hir: &Hir, greedy: bool, n: u32, ) -> Result { if n == 0 { // When the expression cannot match the empty string, then we // can get away with something much simpler: just one 'alt' // instruction that optionally repeats itself. But if the expr // can match the empty string... see below. if !hir.is_match_empty() { let splits = self.add(State::Splits { targets: vec![], reverse: !greedy, })?; let compiled = self.c(hir)?; self.patch(splits, compiled.start)?; self.patch(compiled.end, splits)?; return Ok(ThompsonRef { start: splits, end: splits }); } // What's going on here? Shouldn't x* be simpler than this? It // turns out that when implementing leftmost-first (Perl-like) // match semantics, x* results in an incorrect preference order // when computing the transitive closure of states if and only if // 'x' can match the empty string. So instead, we compile x* as // (x+)?, which preserves the correct preference order. // // See: https://github.com/rust-lang/regex/issues/779 let compiled = self.c(hir)?; let plus = self.add(State::Splits { targets: vec![], reverse: !greedy })?; self.patch(compiled.end, plus)?; self.patch(plus, compiled.start)?; let question = self.add(State::Splits { targets: vec![], reverse: !greedy })?; let empty = self.add_empty()?; self.patch(question, compiled.start)?; self.patch(question, empty)?; self.patch(plus, empty)?; Ok(ThompsonRef { start: question, end: empty }) } else if n == 1 { let compiled = self.c(hir)?; let splits = self.add(State::Splits { targets: vec![], reverse: !greedy })?; self.patch(compiled.end, splits)?; self.patch(splits, compiled.start)?; Ok(ThompsonRef { start: compiled.start, end: splits }) } else { let prefix = self.c_exactly(hir, n - 1)?; let last = self.c(hir)?; let splits = self.add(State::Splits { targets: vec![], reverse: !greedy })?; self.patch(prefix.end, last.start)?; self.patch(last.end, splits)?; self.patch(splits, last.start)?; Ok(ThompsonRef { start: prefix.start, end: splits }) } } /// Compile the given expression such that it may be matched zero or one /// times. /// /// When `greedy` is true, then the preference is for the expression to /// match as much as possible. Otherwise, it will match as little as /// possible. fn c_zero_or_one( &self, hir: &Hir, greedy: bool, ) -> Result { let splits = self.add(State::Splits { targets: vec![], reverse: !greedy })?; let compiled = self.c(hir)?; let empty = self.add_empty()?; self.patch(splits, compiled.start)?; self.patch(splits, empty)?; self.patch(compiled.end, empty)?; Ok(ThompsonRef { start: splits, end: empty }) } /// Compile the given HIR expression exactly `n` times. fn c_exactly(&self, hir: &Hir, n: u32) -> Result { self.c_concat((0..n).map(|_| self.c(hir))) } /// Compile the given expression and insert capturing states at the /// beginning and end of it. The slot for the capture states is computed /// from the index. fn c_capture( &self, index: u32, name: Option<&str>, hir: &Hir, ) -> Result { // For discontiguous indices, push placeholders for earlier capture // groups that weren't explicitly added. This can happen, for example, // with patterns like '(a){0}(a)' where '(a){0}' is completely removed // from the pattern. 
let existing_groups_len = self.nfa.borrow().cap_index_to_name.len(); for _ in 0..(index.as_usize().saturating_sub(existing_groups_len)) { self.nfa.borrow_mut().cap_index_to_name.push(None); } if index.as_usize() >= existing_groups_len { if let Some(name) = name { let name = Arc::from(name); let mut nfa = self.nfa.borrow_mut(); nfa.cap_name_to_index.insert(Arc::clone(&name), index); nfa.cap_index_to_name.push(Some(Arc::clone(&name))); // This is an approximation. nfa.memory_extra += name.len() + size_of::(); } else { self.nfa.borrow_mut().cap_index_to_name.push(None); } } let Some(slot) = index.checked_mul(2) else { return Err(Error::new("capture group slots exhausted")); }; let start = self.add(State::Capture { target: 0, slot })?; let inner = self.c(hir)?; let Some(slot) = slot.checked_add(1) else { return Err(Error::new("capture group slots exhausted")); }; let end = self.add(State::Capture { target: 0, slot })?; self.patch(start, inner.start)?; self.patch(inner.end, end)?; Ok(ThompsonRef { start, end }) } /// Compile a concatenation of the sub-expressions yielded by the given /// iterator. If the iterator yields no elements, then this compiles down /// to an "empty" state that always matches. fn c_concat(&self, mut it: I) -> Result where I: Iterator>, { let ThompsonRef { start, mut end } = match it.next() { Some(result) => result?, None => return self.c_empty(), }; for result in it { let compiled = result?; self.patch(end, compiled.start)?; end = compiled.end; } Ok(ThompsonRef { start, end }) } /// Compile an alternation, where each element yielded by the given /// iterator represents an item in the alternation. If the iterator yields /// no elements, then this compiles down to a "fail" state. /// /// In an alternation, expressions appearing earlier are "preferred" at /// match time over expressions appearing later. (This is currently always /// true, as this crate only supports leftmost-first semantics.) fn c_alternation(&self, mut it: I) -> Result where I: Iterator>, { let first = match it.next() { None => return self.c_fail(), Some(result) => result?, }; let second = match it.next() { None => return Ok(first), Some(result) => result?, }; let splits = self.add(State::Splits { targets: vec![], reverse: false })?; let end = self.add_empty()?; self.patch(splits, first.start)?; self.patch(first.end, end)?; self.patch(splits, second.start)?; self.patch(second.end, end)?; for result in it { let compiled = result?; self.patch(splits, compiled.start)?; self.patch(compiled.end, end)?; } Ok(ThompsonRef { start: splits, end }) } /// A convenience routine for adding an empty state, also known as an /// unconditional epsilon transition. These are quite useful for making /// NFA construction simpler. /// /// (In the regex crate, we do a second pass to remove these, but don't /// bother with that here.) fn add_empty(&self) -> Result { self.add(State::Goto { target: 0, look: None }) } /// The common implementation of "add a state." It handles the common /// error cases of state ID exhausting (by owning state ID allocation) and /// whether the size limit has been exceeded. fn add(&self, state: State) -> Result { let id = u32::try_from(self.nfa.borrow().states.len()) .map_err(|_| Error::new("exhausted state IDs, too many states"))?; self.nfa.borrow_mut().memory_extra += state.memory_usage(); self.nfa.borrow_mut().states.push(state); self.check_size_limit()?; Ok(id) } /// Add a transition from one state to another. 
/// /// This routine is called "patch" since it is very common to add the /// states you want, typically with "dummy" state ID transitions, and then /// "patch" in the real state IDs later. This is because you don't always /// know all of the necessary state IDs to add because they might not /// exist yet. /// /// # Errors /// /// This may error if patching leads to an increase in heap usage beyond /// the configured size limit. Heap usage only grows when patching adds a /// new transition (as in the case of a "splits" state). fn patch(&self, from: StateID, to: StateID) -> Result<(), Error> { let mut new_memory_extra = self.nfa.borrow().memory_extra; match self.nfa.borrow_mut().states[from.as_usize()] { State::Char { ref mut target, .. } => { *target = to; } State::Ranges { ref mut target, .. } => { *target = to; } State::Splits { ref mut targets, .. } => { targets.push(to); new_memory_extra += size_of::<StateID>(); } State::Goto { ref mut target, .. } => { *target = to; } State::Capture { ref mut target, .. } => { *target = to; } State::Fail | State::Match => {} } if new_memory_extra != self.nfa.borrow().memory_extra { self.nfa.borrow_mut().memory_extra = new_memory_extra; self.check_size_limit()?; } Ok(()) } /// Checks that the current heap memory usage of the NFA being compiled /// doesn't exceed the configured size limit. If it does, an error is /// returned. fn check_size_limit(&self) -> Result<(), Error> { if let Some(limit) = self.config.size_limit { if self.nfa.borrow().memory_usage() > limit { return Err(Error::new("compiled regex exceeded size limit")); } } Ok(()) } } /// A value that represents the result of compiling a sub-expression of a /// regex's HIR. Specifically, this represents a sub-graph of the NFA that /// has an initial state at `start` and a final state at `end`. #[derive(Clone, Copy, Debug)] struct ThompsonRef { start: StateID, end: StateID, } regex-lite-0.1.6/src/pikevm.rs000064400000001117021046102023000143270ustar 00000000000000use alloc::{vec, vec::Vec}; use crate::{ int::{NonMaxUsize, U32}, nfa::{State, StateID, NFA}, pool::CachePoolGuard, utf8, }; /// A PikeVM searcher. /// /// A PikeVM uses the standard Thompson NFA linear time search algorithm, but /// augmented to support tracking the offsets of matching capture groups. #[derive(Clone, Debug)] pub(crate) struct PikeVM { nfa: NFA, } impl PikeVM { /// Create a new PikeVM searcher that uses the given NFA. pub(crate) fn new(nfa: NFA) -> PikeVM { PikeVM { nfa } } /// Return the underlying NFA used by this PikeVM. pub(crate) fn nfa(&self) -> &NFA { &self.nfa } /// Returns an iterator of non-overlapping matches in the given haystack. pub(crate) fn find_iter<'r, 'h>( &'r self, cache: CachePoolGuard<'r>, haystack: &'h [u8], ) -> FindMatches<'r, 'h> { FindMatches { pikevm: self, cache, haystack, at: 0, slots: vec![None, None], last_match_end: None, } } /// Returns an iterator of non-overlapping capture matches in the given /// haystack. pub(crate) fn captures_iter<'r, 'h>( &'r self, cache: CachePoolGuard<'r>, haystack: &'h [u8], ) -> CapturesMatches<'r, 'h> { // OK because the NFA wouldn't have compiled if this could overflow. let len = self.nfa().group_len().checked_mul(2).unwrap(); CapturesMatches { it: FindMatches { pikevm: self, cache, haystack, at: 0, slots: vec![None; len], last_match_end: None, }, } } /// The implementation of standard leftmost search. /// /// Capturing group spans are written to `slots`, but only if requested. /// `slots` can be any length.
Any slot in the NFA that is activated but /// which is out of bounds for the given `slots` is ignored. pub(crate) fn search( &self, cache: &mut Cache, haystack: &[u8], start: usize, end: usize, earliest: bool, slots: &mut [Option<NonMaxUsize>], ) -> bool { cache.setup_search(slots.len()); if start > end { return false; } // Why do we even care about this? Well, in our `slots` representation, // we use usize::MAX as a sentinel to indicate "no match." This isn't // problematic so long as our haystack doesn't have a maximal length. // Byte slices are guaranteed by Rust to have a length that fits into // isize, and so this assert should always pass. But we put it here to // make our assumption explicit. assert!( haystack.len() < core::usize::MAX, "byte slice lengths must be less than usize MAX", ); let Cache { ref mut stack, ref mut curr, ref mut next } = cache; let start_id = self.nfa().start(); let anchored = self.nfa().is_start_anchored(); let mut matched = false; // Yes, our search doesn't end at `end`, but includes it. This is // necessary because matches are delayed by one byte. The delay is used // to handle look-behind assertions. In the case of the PikeVM, the // delay is implemented by not considering a match to exist until it // is visited in `nexts`. Technically, we know a match exists in the // previous iteration via `epsilon_closure`. let mut at = start; while at <= end { // If we have no states left to visit, then there are some cases // where we know we can quit early or even skip ahead. if curr.set.is_empty() { // We have a match so we can quit. if matched { break; } // If we're running an anchored search and we've advanced // beyond the start position with no other states to try, then // we will never observe a match and thus can stop. if anchored && at > start { break; } } // Instead of using a hypothetical unanchored start state in the // NFA (which doesn't exist, but we could add it), we actually // always use its anchored starting state. As a result, when doing // an unanchored search, we need to simulate our own '(?s:.)*?' // prefix, to permit a match to appear anywhere. // // Now, we don't *have* to do things this way. We could create // a proper unanchored start state in the NFA and do one // `epsilon_closure` call from that starting state before the main // loop here. And that is just as correct. However, it turns out to // be slower than our approach here because it slightly increases // the cost of processing each byte by requiring us to visit // more NFA states to deal with the additional NFA states in the // unanchored prefix. By simulating it explicitly here, we lower // those costs substantially. The cost is itself small, but it adds // up for large haystacks. // // In order to simulate the '(?s:.)*?' prefix---which is not // greedy---we are careful not to perform an epsilon closure on // the start state if we already have a match. Namely, if we // did otherwise, we would never reach a terminating condition // because there would always be additional states to process. if !matched { // Since we are adding to the 'curr' active states and since // this is for the start ID, we use a slots slice that is // guaranteed to have the right length but where every element // is absent. This is exactly what we want, because this // epsilon closure is responsible for simulating an unanchored // '(?s:.)*?' prefix. It is specifically outside of any // capturing groups, and thus, using slots that are always // absent is correct.
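            // (A hedged illustration: for the pattern 'a' and haystack
            // "za", this re-seeding adds the start state's closure at
            // offsets 0 and 1, which is exactly what an explicit
            // '(?s:.)*?' prefix would do, minus the cost of stepping the
            // prefix's extra states on every byte.)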
// // Note though that we can't just use `&mut []` here, since // this epsilon closure may traverse through `Capture` state // transitions, and thus must be able to write offsets to the // slots given which are later copied to slot values in `curr`. let slots = next.slot_table.all_absent(); self.epsilon_closure( stack, slots, curr, haystack, at, start_id, ); } let (ch, len) = utf8::decode_lossy(&haystack[at..]); if self.nexts(stack, curr, next, haystack, at, ch, len, slots) { matched = true; } // Unless the caller asked us to return early, we need to mush // on to see if we can extend our match. (But note that 'nexts' // will quit right after seeing a match, as is consistent with // leftmost-first match priority.) if (earliest && matched) || len == 0 { break; } core::mem::swap(curr, next); next.set.clear(); at += len; } matched } /// Process the active states in 'curr' to find the states (written to /// 'next') we should process for the next byte in the haystack. /// /// 'stack' is used to perform a depth first traversal of the NFA when /// computing an epsilon closure. /// /// When a match is found, the slots for that match state (in 'curr') are /// copied to `slots`. Moreover, once a match is seen, processing for 'curr' /// stops (unless the PikeVM was configured with MatchKind::All semantics). /// /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at` /// in `haystack`. /// /// `at_len` is the number of bytes consumed by `at_ch`. This is usually /// equal to `at_ch.len_utf8()`, but not always. For example, in the case /// where `at_ch` is the replacement codepoint that results from decoding /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3. fn nexts( &self, stack: &mut Vec<FollowEpsilon>, curr: &mut ActiveStates, next: &mut ActiveStates, haystack: &[u8], at: usize, at_ch: char, at_len: usize, slots: &mut [Option<NonMaxUsize>], ) -> bool { let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { if self.next( stack, slot_table, next, haystack, at, at_ch, at_len, sid, ) { slots.copy_from_slice(slot_table.for_state(sid)); return true; } } false } /// Starting from `sid`, if the position `at` in the `haystack` has a /// transition defined out of `sid`, then add the state transitioned to and /// its epsilon closure to the `next` set of states to explore. /// /// `stack` is used by the epsilon closure computation to perform a depth /// first traversal of the NFA. /// /// `curr_slot_table` should be the table of slots for the current set of /// states being explored. If there is a transition out of `sid`, then /// sid's row in the slot table is used to perform the epsilon closure. /// /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at` /// in `haystack`. The caller provides it so that this routine doesn't /// need to re-decode it. (Since it's expected that this routine is called /// multiple times for each position.) /// /// `at_len` is the number of bytes consumed by `at_ch`. This is usually /// equal to `at_ch.len_utf8()`, but not always. For example, in the case /// where `at_ch` is the replacement codepoint that results from decoding /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3. fn next( &self, stack: &mut Vec<FollowEpsilon>, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, haystack: &[u8], at: usize, at_ch: char, at_len: usize, sid: StateID, ) -> bool { match *self.nfa.state(sid) { State::Fail | State::Goto { .. } | State::Splits { .. } | State::Capture { ..
} => false, State::Char { target, ch } => { if at_ch == ch && at_len > 0 { let slots = curr_slot_table.for_state(sid); // OK because `at_len` is always derived from the number // of bytes read from `at` that make up `at_ch`. So this // will never wrap. let at = at.wrapping_add(at_len); self.epsilon_closure( stack, slots, next, haystack, at, target, ); } false } State::Ranges { target, ref ranges } => { for (start, end) in ranges.iter().copied() { if start > at_ch { break; } else if start <= at_ch && at_ch <= end { if at_len == 0 { return false; } let slots = curr_slot_table.for_state(sid); // OK because `at_len` is always derived from the // number of bytes read from `at` that make up `at_ch`. // So this will never wrap. let at = at.wrapping_add(at_len); self.epsilon_closure( stack, slots, next, haystack, at, target, ); } } false } State::Match => true, } } /// Compute the epsilon closure of `sid`, writing the closure into `next` /// while copying slot values from `curr_slots` into corresponding states /// in `next`. `curr_slots` should be the slot values corresponding to /// `sid`. /// /// The given `stack` is used to perform a depth first traversal of the /// NFA by recursively following all epsilon transitions out of `sid`. /// Conditional epsilon transitions are followed if and only if they are /// satisfied for the position `at` in the haystack. /// /// While this routine may write to `curr_slots`, once it returns, any /// writes are undone and the original values (even if absent) are /// restored. fn epsilon_closure( &self, stack: &mut Vec<FollowEpsilon>, curr_slots: &mut [Option<NonMaxUsize>], next: &mut ActiveStates, haystack: &[u8], at: usize, sid: StateID, ) { stack.push(FollowEpsilon::Explore(sid)); while let Some(frame) = stack.pop() { match frame { FollowEpsilon::RestoreCapture { slot, offset } => { curr_slots[slot.as_usize()] = offset; } FollowEpsilon::Explore(sid) => { self.epsilon_closure_explore( stack, curr_slots, next, haystack, at, sid, ); } } } } /// Explore all of the epsilon transitions out of `sid`. This is mostly /// split out from `epsilon_closure` in order to clearly delineate /// the actual work of computing an epsilon closure from the stack /// book-keeping. /// /// This will push any additional explorations needed on to `stack`. /// /// `curr_slots` should refer to the slots for the currently active NFA /// state. That is, the current state we are stepping through. These /// slots are mutated in place as new `Capture` states are traversed /// during epsilon closure, but the slots are restored to their original /// values once the full epsilon closure is completed. The ultimate use of /// `curr_slots` is to copy them to the corresponding `next_slots`, so that /// the capturing group spans are forwarded from the currently active state /// to the next. /// /// `next` refers to the next set of active states. Computing an epsilon /// closure may increase the next set of active states. /// /// `haystack` refers to what we're searching and `at` refers to the /// current position in the haystack. These are used to check whether /// conditional epsilon transitions (like look-around) are satisfied at /// the current position. If they aren't, then the epsilon closure won't /// include them.
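    /// (A small worked sketch: for a `State::Splits` with targets
    /// `[s1, s2, s3]` and `reverse: false`, the code below continues
    /// directly into `s1` and pushes `Explore(s3)` then `Explore(s2)`,
    /// so `s2` pops before `s3` and the left-to-right preference order
    /// of the alternation is preserved.)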
fn epsilon_closure_explore( &self, stack: &mut Vec<FollowEpsilon>, curr_slots: &mut [Option<NonMaxUsize>], next: &mut ActiveStates, haystack: &[u8], at: usize, mut sid: StateID, ) { // We can avoid pushing some state IDs on to our stack in precisely // the cases where a 'push(x)' would be immediately followed by a 'x // = pop()'. This is achieved by this outer-loop. We simply set 'sid' // to be the next state ID we want to explore once we're done with // our initial exploration. In practice, this avoids a lot of stack // thrashing. loop { // Record this state as part of our next set of active states. If // we've already explored it, then no need to do it again. if !next.set.insert(sid) { return; } match *self.nfa.state(sid) { State::Fail | State::Match { .. } | State::Char { .. } | State::Ranges { .. } => { next.slot_table.for_state(sid).copy_from_slice(curr_slots); return; } State::Goto { target, look: None } => { sid = target; } State::Goto { target, look: Some(look) } => { if !look.is_match(haystack, at) { return; } sid = target; } State::Splits { ref targets, reverse: false } => { sid = match targets.get(0) { None => return, Some(&sid) => sid, }; stack.extend( targets[1..] .iter() .copied() .rev() .map(FollowEpsilon::Explore), ); } State::Splits { ref targets, reverse: true } => { sid = match targets.last() { None => return, Some(&sid) => sid, }; stack.extend( targets[..targets.len() - 1] .iter() .copied() .map(FollowEpsilon::Explore), ); } State::Capture { target, slot } => { // There's no need to do anything with slots that // ultimately won't be copied into the caller-provided // 'Captures' value. So we just skip dealing with them at // all. if slot.as_usize() < curr_slots.len() { stack.push(FollowEpsilon::RestoreCapture { slot, offset: curr_slots[slot.as_usize()], }); // OK because length of a slice must fit into an isize. curr_slots[slot.as_usize()] = Some(NonMaxUsize::new(at).unwrap()); } sid = target; } } } } } /// An iterator over all successive non-overlapping matches in a particular /// haystack. `'r` represents the lifetime of the regex and `'h` represents /// the lifetime of the haystack. (The cache is owned by the iterator.) #[derive(Debug)] pub(crate) struct FindMatches<'r, 'h> { pikevm: &'r PikeVM, cache: CachePoolGuard<'r>, haystack: &'h [u8], at: usize, slots: Vec<Option<NonMaxUsize>>, last_match_end: Option<usize>, } impl<'r, 'h> Iterator for FindMatches<'r, 'h> { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { if !self.pikevm.search( &mut self.cache, self.haystack, self.at, self.haystack.len(), false, &mut self.slots, ) { return None; } let mut m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get()); if m.0 >= m.1 { m = self.handle_overlapping_empty_match(m)?; } self.at = m.1; self.last_match_end = Some(m.1); Some(m) } } impl<'r, 'h> FindMatches<'r, 'h> { /// Handles the special case of an empty match by ensuring that 1) the /// iterator always advances and 2) empty matches never overlap with other /// matches. /// /// Note that we mark this cold and forcefully prevent inlining because /// handling empty matches like this is extremely rare and does require a /// bit of code, comparatively. Keeping this code out of the main iterator /// function keeps it smaller and more amenable to inlining itself.
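    /// (For intuition, a hedged example: the pattern `a*` over "ab" first
    /// yields `0..1`. The next search finds the empty match `1..1`, which
    /// abuts the previous end, so this routine skips over the 'b' and
    /// yields `2..2` instead, guaranteeing forward progress.)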
#[cold] #[inline(never)] fn handle_overlapping_empty_match( &mut self, mut m: (usize, usize), ) -> Option<(usize, usize)> { assert!(m.0 >= m.1); if Some(m.1) == self.last_match_end { let len = core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1); self.at = self.at.checked_add(len).unwrap(); if !self.pikevm.search( &mut self.cache, self.haystack, self.at, self.haystack.len(), false, &mut self.slots, ) { return None; } m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get()); } Some(m) } } /// An iterator over all successive non-overlapping capture matches in a particular /// haystack. `'r` represents the lifetime of the regex and `'h` represents /// the lifetime of the haystack. (The cache is owned by the iterator.) #[derive(Debug)] pub(crate) struct CapturesMatches<'r, 'h> { it: FindMatches<'r, 'h>, } impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> { type Item = Vec<Option<NonMaxUsize>>; fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> { self.it.next()?; Some(self.it.slots.clone()) } } /// A cache represents mutable state that a `PikeVM` requires during a search. /// /// For a given `PikeVM`, its corresponding cache may be created either via /// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every /// way, except the former does not require explicitly importing `Cache`. /// /// A particular `Cache` is coupled with the `PikeVM` from which it was /// created. It may only be used with that `PikeVM`. A cache and its /// allocations may be re-purposed via `Cache::reset`, in which case, it can /// only be used with the new `PikeVM` (and not the old one). #[derive(Clone, Debug)] pub(crate) struct Cache { /// Stack used while computing epsilon closure. This effectively lets us /// move what is more naturally expressed through recursion to a stack /// on the heap. stack: Vec<FollowEpsilon>, /// The current active states being explored for the current byte in the /// haystack. curr: ActiveStates, /// The next set of states we're building that will be explored for the /// next byte in the haystack. next: ActiveStates, } impl Cache { /// Create a new `PikeVM` cache. /// /// A potentially more convenient routine to create a cache is /// `PikeVM::create_cache`, as it does not require also importing the /// `Cache` type. /// /// If you want to reuse the returned `Cache` with some other `PikeVM`, /// then you must call `Cache::reset` with the desired `PikeVM`. pub(crate) fn new(re: &PikeVM) -> Cache { Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), } } /// Clears this cache. This should be called at the start of every search /// to ensure we start with a clean slate. /// /// This also sets the length of the capturing groups used in the current /// search. This permits an optimization where by 'SlotTable::for_state' /// only returns the number of slots equivalent to the number of slots /// given in the 'Captures' value. This may be less than the total number /// of possible slots, e.g., when one only wants to track overall match /// offsets. This in turn permits less copying of capturing group spans /// in the PikeVM. fn setup_search(&mut self, captures_slot_len: usize) { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); } } /// A set of active states used to "simulate" the execution of an NFA via the /// PikeVM. /// /// There are two sets of these used during NFA simulation. One set corresponds /// to the "current" set of states being traversed for the current position /// in a haystack.
The other set corresponds to the "next" set of states being /// built, which will become the new "current" set for the next position in the /// haystack. These two sets correspond to CLIST and NLIST in Thompson's /// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387 /// /// In addition to representing a set of NFA states, this also maintains slot /// values for each state. These slot values are what turn the NFA simulation /// into the "Pike VM." Namely, they track capturing group values for each /// state. During the computation of epsilon closure, we copy slot values from /// states in the "current" set to the "next" set. Eventually, once a match /// is found, the slot values for that match state are what we write to the /// caller provided slots. #[derive(Clone, Debug)] struct ActiveStates { /// The set of active NFA states. This set preserves insertion order, which /// is critical for simulating the match semantics of backtracking regex /// engines. set: SparseSet, /// The slots for every NFA state, where each slot stores a (possibly /// absent) offset. Every capturing group has two slots. One for a start /// offset and one for an end offset. slot_table: SlotTable, } impl ActiveStates { /// Create a new set of active states for the given PikeVM. The active /// states returned may only be used with the given PikeVM. (Use 'reset' /// to re-purpose the allocation for a different PikeVM.) fn new(re: &PikeVM) -> ActiveStates { let mut active = ActiveStates { set: SparseSet::new(0), slot_table: SlotTable::new(), }; active.reset(re); active } /// Reset this set of active states such that it can be used with the given /// PikeVM (and only that PikeVM). fn reset(&mut self, re: &PikeVM) { self.set.resize(re.nfa().len()); self.slot_table.reset(re); } /// Setup this set of active states for a new search. The given slot /// length should be the number of slots in a caller provided 'Captures' /// (and may be zero). fn setup_search(&mut self, captures_slot_len: usize) { self.set.clear(); self.slot_table.setup_search(captures_slot_len); } } /// A table of slots, where each row represents a state in an NFA. Thus, the /// table has room for storing slots for every single state in an NFA. /// /// This table is represented with a single contiguous allocation. In general, /// the notion of "capturing group" doesn't really exist at this level of /// abstraction, hence the name "slot" instead. (Indeed, every capturing group /// maps to a pair of slots, one for the start offset and one for the end /// offset.) Slots are indexed by the `Capture` NFA state. #[derive(Clone, Debug)] struct SlotTable { /// The actual table of offsets. table: Vec<Option<NonMaxUsize>>, /// The number of slots per state, i.e., the table's stride or the length /// of each row. slots_per_state: usize, /// The number of slots in the caller-provided `Captures` value for the /// current search. Setting this to `slots_per_state` is always correct, /// but may be wasteful. slots_for_captures: usize, } impl SlotTable { /// Create a new slot table. /// /// One should call 'reset' with the corresponding PikeVM before use. fn new() -> SlotTable { SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } } /// Reset this slot table such that it can be used with the given PikeVM /// (and only that PikeVM). fn reset(&mut self, re: &PikeVM) { let nfa = re.nfa(); // OK because NFA construction would have failed if this overflowed.
self.slots_per_state = nfa.group_len().checked_mul(2).unwrap(); // This is always correct, but may be reduced for a particular search // if fewer slots were given by the caller, e.g., none at all or only // slots for tracking the overall match instead of all slots for every // group. self.slots_for_captures = self.slots_per_state; let len = nfa .len() // We add 1 so that our last row is always empty. We use it as // "scratch" space for computing the epsilon closure off of the // starting state. .checked_add(1) .and_then(|x| x.checked_mul(self.slots_per_state)) // It seems like this could actually panic on legitimate inputs // on 32-bit targets. Should we somehow convert this to an error? // What about something similar for the lazy DFA cache? If you're // tripping this assert, please file a bug. .expect("slot table length doesn't overflow"); self.table.resize(len, None); } /// Perform any per-search setup for this slot table. /// /// In particular, this sets the number of slots used in the slots given /// by the caller (if any at all). This number may be smaller /// than the total number of slots available, e.g., when the caller is only /// interested in tracking the overall match and not the spans of every /// matching capturing group. Only tracking the overall match can save a /// substantial amount of time copying capturing spans during a search. fn setup_search(&mut self, captures_slot_len: usize) { self.slots_for_captures = captures_slot_len; } /// Return a mutable slice of the slots for the given state. /// /// Note that the length of the slice returned may be less than the total /// number of slots available for this state. In particular, the length /// always matches the number of slots indicated via `setup_search`. fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] { let i = sid.as_usize() * self.slots_per_state; &mut self.table[i..i + self.slots_for_captures] } /// Return a slice of slots of appropriate length where every slot offset /// is guaranteed to be absent. This is useful in cases where you need to /// compute an epsilon closure outside of the user supplied regex, and thus /// never want it to have any capturing slots set. fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] { let i = self.table.len() - self.slots_per_state; &mut self.table[i..i + self.slots_for_captures] } } /// Represents a stack frame for use while computing an epsilon closure. /// /// (An "epsilon closure" refers to the set of reachable NFA states from a /// single state without consuming any input. That is, the set of all epsilon /// transitions not only from that single state, but from every other state /// reachable by an epsilon transition as well. This is why it's called a /// "closure.") /// /// Computing the epsilon closure in a Thompson NFA proceeds via a depth /// first traversal over all epsilon transitions from a particular state. /// (A depth first traversal is important because it emulates the same priority /// of matches that is typically found in backtracking regex engines.) This /// depth first traversal is naturally expressed using recursion, but to avoid /// a call stack size proportional to the size of a regex, we put our stack on /// the heap instead. /// /// This stack thus consists of call frames. The typical call frame is /// `Explore`, which instructs epsilon closure to explore the epsilon /// transitions from that state. (Subsequent epsilon transitions are then /// pushed on to the stack as more `Explore` frames.)
If the state ID being /// explored has no epsilon transitions, then the capturing group slots are /// copied from the original state that sparked the epsilon closure (from the /// `next` routine) to the state ID being explored. This way, capturing group /// slots are forwarded from the previous state to the next. /// /// The other stack frame, `RestoreCapture`, instructs the epsilon closure to /// set the position for a particular slot back to some particular offset. This /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will /// set the offset of the slot indicated in `Capture` to the current offset, /// and then push the old offset on to the stack as a `RestoreCapture` frame. /// Thus, the new offset is only used until the epsilon closure reverts back to /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon /// transition its "scope" to only states that come "after" it during depth /// first traversal. #[derive(Clone, Debug)] enum FollowEpsilon { /// Explore the epsilon transitions from a state ID. Explore(StateID), /// Reset the given `slot` to the given `offset` (which might be `None`). RestoreCapture { slot: u32, offset: Option<NonMaxUsize> }, } /// A sparse set used for representing ordered NFA states. /// /// This supports constant time addition and membership testing. Clearing an /// entire set can also be done in constant time. Iteration yields elements /// in the order in which they were inserted. /// /// The data structure is based on: https://research.swtch.com/sparse /// Note though that we don't actually use uninitialized memory. We generally /// reuse sparse sets, so the initial allocation cost is bearable. However, its /// other properties listed above are extremely useful. #[derive(Clone)] struct SparseSet { /// The number of elements currently in this set. len: usize, /// Dense contains the ids in the order in which they were inserted. dense: Vec<StateID>, /// Sparse maps ids to their location in dense. /// /// A state ID is in the set if and only if /// sparse[id] < len && id == dense[sparse[id]]. /// /// Note that these are indices into 'dense'. It's a little weird to use /// StateID here, but we know our length can never exceed the bounds of /// StateID (enforced by 'resize') and StateID will be at most 4 bytes /// whereas a usize is likely double that in most cases. sparse: Vec<StateID>, } impl SparseSet { /// Create a new sparse set with the given capacity. /// /// Sparse sets have a fixed size and they cannot grow. Attempting to /// insert more distinct elements than the total capacity of the set will /// result in a panic. /// /// This panics if the capacity given is bigger than `StateID::LIMIT`. fn new(capacity: usize) -> SparseSet { let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; set.resize(capacity); set } /// Resizes this sparse set to have the new capacity given. /// /// This set is automatically cleared. /// /// This panics if the capacity given is bigger than `StateID::LIMIT`. fn resize(&mut self, new_capacity: usize) { assert!( new_capacity <= u32::MAX.as_usize(), "sparse set capacity cannot exceed {:?}", u32::MAX, ); self.clear(); self.dense.resize(new_capacity, 0); self.sparse.resize(new_capacity, 0); } /// Returns the capacity of this set. /// /// The capacity represents a fixed limit on the number of distinct /// elements that are allowed in this set. The capacity cannot be changed. fn capacity(&self) -> usize { self.dense.len() } /// Returns the number of elements in this set.
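    // (Worked sketch of the `sparse`/`dense` invariant documented above:
    // inserting ids 7 then 3 into a fresh set gives len == 2, dense
    // beginning with [7, 3], sparse[7] == 0 and sparse[3] == 1. A
    // never-inserted id like 5 is rejected because dense[sparse[5]] != 5,
    // even when sparse[5] holds stale data from an earlier search.)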
fn len(&self) -> usize { self.len } /// Returns true if and only if this set is empty. fn is_empty(&self) -> bool { self.len() == 0 } /// Insert the state ID value into this set and return true if the given /// state ID was not previously in this set. /// /// This operation is idempotent. If the given value is already in this /// set, then this is a no-op. /// /// If more than `capacity` ids are inserted, then this panics. fn insert(&mut self, id: StateID) -> bool { if self.contains(id) { return false; } let index = self.len(); assert!( index < self.capacity(), "{:?} exceeds capacity of {:?} when inserting {:?}", index, self.capacity(), id, ); self.dense[index] = id; // OK because we don't permit the capacity to be set higher than // u32::MAX. self.sparse[id.as_usize()] = u32::try_from(index).unwrap(); self.len += 1; true } /// Returns true if and only if this set contains the given value. fn contains(&self, id: StateID) -> bool { let index = self.sparse[id.as_usize()]; index.as_usize() < self.len() && self.dense[index.as_usize()] == id } /// Clear this set such that it has no members. fn clear(&mut self) { self.len = 0; } /// Returns an iterator over all the state IDs in this set in the order in /// which they were inserted. fn iter(&self) -> SparseSetIter<'_> { SparseSetIter(self.dense[..self.len()].iter()) } } impl core::fmt::Debug for SparseSet { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let elements: Vec<StateID> = self.iter().collect(); f.debug_tuple("SparseSet").field(&elements).finish() } } /// An iterator over all elements in a sparse set. /// /// The lifetime `'a` refers to the lifetime of the set being iterated over. #[derive(Debug)] struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>); impl<'a> Iterator for SparseSetIter<'a> { type Item = StateID; fn next(&mut self) -> Option<StateID> { self.0.next().map(|&id| id) } } regex-lite-0.1.6/src/pool.rs000064400000000101651046102023000140060ustar 00000000000000use core::panic::{RefUnwindSafe, UnwindSafe}; use alloc::{boxed::Box, vec, vec::Vec}; use crate::pikevm; // Literally the only reason that this crate requires 'std' currently. // // In regex-automata, we support the no-std use case by rolling our own // spin-lock based Mutex. That's questionable on its own, but it's not clear if // we should be doing that here. It will require introducing non-safe code in a // crate that is otherwise safe. But maybe it's worth doing? use std::sync::Mutex; /// A type alias for our pool of meta::Cache that fixes the type parameters to /// what we use for the meta regex below. pub(crate) type CachePool = Pool<pikevm::Cache, CachePoolFn>; /// Same as above, but for the guard returned by a pool. pub(crate) type CachePoolGuard<'a> = PoolGuard<'a, pikevm::Cache, CachePoolFn>; /// The type of the closure we use to create new caches. We need to spell out /// all of the marker traits or else we risk leaking !MARKER impls. pub(crate) type CachePoolFn = Box<dyn Fn() -> pikevm::Cache + Send + Sync + UnwindSafe + RefUnwindSafe>; /// A thread safe pool utilizing alloc-only features. /// /// Unlike the pool in regex-automata, this has no "fast path." We could add /// it, but it's more code and requires reasoning about safety. pub(crate) struct Pool<T: Send, F> { /// A stack of T values to hand out. These are used when a Pool is /// accessed by a thread that didn't create it. stack: Mutex<Vec<Box<T>>>, /// A function to create more T values when stack is empty and a caller /// has requested a T.
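    /// (Concretely, in this crate `F` is only ever `CachePoolFn`, i.e. a
    /// boxed closure over an `Arc<PikeVM>`; see `Regex::clone` in
    /// src/string.rs, which builds the pool roughly as
    /// `CachePool::new(Box::new(move || Cache::new(&pikevm)))`. Each
    /// `get()` then either pops a previously returned value or builds a
    /// fresh one via this closure.)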
create: F, } // If T is UnwindSafe, then since we provide exclusive access to any // particular value in the pool, it should therefore also be considered // RefUnwindSafe. impl<T: Send + UnwindSafe, F: UnwindSafe> RefUnwindSafe for Pool<T, F> {} impl<T: Send, F> Pool<T, F> { /// Create a new pool. The given closure is used to create values in /// the pool when necessary. pub(crate) const fn new(create: F) -> Pool<T, F> { Pool { stack: Mutex::new(vec![]), create } } } impl<T: Send, F: Fn() -> T> Pool<T, F> { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. pub(crate) fn get(&self) -> PoolGuard<'_, T, F> { let mut stack = self.stack.lock().unwrap(); let value = match stack.pop() { None => Box::new((self.create)()), Some(value) => value, }; PoolGuard { pool: self, value: Some(value) } } /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. fn put_value(&self, value: Box<T>) { let mut stack = self.stack.lock().unwrap(); stack.push(value); } } impl<T: Send + core::fmt::Debug, F> core::fmt::Debug for Pool<T, F> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("Pool").field("stack", &self.stack).finish() } } /// A guard that is returned when a caller requests a value from the pool. pub(crate) struct PoolGuard<'a, T: Send, F: Fn() -> T> { /// The pool that this guard is attached to. pool: &'a Pool<T, F>, /// This is None after the guard has been put back into the pool. value: Option<Box<T>>, } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { fn drop(&mut self) { if let Some(value) = self.value.take() { self.pool.put_value(value); } } } impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { type Target = T; fn deref(&self) -> &T { self.value.as_deref().unwrap() } } impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { fn deref_mut(&mut self) -> &mut T { self.value.as_deref_mut().unwrap() } } impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug for PoolGuard<'a, T, F> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_struct("PoolGuard") .field("pool", &self.pool) .field("value", &self.value) .finish() } } regex-lite-0.1.6/src/string.rs000064400000003220611046102023000143440ustar 00000000000000use alloc::{ borrow::Cow, boxed::Box, string::String, string::ToString, sync::Arc, vec, vec::Vec, }; use crate::{ error::Error, hir::{self, Hir}, int::NonMaxUsize, interpolate, nfa::{self, NFA}, pikevm::{self, Cache, PikeVM}, pool::CachePool, }; /// A compiled regular expression for searching Unicode haystacks. /// /// A `Regex` can be used to search haystacks, split haystacks into substrings /// or replace substrings in a haystack with a different substring. All /// searching is done with an implicit `(?s:.)*?` at the beginning and end of /// a pattern. To force an expression to match the whole string (or a prefix /// or a suffix), you must use an anchor like `^` or `$` (or `\A` and `\z`). /// /// While this crate will handle Unicode strings (whether in the regular /// expression or in the haystack), all positions returned are **byte /// offsets**. Every byte offset is guaranteed to be at a Unicode code point /// boundary. That is, all offsets returned by the `Regex` API are guaranteed /// to be ranges that can slice a `&str` without panicking. /// /// The only methods that allocate new strings are the string replacement /// methods.
All other methods (searching and splitting) return borrowed /// references into the haystack given. /// /// # Example /// /// Find the offsets of a US phone number: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new("[0-9]{3}-[0-9]{3}-[0-9]{4}").unwrap(); /// let m = re.find("phone: 111-222-3333").unwrap(); /// assert_eq!(7..19, m.range()); /// ``` /// /// # Example: extracting capture groups /// /// A common way to use regexes is with capture groups. That is, instead of /// just looking for matches of an entire regex, parentheses are used to create /// groups that represent part of the match. /// /// For example, consider a haystack with multiple lines, and each line has /// three whitespace delimited fields where the second field is expected to be /// a number and the third field a boolean. To make this convenient, we use /// the [`Captures::extract`] API to put the strings that match each group /// into a fixed size array: /// /// ``` /// use regex_lite::Regex; /// /// let hay = " /// rabbit 54 true /// groundhog 2 true /// does not match /// fox 109 false /// "; /// let re = Regex::new(r"(?m)^\s*(\S+)\s+([0-9]+)\s+(true|false)\s*$").unwrap(); /// let mut fields: Vec<(&str, i64, bool)> = vec![]; /// for (_, [f1, f2, f3]) in re.captures_iter(hay).map(|caps| caps.extract()) { /// fields.push((f1, f2.parse()?, f3.parse()?)); /// } /// assert_eq!(fields, vec![ /// ("rabbit", 54, true), /// ("groundhog", 2, true), /// ("fox", 109, false), /// ]); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` pub struct Regex { pikevm: Arc<PikeVM>, pool: CachePool, } impl Clone for Regex { fn clone(&self) -> Regex { let pikevm = Arc::clone(&self.pikevm); let pool = { let pikevm = Arc::clone(&self.pikevm); let create = Box::new(move || Cache::new(&pikevm)); CachePool::new(create) }; Regex { pikevm, pool } } } impl core::fmt::Display for Regex { /// Shows the original regular expression. fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "{}", self.as_str()) } } impl core::fmt::Debug for Regex { /// Shows the original regular expression. fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_tuple("Regex").field(&self.as_str()).finish() } } impl core::str::FromStr for Regex { type Err = Error; /// Attempts to parse a string into a regular expression fn from_str(s: &str) -> Result<Regex, Error> { Regex::new(s) } } impl TryFrom<&str> for Regex { type Error = Error; /// Attempts to parse a string into a regular expression fn try_from(s: &str) -> Result<Regex, Error> { Regex::new(s) } } impl TryFrom<String> for Regex { type Error = Error; /// Attempts to parse a string into a regular expression fn try_from(s: String) -> Result<Regex, Error> { Regex::new(&s) } } /// Core regular expression methods. impl Regex { /// Compiles a regular expression. Once compiled, it can be used repeatedly /// to search, split or replace substrings in a haystack. /// /// Note that regex compilation tends to be a somewhat expensive process, /// and unlike higher level environments, compilation is not automatically /// cached for you. One should endeavor to compile a regex once and then /// reuse it. For example, it's a bad idea to compile the same regex /// repeatedly in a loop. /// /// # Errors /// /// If an invalid pattern is given, then an error is returned. /// An error is also returned if the pattern is valid, but would /// produce a regex that is bigger than the configured size limit via /// [`RegexBuilder::size_limit`]. (A reasonable size limit is enabled by /// default.)
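    ///
    /// For instance, here is a sketch of forcing that second kind of error
    /// via [`RegexBuilder::size_limit`] (the pattern itself is valid):
    ///
    /// ```
    /// use regex_lite::RegexBuilder;
    ///
    /// // A deliberately tiny limit makes this repetition refuse to compile.
    /// assert!(RegexBuilder::new(r"\w{100}").size_limit(100).build().is_err());
    /// ```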
/// /// # Example /// /// ``` /// use regex_lite::Regex; /// /// // An invalid pattern because of an unclosed parenthesis /// assert!(Regex::new(r"foo(bar").is_err()); /// // An invalid pattern because the regex would be too big /// // because Unicode tends to inflate things. /// assert!(Regex::new(r"\w{1000000}").is_err()); /// ``` pub fn new(pattern: &str) -> Result<Regex, Error> { RegexBuilder::new(pattern).build() } /// Returns true if and only if there is a match for the regex anywhere /// in the haystack given. /// /// It is recommended to use this method if all you need to do is test /// whether a match exists, since the underlying matching engine may be /// able to do less work. /// /// # Example /// /// Test if some haystack contains at least one word with exactly 13 /// word characters: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"\b\w{13}\b").unwrap(); /// let hay = "I categorically deny having triskaidekaphobia."; /// assert!(re.is_match(hay)); /// ``` #[inline] pub fn is_match(&self, haystack: &str) -> bool { self.is_match_at(haystack, 0) } /// This routine searches for the first match of this regex in the /// haystack given, and if found, returns a [`Match`]. The `Match` /// provides access to both the byte offsets of the match and the actual /// substring that matched. /// /// Note that this should only be used if you want to find the entire /// match. If instead you just want to test the existence of a match, /// it's potentially faster to use `Regex::is_match(hay)` instead of /// `Regex::find(hay).is_some()`. /// /// # Example /// /// Find the first word with exactly 13 word characters: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"\b\w{13}\b").unwrap(); /// let hay = "I categorically deny having triskaidekaphobia."; /// let mat = re.find(hay).unwrap(); /// assert_eq!(2..15, mat.range()); /// assert_eq!("categorically", mat.as_str()); /// ``` #[inline] pub fn find<'h>(&self, haystack: &'h str) -> Option<Match<'h>> { self.find_at(haystack, 0) } /// Returns an iterator that yields successive non-overlapping matches in /// the given haystack. The iterator yields values of type [`Match`]. /// /// # Time complexity /// /// Note that since `find_iter` runs potentially many searches on the /// haystack and since each search has worst case `O(m * n)` time /// complexity, the overall worst case time complexity for iteration is /// `O(m * n^2)`. /// /// # Example /// /// Find every word with exactly 13 word characters: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"\b\w{13}\b").unwrap(); /// let hay = "Retroactively relinquishing remunerations is reprehensible."; /// let matches: Vec<_> = re.find_iter(hay).map(|m| m.as_str()).collect(); /// assert_eq!(matches, vec![ /// "Retroactively", /// "relinquishing", /// "remunerations", /// "reprehensible", /// ]); /// ``` #[inline] pub fn find_iter<'r, 'h>(&'r self, haystack: &'h str) -> Matches<'r, 'h> { Matches { haystack, it: self.pikevm.find_iter(self.pool.get(), haystack.as_bytes()), } } /// This routine searches for the first match of this regex in the haystack /// given, and if found, returns not only the overall match but also the /// matches of each capture group in the regex. If no match is found, then /// `None` is returned. /// /// Capture group `0` always corresponds to an implicit unnamed group that /// includes the entire match. If a match is found, this group is always /// present.
Subsequent groups may be named and are numbered, starting /// at 1, by the order in which the opening parenthesis appears in the /// pattern. For example, in the pattern `(?<a>.(?<b>.))(?<c>.)`, `a`, /// `b` and `c` correspond to capture group indices `1`, `2` and `3`, /// respectively. /// /// You should only use `captures` if you need access to the capture group /// matches. Otherwise, [`Regex::find`] is generally faster for discovering /// just the overall match. /// /// # Example /// /// Say you have some haystack with movie names and their release years, /// like "'Citizen Kane' (1941)". It'd be nice if we could search for /// substrings looking like that, while also extracting the movie name and /// its release year separately. The example below shows how to do that. /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(hay).unwrap(); /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); /// assert_eq!(caps.get(1).unwrap().as_str(), "Citizen Kane"); /// assert_eq!(caps.get(2).unwrap().as_str(), "1941"); /// // You can also access the groups by index using the Index notation. /// // Note that this will panic on an invalid index. In this case, these /// // accesses are always correct because the overall regex will only /// // match when these capture groups match. /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); /// assert_eq!(&caps[1], "Citizen Kane"); /// assert_eq!(&caps[2], "1941"); /// ``` /// /// Note that the full match is at capture group `0`. Each subsequent /// capture group is indexed by the order of its opening `(`. /// /// We can make this example a bit clearer by using *named* capture groups: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>\d{4})\)").unwrap(); /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let caps = re.captures(hay).unwrap(); /// assert_eq!(caps.get(0).unwrap().as_str(), "'Citizen Kane' (1941)"); /// assert_eq!(caps.name("title").unwrap().as_str(), "Citizen Kane"); /// assert_eq!(caps.name("year").unwrap().as_str(), "1941"); /// // You can also access the groups by name using the Index notation. /// // Note that this will panic on an invalid group name. In this case, /// // these accesses are always correct because the overall regex will /// // only match when these capture groups match. /// assert_eq!(&caps[0], "'Citizen Kane' (1941)"); /// assert_eq!(&caps["title"], "Citizen Kane"); /// assert_eq!(&caps["year"], "1941"); /// ``` /// /// Here we name the capture groups, which we can access with the `name` /// method or the `Index` notation with a `&str`. Note that the named /// capture groups are still accessible with `get` or the `Index` notation /// with a `usize`. /// /// The `0`th capture group is always unnamed, so it must always be /// accessed with `get(0)` or `[0]`. /// /// Finally, one other way to get the matched substrings is with the /// [`Captures::extract`] API: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)").unwrap(); /// let hay = "Not my favorite movie: 'Citizen Kane' (1941)."; /// let (full, [title, year]) = re.captures(hay).unwrap().extract(); /// assert_eq!(full, "'Citizen Kane' (1941)"); /// assert_eq!(title, "Citizen Kane"); /// assert_eq!(year, "1941"); /// ``` #[inline] pub fn captures<'h>(&self, haystack: &'h str) -> Option<Captures<'h>> { self.captures_at(haystack, 0) } /// Returns an iterator that yields successive non-overlapping matches in /// the given haystack. The iterator yields values of type [`Captures`].
/// /// This is the same as [`Regex::find_iter`], but instead of only providing /// access to the overall match, each value yielded includes access to the /// matches of all capture groups in the regex. Reporting this extra match /// data is potentially costly, so callers should only use `captures_iter` /// over `find_iter` when they actually need access to the capture group /// matches. /// /// # Time complexity /// /// Note that since `captures_iter` runs potentially many searches on the /// haystack and since each search has worst case `O(m * n)` time /// complexity, the overall worst case time complexity for iteration is /// `O(m * n^2)`. /// /// # Example /// /// We can use this to find all movie titles and their release years in /// some haystack, where the movie is formatted like "'Title' (xxxx)": /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"'([^']+)'\s+\(([0-9]{4})\)").unwrap(); /// let hay = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; /// let mut movies = vec![]; /// for (_, [title, year]) in re.captures_iter(hay).map(|c| c.extract()) { /// movies.push((title, year.parse::<i64>()?)); /// } /// assert_eq!(movies, vec![ /// ("Citizen Kane", 1941), /// ("The Wizard of Oz", 1939), /// ("M", 1931), /// ]); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// Or with named groups: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"'(?<title>[^']+)'\s+\((?<year>[0-9]{4})\)").unwrap(); /// let hay = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; /// let mut it = re.captures_iter(hay); /// /// let caps = it.next().unwrap(); /// assert_eq!(&caps["title"], "Citizen Kane"); /// assert_eq!(&caps["year"], "1941"); /// /// let caps = it.next().unwrap(); /// assert_eq!(&caps["title"], "The Wizard of Oz"); /// assert_eq!(&caps["year"], "1939"); /// /// let caps = it.next().unwrap(); /// assert_eq!(&caps["title"], "M"); /// assert_eq!(&caps["year"], "1931"); /// ``` #[inline] pub fn captures_iter<'r, 'h>( &'r self, haystack: &'h str, ) -> CaptureMatches<'r, 'h> { CaptureMatches { haystack, re: self, it: self .pikevm .captures_iter(self.pool.get(), haystack.as_bytes()), } } /// Returns an iterator of substrings of the haystack given, delimited by a /// match of the regex. Namely, each element of the iterator corresponds to /// a part of the haystack that *isn't* matched by the regular expression. /// /// # Time complexity /// /// Since iterators over all matches require running potentially many /// searches on the haystack, and since each search has worst case /// `O(m * n)` time complexity, the overall worst case time complexity for /// this routine is `O(m * n^2)`.
/// /// # Example /// /// To split a string delimited by arbitrary amounts of spaces or tabs: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"[ \t]+").unwrap(); /// let hay = "a b \t c\td e"; /// let fields: Vec<&str> = re.split(hay).collect(); /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// ``` /// /// # Example: more cases /// /// Basic usage: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r" ").unwrap(); /// let hay = "Mary had a little lamb"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]); /// /// let re = Regex::new(r"X").unwrap(); /// let hay = ""; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec![""]); /// /// let re = Regex::new(r"X").unwrap(); /// let hay = "lionXXtigerXleopard"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]); /// /// let re = Regex::new(r"::").unwrap(); /// let hay = "lion::tiger::leopard"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["lion", "tiger", "leopard"]); /// ``` /// /// If a haystack contains multiple contiguous matches, you will end up /// with empty spans yielded by the iterator: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"X").unwrap(); /// let hay = "XXXXaXXbXc"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); /// /// let re = Regex::new(r"/").unwrap(); /// let hay = "(///)"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["(", "", "", ")"]); /// ``` /// /// Separators at the start or end of a haystack are neighbored by empty /// substrings. /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"0").unwrap(); /// let hay = "010"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["", "1", ""]); /// ``` /// /// When the empty string is used as a regex, it splits at every valid /// UTF-8 boundary by default (which includes the beginning and end of the /// haystack): /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"").unwrap(); /// let hay = "rust"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]); /// /// // Splitting by an empty string is UTF-8 aware by default! /// let re = Regex::new(r"").unwrap(); /// let hay = "☃"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["", "☃", ""]); /// ``` /// /// Contiguous separators (which commonly show up with whitespace) can lead /// to possibly surprising behavior. For example, this code is correct: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r" ").unwrap(); /// let hay = " a b c"; /// let got: Vec<&str> = re.split(hay).collect(); /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); /// ``` /// /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want /// to match contiguous space characters: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r" +").unwrap(); /// let hay = " a b c"; /// let got: Vec<&str> = re.split(hay).collect(); /// // N.B. This does still include a leading empty span because ' +' /// // matches at the beginning of the haystack.
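    /// // (If that leading artifact is unwanted, one hedged workaround is
    /// // to trim leading separators first, e.g. `re.split(hay.trim_start())`.)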
/// assert_eq!(got, vec!["", "a", "b", "c"]); /// ``` #[inline] pub fn split<'r, 'h>(&'r self, haystack: &'h str) -> Split<'r, 'h> { Split { haystack, finder: self.find_iter(haystack), last: 0 } } /// Returns an iterator of at most `limit` substrings of the haystack /// given, delimited by a match of the regex. (A `limit` of `0` will return /// no substrings.) Namely, each element of the iterator corresponds to a /// part of the haystack that *isn't* matched by the regular expression. /// The remainder of the haystack that is not split will be the last /// element in the iterator. /// /// # Time complexity /// /// Since iterators over all matches require running potentially many /// searches on the haystack, and since each search has worst case /// `O(m * n)` time complexity, the overall worst case time complexity for /// this routine is `O(m * n^2)`. /// /// Although note that the worst case time here has an upper bound given /// by the `limit` parameter. /// /// # Example /// /// Get the first two words in some haystack: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"\W+").unwrap(); /// let hay = "Hey! How are you?"; /// let fields: Vec<&str> = re.splitn(hay, 3).collect(); /// assert_eq!(fields, vec!["Hey", "How", "are you?"]); /// ``` /// /// # Examples: more cases /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r" ").unwrap(); /// let hay = "Mary had a little lamb"; /// let got: Vec<&str> = re.splitn(hay, 3).collect(); /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]); /// /// let re = Regex::new(r"X").unwrap(); /// let hay = ""; /// let got: Vec<&str> = re.splitn(hay, 3).collect(); /// assert_eq!(got, vec![""]); /// /// let re = Regex::new(r"X").unwrap(); /// let hay = "lionXXtigerXleopard"; /// let got: Vec<&str> = re.splitn(hay, 3).collect(); /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]); /// /// let re = Regex::new(r"::").unwrap(); /// let hay = "lion::tiger::leopard"; /// let got: Vec<&str> = re.splitn(hay, 2).collect(); /// assert_eq!(got, vec!["lion", "tiger::leopard"]); /// /// let re = Regex::new(r"X").unwrap(); /// let hay = "abcXdef"; /// let got: Vec<&str> = re.splitn(hay, 1).collect(); /// assert_eq!(got, vec!["abcXdef"]); /// /// let re = Regex::new(r"X").unwrap(); /// let hay = "abcdef"; /// let got: Vec<&str> = re.splitn(hay, 2).collect(); /// assert_eq!(got, vec!["abcdef"]); /// /// let re = Regex::new(r"X").unwrap(); /// let hay = "abcXdef"; /// let got: Vec<&str> = re.splitn(hay, 0).collect(); /// assert!(got.is_empty()); /// ``` #[inline] pub fn splitn<'r, 'h>( &'r self, haystack: &'h str, limit: usize, ) -> SplitN<'r, 'h> { SplitN { splits: self.split(haystack), limit } } /// Replaces the leftmost-first match in the given haystack with the /// replacement provided. The replacement can be a regular string (where /// `$N` and `$name` are expanded to match capture groups) or a function /// that takes a [`Captures`] and returns the replaced string. /// /// If no match is found, then the haystack is returned unchanged. In that /// case, this implementation will likely return a `Cow::Borrowed` value /// such that no allocation is performed. /// /// # Replacement string syntax /// /// All instances of `$ref` in the replacement string are replaced with /// the substring corresponding to the capture group identified by `ref`.
/// /// `ref` may be an integer corresponding to the index of the capture group /// (counted by order of opening parenthesis where `0` is the entire match) /// or it can be a name (consisting of letters, digits or underscores) /// corresponding to a named capture group. /// /// If `ref` isn't a valid capture group (whether the name doesn't exist or /// isn't a valid index), then it is replaced with the empty string. /// /// The longest possible name is used. For example, `$1a` looks up the /// capture group named `1a` and not the capture group at index `1`. To /// exert more precise control over the name, use braces, e.g., `${1}a`. /// /// To write a literal `$` use `$$`. /// /// # Example /// /// Note that this function is polymorphic with respect to the replacement. /// In typical usage, this can just be a normal string: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"[^01]+").unwrap(); /// assert_eq!(re.replace("1078910", ""), "1010"); /// ``` /// /// But anything satisfying the [`Replacer`] trait will work. For example, /// a closure of type `|&Captures| -> String` provides direct access to the /// captures corresponding to a match. This allows one to access capturing /// group matches easily: /// /// ``` /// use regex_lite::{Captures, Regex}; /// /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { /// format!("{} {}", &caps[2], &caps[1]) /// }); /// assert_eq!(result, "Bruce Springsteen"); /// ``` /// /// But this is a bit cumbersome to use all the time. Instead, a simple /// syntax is supported (as described above) that expands `$name` into the /// corresponding capture group. Here's the last example, but using this /// expansion technique with named capture groups: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", "$first $last"); /// assert_eq!(result, "Bruce Springsteen"); /// ``` /// /// Note that using `$2` instead of `$first` or `$1` instead of `$last` /// would produce the same result. To write a literal `$` use `$$`. /// /// Sometimes the replacement string requires use of curly braces to /// delineate a capture group replacement when it is adjacent to some other /// literal text. For example, if we wanted to join two words together with /// an underscore: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"(?<first>\w+)\s+(?<second>\w+)").unwrap(); /// let result = re.replace("deep fried", "${first}_$second"); /// assert_eq!(result, "deep_fried"); /// ``` /// /// Without the curly braces, the capture group name `first_` would be /// used, and since it doesn't exist, it would be replaced with the empty /// string. /// /// Finally, sometimes you just want to replace a literal string with no /// regard for capturing group expansion. This can be done by wrapping a /// string with [`NoExpand`]: /// /// ``` /// use regex_lite::{NoExpand, Regex}; /// /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); /// assert_eq!(result, "$2 $last"); /// ``` /// /// Using `NoExpand` may also be faster, since the replacement string won't /// need to be parsed for the `$` syntax. 
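    ///
    /// # Example: detecting whether a replacement happened
    ///
    /// Since `replace` returns a `Cow`, a caller can usually tell whether a
    /// replacement actually occurred. This is a minimal sketch that leans on
    /// the behavior described above, where an unchanged haystack is returned
    /// as a borrowed value:
    ///
    /// ```
    /// use std::borrow::Cow;
    ///
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"[0-9]+").unwrap();
    /// // A match produces a new, owned string...
    /// assert!(matches!(re.replace("version 2", "N"), Cow::Owned(_)));
    /// // ...while no match leaves the haystack borrowed and untouched.
    /// assert!(matches!(re.replace("no digits here", "N"), Cow::Borrowed(_)));
    /// ```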
    #[inline]
    pub fn replace<'h, R: Replacer>(
        &self,
        haystack: &'h str,
        rep: R,
    ) -> Cow<'h, str> {
        self.replacen(haystack, 1, rep)
    }

    /// Replaces all non-overlapping matches in the haystack with the
    /// replacement provided. This is the same as calling `replacen` with
    /// `limit` set to `0`.
    ///
    /// The documentation for [`Regex::replace`] goes into more detail about
    /// what kinds of replacement strings are supported.
    ///
    /// # Time complexity
    ///
    /// Since iterators over all matches require running potentially many
    /// searches on the haystack, and since each search has worst case
    /// `O(m * n)` time complexity, the overall worst case time complexity for
    /// this routine is `O(m * n^2)`.
    ///
    /// # Fallibility
    ///
    /// If you need to write a replacement routine where any individual
    /// replacement might "fail," doing so with this API isn't really feasible
    /// because there's no way to stop the search process if a replacement
    /// fails. Instead, if you need this functionality, you should consider
    /// implementing your own replacement routine:
    ///
    /// ```
    /// use regex_lite::{Captures, Regex};
    ///
    /// fn replace_all<E>(
    ///     re: &Regex,
    ///     haystack: &str,
    ///     replacement: impl Fn(&Captures) -> Result<String, E>,
    /// ) -> Result<String, E> {
    ///     let mut new = String::with_capacity(haystack.len());
    ///     let mut last_match = 0;
    ///     for caps in re.captures_iter(haystack) {
    ///         let m = caps.get(0).unwrap();
    ///         new.push_str(&haystack[last_match..m.start()]);
    ///         new.push_str(&replacement(&caps)?);
    ///         last_match = m.end();
    ///     }
    ///     new.push_str(&haystack[last_match..]);
    ///     Ok(new)
    /// }
    ///
    /// // Let's replace each word with the number of bytes in that word.
    /// // But if we see a word that is "too long," we'll give up.
    /// let re = Regex::new(r"\w+").unwrap();
    /// let replacement = |caps: &Captures| -> Result<String, &'static str> {
    ///     if caps[0].len() >= 5 {
    ///         return Err("word too long");
    ///     }
    ///     Ok(caps[0].len().to_string())
    /// };
    /// assert_eq!(
    ///     Ok("2 3 3 3?".to_string()),
    ///     replace_all(&re, "hi how are you?", &replacement),
    /// );
    /// assert!(replace_all(&re, "hi there", &replacement).is_err());
    /// ```
    ///
    /// # Example
    ///
    /// This example shows how to flip the order of whitespace delimited
    /// fields, and normalize the whitespace that delimits the fields:
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"(?m)^(\S+)\s+(\S+)$").unwrap();
    /// let hay = "
    /// Greetings 1973
    /// Wild\t1973
    /// BornToRun\t\t\t\t1975
    /// Darkness 1978
    /// TheRiver 1980
    /// ";
    /// let new = re.replace_all(hay, "$2 $1");
    /// assert_eq!(new, "
    /// 1973 Greetings
    /// 1973 Wild
    /// 1975 BornToRun
    /// 1978 Darkness
    /// 1980 TheRiver
    /// ");
    /// ```
    #[inline]
    pub fn replace_all<'h, R: Replacer>(
        &self,
        haystack: &'h str,
        rep: R,
    ) -> Cow<'h, str> {
        self.replacen(haystack, 0, rep)
    }

    /// Replaces at most `limit` non-overlapping matches in the haystack with
    /// the replacement provided. If `limit` is `0`, then all non-overlapping
    /// matches are replaced. That is, `Regex::replace_all(hay, rep)` is
    /// equivalent to `Regex::replacen(hay, 0, rep)`.
    ///
    /// The documentation for [`Regex::replace`] goes into more detail about
    /// what kinds of replacement strings are supported.
    ///
    /// # Time complexity
    ///
    /// Since iterators over all matches require running potentially many
    /// searches on the haystack, and since each search has worst case
    /// `O(m * n)` time complexity, the overall worst case time complexity for
    /// this routine is `O(m * n^2)`.
    ///
    /// Although note that the worst case time here has an upper bound given
    /// by the `limit` parameter.
    ///
    /// # Fallibility
    ///
    /// See the corresponding section in the docs for [`Regex::replace_all`]
    /// for tips on how to deal with a replacement routine that can fail.
    ///
    /// # Example
    ///
    /// This example shows how to flip the order of whitespace delimited
    /// fields, and normalize the whitespace that delimits the fields. But we
    /// only do it for the first two matches.
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"(?m)^(\S+)\s+(\S+)$").unwrap();
    /// let hay = "
    /// Greetings 1973
    /// Wild\t1973
    /// BornToRun\t\t\t\t1975
    /// Darkness 1978
    /// TheRiver 1980
    /// ";
    /// let new = re.replacen(hay, 2, "$2 $1");
    /// assert_eq!(new, "
    /// 1973 Greetings
    /// 1973 Wild
    /// BornToRun\t\t\t\t1975
    /// Darkness 1978
    /// TheRiver 1980
    /// ");
    /// ```
    #[inline]
    pub fn replacen<'h, R: Replacer>(
        &self,
        haystack: &'h str,
        limit: usize,
        mut rep: R,
    ) -> Cow<'h, str> {
        // If we know that the replacement doesn't have any capture
        // expansions, then we can use the fast path. The fast path can make
        // a tremendous difference:
        //
        // 1) We use `find_iter` instead of `captures_iter`. Not asking for
        //    captures generally makes the regex engines faster.
        // 2) We don't need to look up all of the capture groups and do
        //    replacements inside the replacement string. We just push it
        //    at each match and be done with it.
        if let Some(rep) = rep.no_expansion() {
            let mut it = self.find_iter(haystack).enumerate().peekable();
            if it.peek().is_none() {
                return Cow::Borrowed(haystack);
            }
            let mut new = String::with_capacity(haystack.len());
            let mut last_match = 0;
            for (i, m) in it {
                new.push_str(&haystack[last_match..m.start()]);
                new.push_str(&rep);
                last_match = m.end();
                if limit > 0 && i >= limit - 1 {
                    break;
                }
            }
            new.push_str(&haystack[last_match..]);
            return Cow::Owned(new);
        }

        // The slower path, which we use if the replacement needs access to
        // capture groups.
        let mut it = self.captures_iter(haystack).enumerate().peekable();
        if it.peek().is_none() {
            return Cow::Borrowed(haystack);
        }
        let mut new = String::with_capacity(haystack.len());
        let mut last_match = 0;
        for (i, cap) in it {
            // unwrap on 0 is OK because captures only reports matches
            let m = cap.get(0).unwrap();
            new.push_str(&haystack[last_match..m.start()]);
            rep.replace_append(&cap, &mut new);
            last_match = m.end();
            if limit > 0 && i >= limit - 1 {
                break;
            }
        }
        new.push_str(&haystack[last_match..]);
        Cow::Owned(new)
    }
}

/// A group of advanced or "lower level" search methods. Some methods permit
/// starting the search at a position greater than `0` in the haystack. Other
/// methods permit reusing allocations, for example, when extracting the
/// matches for capture groups.
impl Regex {
    /// Returns the end byte offset of the first match in the haystack given.
    ///
    /// This method may have the same performance characteristics as
    /// `is_match`. Behaviorally, it doesn't just report whether a match
    /// occurs, but also the end offset for a match. In particular, the offset
    /// returned *may be shorter* than the proper end of the leftmost-first
    /// match that you would find via [`Regex::find`].
    ///
    /// Note that it is not guaranteed that this routine finds the shortest or
    /// "earliest" possible match. Instead, the main idea of this API is that
    /// it returns the offset at the point at which the internal regex engine
    /// has determined that a match has occurred.
This may vary depending on /// which internal regex engine is used, and thus, the offset itself may /// change based on internal heuristics. /// /// # Example /// /// Typically, `a+` would match the entire first sequence of `a` in some /// haystack, but `shortest_match` *may* give up as soon as it sees the /// first `a`. /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"a+").unwrap(); /// let offset = re.shortest_match("aaaaa").unwrap(); /// assert_eq!(offset, 1); /// ``` #[inline] pub fn shortest_match(&self, haystack: &str) -> Option<usize> { self.shortest_match_at(haystack, 0) } /// Returns the same as [`Regex::shortest_match`], but starts the search at /// the given offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only match /// when `start == 0`. /// /// If a match is found, the offset returned is relative to the beginning /// of the haystack, not the beginning of the search. /// /// # Panics /// /// This panics when `start >= haystack.len() + 1`. /// /// # Example /// /// This example shows the significance of `start` by demonstrating how it /// can be used to permit look-around assertions in a regex to take the /// surrounding context into account. /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"\bchew\b").unwrap(); /// let hay = "eschew"; /// // We get a match here, but it's probably not intended. /// assert_eq!(re.shortest_match(&hay[2..]), Some(4)); /// // No match because the assertions take the context into account. /// assert_eq!(re.shortest_match_at(hay, 2), None); /// ``` #[inline] pub fn shortest_match_at( &self, haystack: &str, start: usize, ) -> Option<usize> { let mut cache = self.pool.get(); let mut slots = [None, None]; let matched = self.pikevm.search( &mut cache, haystack.as_bytes(), start, haystack.len(), true, &mut slots, ); if !matched { return None; } Some(slots[1].unwrap().get()) } /// Returns the same as [`Regex::is_match`], but starts the search at the /// given offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. /// /// # Panics /// /// This panics when `start >= haystack.len() + 1`. /// /// # Example /// /// This example shows the significance of `start` by demonstrating how it /// can be used to permit look-around assertions in a regex to take the /// surrounding context into account. /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"\bchew\b").unwrap(); /// let hay = "eschew"; /// // We get a match here, but it's probably not intended. /// assert!(re.is_match(&hay[2..])); /// // No match because the assertions take the context into account. /// assert!(!re.is_match_at(hay, 2)); /// ``` #[inline] pub fn is_match_at(&self, haystack: &str, start: usize) -> bool { let mut cache = self.pool.get(); self.pikevm.search( &mut cache, haystack.as_bytes(), start, haystack.len(), true, &mut [], ) } /// Returns the same as [`Regex::find`], but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. /// /// # Panics /// /// This panics when `start >= haystack.len() + 1`. 
    ///
    /// # Example
    ///
    /// This example shows the significance of `start` by demonstrating how it
    /// can be used to permit look-around assertions in a regex to take the
    /// surrounding context into account.
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"\bchew\b").unwrap();
    /// let hay = "eschew";
    /// // We get a match here, but it's probably not intended.
    /// assert_eq!(re.find(&hay[2..]).map(|m| m.range()), Some(0..4));
    /// // No match because the assertions take the context into account.
    /// assert_eq!(re.find_at(hay, 2), None);
    /// ```
    #[inline]
    pub fn find_at<'h>(
        &self,
        haystack: &'h str,
        start: usize,
    ) -> Option<Match<'h>> {
        let mut cache = self.pool.get();
        let mut slots = [None, None];
        let matched = self.pikevm.search(
            &mut cache,
            haystack.as_bytes(),
            start,
            haystack.len(),
            false,
            &mut slots,
        );
        if !matched {
            return None;
        }
        let (start, end) = (slots[0].unwrap().get(), slots[1].unwrap().get());
        Some(Match::new(haystack, start, end))
    }

    /// Returns the same as [`Regex::captures`], but starts the search at the
    /// given offset.
    ///
    /// The significance of the starting point is that it takes the
    /// surrounding context into consideration. For example, the `\A` anchor
    /// can only match when `start == 0`.
    ///
    /// # Panics
    ///
    /// This panics when `start >= haystack.len() + 1`.
    ///
    /// # Example
    ///
    /// This example shows the significance of `start` by demonstrating how it
    /// can be used to permit look-around assertions in a regex to take the
    /// surrounding context into account.
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"\bchew\b").unwrap();
    /// let hay = "eschew";
    /// // We get a match here, but it's probably not intended.
    /// assert_eq!(&re.captures(&hay[2..]).unwrap()[0], "chew");
    /// // No match because the assertions take the context into account.
    /// assert!(re.captures_at(hay, 2).is_none());
    /// ```
    #[inline]
    pub fn captures_at<'h>(
        &self,
        haystack: &'h str,
        start: usize,
    ) -> Option<Captures<'h>> {
        let mut caps = Captures {
            haystack,
            slots: self.capture_locations(),
            pikevm: Arc::clone(&self.pikevm),
        };
        let mut cache = self.pool.get();
        let matched = self.pikevm.search(
            &mut cache,
            haystack.as_bytes(),
            start,
            haystack.len(),
            false,
            &mut caps.slots.0,
        );
        if !matched {
            return None;
        }
        Some(caps)
    }

    /// This is like [`Regex::captures`], but writes the byte offsets of each
    /// capture group match into the locations given.
    ///
    /// A [`CaptureLocations`] stores the same byte offsets as a [`Captures`],
    /// but does *not* store a reference to the haystack. This makes its API
    /// a bit lower level and less convenient. But in exchange, callers
    /// may allocate their own `CaptureLocations` and reuse it for multiple
    /// searches. This may be helpful if allocating a `Captures` shows up in a
    /// profile as too costly.
    ///
    /// To create a `CaptureLocations` value, use the
    /// [`Regex::capture_locations`] method.
    ///
    /// This also returns the overall match if one was found. When a match is
    /// found, its offsets are also always stored in `locs` at index `0`.
    ///
    /// # Panics
    ///
    /// This routine may panic if the given `CaptureLocations` was not created
    /// by this regex.
/// /// # Example /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"^([a-z]+)=(\S*)$").unwrap(); /// let mut locs = re.capture_locations(); /// assert!(re.captures_read(&mut locs, "id=foo123").is_some()); /// assert_eq!(Some((0, 9)), locs.get(0)); /// assert_eq!(Some((0, 2)), locs.get(1)); /// assert_eq!(Some((3, 9)), locs.get(2)); /// ``` #[inline] pub fn captures_read<'h>( &self, locs: &mut CaptureLocations, haystack: &'h str, ) -> Option<Match<'h>> { self.captures_read_at(locs, haystack, 0) } /// Returns the same as [`Regex::captures_read`], but starts the search at /// the given offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, the `\A` anchor can only /// match when `start == 0`. /// /// # Panics /// /// This panics when `start >= haystack.len() + 1`. /// /// This routine may also panic if the given `CaptureLocations` was not /// created by this regex. /// /// # Example /// /// This example shows the significance of `start` by demonstrating how it /// can be used to permit look-around assertions in a regex to take the /// surrounding context into account. /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"\bchew\b").unwrap(); /// let hay = "eschew"; /// let mut locs = re.capture_locations(); /// // We get a match here, but it's probably not intended. /// assert!(re.captures_read(&mut locs, &hay[2..]).is_some()); /// // No match because the assertions take the context into account. /// assert!(re.captures_read_at(&mut locs, hay, 2).is_none()); /// ``` #[inline] pub fn captures_read_at<'h>( &self, locs: &mut CaptureLocations, haystack: &'h str, start: usize, ) -> Option<Match<'h>> { let mut cache = self.pool.get(); let matched = self.pikevm.search( &mut cache, haystack.as_bytes(), start, haystack.len(), false, &mut locs.0, ); if !matched { return None; } let (start, end) = locs.get(0).unwrap(); Some(Match::new(haystack, start, end)) } } /// Auxiliary methods. impl Regex { /// Returns the original string of this regex. /// /// # Example /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"foo\w+bar").unwrap(); /// assert_eq!(re.as_str(), r"foo\w+bar"); /// ``` #[inline] pub fn as_str(&self) -> &str { &self.pikevm.nfa().pattern() } /// Returns an iterator over the capture names in this regex. /// /// The iterator returned yields elements of type `Option<&str>`. That is, /// the iterator yields values for all capture groups, even ones that are /// unnamed. The order of the groups corresponds to the order of the group's /// corresponding opening parenthesis. /// /// The first element of the iterator always yields the group corresponding /// to the overall match, and this group is always unnamed. Therefore, the /// iterator always yields at least one group. /// /// # Example /// /// This shows basic usage with a mix of named and unnamed capture groups: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap(); /// let mut names = re.capture_names(); /// assert_eq!(names.next(), Some(None)); /// assert_eq!(names.next(), Some(Some("a"))); /// assert_eq!(names.next(), Some(Some("b"))); /// assert_eq!(names.next(), Some(None)); /// // the '(?:.)' group is non-capturing and so doesn't appear here! 
/// assert_eq!(names.next(), Some(Some("c"))); /// assert_eq!(names.next(), None); /// ``` /// /// The iterator always yields at least one element, even for regexes with /// no capture groups and even for regexes that can never match: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"").unwrap(); /// let mut names = re.capture_names(); /// assert_eq!(names.next(), Some(None)); /// assert_eq!(names.next(), None); /// /// let re = Regex::new(r"[^\s\S]").unwrap(); /// let mut names = re.capture_names(); /// assert_eq!(names.next(), Some(None)); /// assert_eq!(names.next(), None); /// ``` #[inline] pub fn capture_names(&self) -> CaptureNames<'_> { CaptureNames(self.pikevm.nfa().capture_names()) } /// Returns the number of captures groups in this regex. /// /// This includes all named and unnamed groups, including the implicit /// unnamed group that is always present and corresponds to the entire /// match. /// /// Since the implicit unnamed group is always included in this length, the /// length returned is guaranteed to be greater than zero. /// /// # Example /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"foo").unwrap(); /// assert_eq!(1, re.captures_len()); /// /// let re = Regex::new(r"(foo)").unwrap(); /// assert_eq!(2, re.captures_len()); /// /// let re = Regex::new(r"(?<a>.(?<b>.))(.)(?:.)(?<c>.)").unwrap(); /// assert_eq!(5, re.captures_len()); /// /// let re = Regex::new(r"[^\s\S]").unwrap(); /// assert_eq!(1, re.captures_len()); /// ``` #[inline] pub fn captures_len(&self) -> usize { self.pikevm.nfa().group_len() } /// Returns the total number of capturing groups that appear in every /// possible match. /// /// If the number of capture groups can vary depending on the match, then /// this returns `None`. That is, a value is only returned when the number /// of matching groups is invariant or "static." /// /// Note that like [`Regex::captures_len`], this **does** include the /// implicit capturing group corresponding to the entire match. Therefore, /// when a non-None value is returned, it is guaranteed to be at least `1`. /// Stated differently, a return value of `Some(0)` is impossible. /// /// # Example /// /// This shows a few cases where a static number of capture groups is /// available and a few cases where it is not. /// /// ``` /// use regex_lite::Regex; /// /// let len = |pattern| { /// Regex::new(pattern).map(|re| re.static_captures_len()) /// }; /// /// assert_eq!(Some(1), len("a")?); /// assert_eq!(Some(2), len("(a)")?); /// assert_eq!(Some(2), len("(a)|(b)")?); /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?); /// assert_eq!(None, len("(a)|b")?); /// assert_eq!(None, len("a|(b)")?); /// assert_eq!(None, len("(b)*")?); /// assert_eq!(Some(2), len("(b)+")?); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] pub fn static_captures_len(&self) -> Option<usize> { self.pikevm .nfa() .static_explicit_captures_len() .map(|len| len.saturating_add(1)) } /// Returns a fresh allocated set of capture locations that can /// be reused in multiple calls to [`Regex::captures_read`] or /// [`Regex::captures_read_at`]. /// /// The returned locations can be used for any subsequent search for this /// particular regex. There is no guarantee that it is correct to use for /// other regexes, even if they have the same number of capture groups. 
    ///
    /// # Example
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"(.)(.)(\w+)").unwrap();
    /// let mut locs = re.capture_locations();
    /// assert!(re.captures_read(&mut locs, "Padron").is_some());
    /// assert_eq!(locs.get(0), Some((0, 6)));
    /// assert_eq!(locs.get(1), Some((0, 1)));
    /// assert_eq!(locs.get(2), Some((1, 2)));
    /// assert_eq!(locs.get(3), Some((2, 6)));
    /// ```
    #[inline]
    pub fn capture_locations(&self) -> CaptureLocations {
        // OK because NFA construction would have failed if this overflowed.
        let len = self.pikevm.nfa().group_len().checked_mul(2).unwrap();
        CaptureLocations(vec![None; len])
    }
}

/// Represents a single match of a regex in a haystack.
///
/// A `Match` contains both the start and end byte offsets of the match and
/// the actual substring corresponding to the range of those byte offsets. It
/// is guaranteed that `start <= end`. When `start == end`, the match is
/// empty.
///
/// Since this `Match` can only be produced by the top-level `Regex` APIs
/// that only support searching UTF-8 encoded strings, the byte offsets for a
/// `Match` are guaranteed to fall on valid UTF-8 codepoint boundaries. That
/// is, slicing a `&str` with [`Match::range`] is guaranteed to never panic.
///
/// Values with this type are created by [`Regex::find`] or
/// [`Regex::find_iter`]. Other APIs can create `Match` values too. For
/// example, [`Captures::get`].
///
/// The lifetime parameter `'h` refers to the lifetime of the haystack that
/// this match was produced from.
///
/// # Numbering
///
/// The byte offsets in a `Match` form a half-open interval. That is, the
/// start of the range is inclusive and the end of the range is exclusive.
/// For example, given a haystack `abcFOOxyz` and a match of `FOO`, its byte
/// offset range starts at `3` and ends at `6`. `3` corresponds to `F` and
/// `6` corresponds to `x`, which is one past the end of the match. This
/// corresponds to the same kind of slicing that Rust uses.
///
/// For more on why this was chosen over other schemes (aside from being
/// consistent with how Rust the language works), see [this discussion] and
/// [Dijkstra's note on a related topic][note].
///
/// [this discussion]: https://github.com/rust-lang/regex/discussions/866
/// [note]: https://www.cs.utexas.edu/users/EWD/transcriptions/EWD08xx/EWD831.html
///
/// # Example
///
/// This example shows the value of each of the methods on `Match` for a
/// particular search.
///
/// ```
/// use regex_lite::Regex;
///
/// let re = Regex::new(r"\d+").unwrap();
/// let hay = "numbers: 1234";
/// let m = re.find(hay).unwrap();
/// assert_eq!(9, m.start());
/// assert_eq!(13, m.end());
/// assert!(!m.is_empty());
/// assert_eq!(4, m.len());
/// assert_eq!(9..13, m.range());
/// assert_eq!("1234", m.as_str());
/// ```
#[derive(Copy, Clone, Eq, PartialEq)]
pub struct Match<'h> {
    haystack: &'h str,
    start: usize,
    end: usize,
}

impl<'h> Match<'h> {
    /// Creates a new match from the given haystack and byte offsets.
    #[inline]
    fn new(haystack: &'h str, start: usize, end: usize) -> Match<'h> {
        Match { haystack, start, end }
    }

    /// Returns the byte offset of the start of the match in the haystack.
    /// The start of the match corresponds to the position where the match
    /// begins and includes the first byte in the match.
    ///
    /// It is guaranteed that `Match::start() <= Match::end()`.
    ///
    /// This is guaranteed to fall on a valid UTF-8 codepoint boundary.
    /// That is, it will never be an offset that appears between the UTF-8
    /// code units of a UTF-8 encoded Unicode scalar value. Consequently, it
    /// is always safe to slice the corresponding haystack using this offset.
    #[inline]
    pub fn start(&self) -> usize {
        self.start
    }

    /// Returns the byte offset of the end of the match in the haystack. The
    /// end of the match corresponds to the byte immediately following the
    /// last byte in the match. This means that `&slice[start..end]` works as
    /// one would expect.
    ///
    /// It is guaranteed that `Match::start() <= Match::end()`.
    ///
    /// This is guaranteed to fall on a valid UTF-8 codepoint boundary. That
    /// is, it will never be an offset that appears between the UTF-8 code
    /// units of a UTF-8 encoded Unicode scalar value. Consequently, it is
    /// always safe to slice the corresponding haystack using this offset.
    #[inline]
    pub fn end(&self) -> usize {
        self.end
    }

    /// Returns true if and only if this match has a length of zero.
    ///
    /// Note that an empty match can only occur when the regex itself can
    /// match the empty string. Here are some examples of regexes that can
    /// all match the empty string: `^`, `^$`, `\b`, `a?`, `a*`, `a{0}`,
    /// `(foo|\d+|quux)?`.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.start == self.end
    }

    /// Returns the length, in bytes, of this match.
    #[inline]
    pub fn len(&self) -> usize {
        self.end - self.start
    }

    /// Returns the range over the starting and ending byte offsets of the
    /// match in the haystack.
    ///
    /// It is always correct to slice the original haystack searched with
    /// this range. That is, because the offsets are guaranteed to fall on
    /// valid UTF-8 boundaries, the range returned is always valid.
    #[inline]
    pub fn range(&self) -> core::ops::Range<usize> {
        self.start..self.end
    }

    /// Returns the substring of the haystack that matched.
    #[inline]
    pub fn as_str(&self) -> &'h str {
        &self.haystack[self.range()]
    }
}

impl<'h> core::fmt::Debug for Match<'h> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("Match")
            .field("start", &self.start)
            .field("end", &self.end)
            .field("string", &self.as_str())
            .finish()
    }
}

impl<'h> From<Match<'h>> for &'h str {
    fn from(m: Match<'h>) -> &'h str {
        m.as_str()
    }
}

impl<'h> From<Match<'h>> for core::ops::Range<usize> {
    fn from(m: Match<'h>) -> core::ops::Range<usize> {
        m.range()
    }
}

/// Represents the capture groups for a single match.
///
/// Capture groups refer to parts of a regex enclosed in parentheses. They
/// can be optionally named. The purpose of capture groups is to be able to
/// reference different parts of a match based on the original pattern. For
/// example, say you want to match the individual letters in a 5-letter word:
///
/// ```text
/// (?<first>\w)(\w)(?:\w)\w(?<last>\w)
/// ```
///
/// This regex has 4 capture groups:
///
/// * The group at index `0` corresponds to the overall match. It is always
/// present in every match and never has a name.
/// * The group at index `1` with name `first` corresponding to the first
/// letter.
/// * The group at index `2` with no name corresponding to the second letter.
/// * The group at index `3` with name `last` corresponding to the fifth and
/// last letter.
///
/// Notice that `(?:\w)` was not listed above as a capture group despite it
/// being enclosed in parentheses. That's because `(?:pattern)` is a special
/// syntax that permits grouping but *without* capturing.
The reason for not /// treating it as a capture is that tracking and reporting capture groups /// requires additional state that may lead to slower searches. So using as few /// capture groups as possible can help performance. (Although the difference /// in performance of a couple of capture groups is likely immaterial.) /// /// Values with this type are created by [`Regex::captures`] or /// [`Regex::captures_iter`]. /// /// `'h` is the lifetime of the haystack that these captures were matched from. /// /// # Example /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"(?<first>\w)(\w)(?:\w)\w(?<last>\w)").unwrap(); /// let caps = re.captures("toady").unwrap(); /// assert_eq!("toady", &caps[0]); /// assert_eq!("t", &caps["first"]); /// assert_eq!("o", &caps[2]); /// assert_eq!("y", &caps["last"]); /// ``` pub struct Captures<'h> { haystack: &'h str, slots: CaptureLocations, // It's a little weird to put the PikeVM in our Captures, but it's the // simplest thing to do and is cheap. The PikeVM gives us access to the // NFA and the NFA gives us access to the capture name<->index mapping. pikevm: Arc<PikeVM>, } impl<'h> Captures<'h> { /// Returns the `Match` associated with the capture group at index `i`. If /// `i` does not correspond to a capture group, or if the capture group did /// not participate in the match, then `None` is returned. /// /// When `i == 0`, this is guaranteed to return a non-`None` value. /// /// # Examples /// /// Get the substring that matched with a default of an empty string if the /// group didn't participate in the match: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new(r"[a-z]+(?:([0-9]+)|([A-Z]+))").unwrap(); /// let caps = re.captures("abc123").unwrap(); /// /// let substr1 = caps.get(1).map_or("", |m| m.as_str()); /// let substr2 = caps.get(2).map_or("", |m| m.as_str()); /// assert_eq!(substr1, "123"); /// assert_eq!(substr2, ""); /// ``` #[inline] pub fn get(&self, i: usize) -> Option<Match<'h>> { self.slots.get(i).map(|(s, e)| Match::new(self.haystack, s, e)) } /// Returns the `Match` associated with the capture group named `name`. If /// `name` isn't a valid capture group or it refers to a group that didn't /// match, then `None` is returned. /// /// Note that unlike `caps["name"]`, this returns a `Match` whose lifetime /// matches the lifetime of the haystack in this `Captures` value. /// Conversely, the substring returned by `caps["name"]` has a lifetime /// of the `Captures` value, which is likely shorter than the lifetime of /// the haystack. In some cases, it may be necessary to use this method to /// access the matching substring instead of the `caps["name"]` notation. /// /// # Examples /// /// Get the substring that matched with a default of an empty string if the /// group didn't participate in the match: /// /// ``` /// use regex_lite::Regex; /// /// let re = Regex::new( /// r"[a-z]+(?:(?<numbers>[0-9]+)|(?<letters>[A-Z]+))", /// ).unwrap(); /// let caps = re.captures("abc123").unwrap(); /// /// let numbers = caps.name("numbers").map_or("", |m| m.as_str()); /// let letters = caps.name("letters").map_or("", |m| m.as_str()); /// assert_eq!(numbers, "123"); /// assert_eq!(letters, ""); /// ``` #[inline] pub fn name(&self, name: &str) -> Option<Match<'h>> { let i = self.pikevm.nfa().to_index(name)?; self.get(i) } /// This is a convenience routine for extracting the substrings /// corresponding to matching capture groups. 
    ///
    /// This returns a tuple where the first element corresponds to the full
    /// substring of the haystack that matched the regex. The second element
    /// is an array of substrings, with each corresponding to the substring
    /// that matched for a particular capture group.
    ///
    /// # Panics
    ///
    /// This panics if the number of possible matching groups in this
    /// `Captures` value is not fixed to `N` in all circumstances. More
    /// precisely, this routine only works when `N` is equivalent to
    /// [`Regex::static_captures_len`] minus one, since the implicit group
    /// for the overall match is returned as the first element of the tuple
    /// rather than as part of the array.
    ///
    /// Stated more plainly, if the number of matching capture groups in a
    /// regex can vary from match to match, then this function always panics.
    ///
    /// For example, `(a)(b)|(c)` could produce two matching capture groups
    /// or one matching capture group for any given match. Therefore, one
    /// cannot use `extract` with such a pattern.
    ///
    /// But a pattern like `(a)(b)|(c)(d)` can be used with `extract` because
    /// the number of capture groups in every match is always equivalent,
    /// even if the capture _indices_ in each match are not.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
    /// let hay = "On 2010-03-14, I became a Tennessee lamb.";
    /// let Some((full, [year, month, day])) =
    ///     re.captures(hay).map(|caps| caps.extract()) else { return };
    /// assert_eq!("2010-03-14", full);
    /// assert_eq!("2010", year);
    /// assert_eq!("03", month);
    /// assert_eq!("14", day);
    /// ```
    ///
    /// # Example: iteration
    ///
    /// This example shows how to use this method when iterating over all
    /// `Captures` matches in a haystack.
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})").unwrap();
    /// let hay = "1973-01-05, 1975-08-25 and 1980-10-18";
    ///
    /// let mut dates: Vec<(&str, &str, &str)> = vec![];
    /// for (_, [y, m, d]) in re.captures_iter(hay).map(|c| c.extract()) {
    ///     dates.push((y, m, d));
    /// }
    /// assert_eq!(dates, vec![
    ///     ("1973", "01", "05"),
    ///     ("1975", "08", "25"),
    ///     ("1980", "10", "18"),
    /// ]);
    /// ```
    ///
    /// # Example: parsing different formats
    ///
    /// This API is particularly useful when you need to extract a particular
    /// value that might occur in a different format. Consider, for example,
    /// an identifier that might be in double quotes or single quotes:
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r#"id:(?:"([^"]+)"|'([^']+)')"#).unwrap();
    /// let hay = r#"The first is id:"foo" and the second is id:'bar'."#;
    /// let mut ids = vec![];
    /// for (_, [id]) in re.captures_iter(hay).map(|c| c.extract()) {
    ///     ids.push(id);
    /// }
    /// assert_eq!(ids, vec!["foo", "bar"]);
    /// ```
    pub fn extract<const N: usize>(&self) -> (&'h str, [&'h str; N]) {
        let len = self
            .pikevm
            .nfa()
            .static_explicit_captures_len()
            .expect("number of capture groups can vary in a match");
        assert_eq!(N, len, "asked for {} groups, but must ask for {}", N, len);
        let mut matched = self.iter().flatten();
        let whole_match = matched.next().expect("a match").as_str();
        let group_matches = [0; N].map(|_| {
            matched.next().expect("too few matching groups").as_str()
        });
        (whole_match, group_matches)
    }

    /// Expands all instances of `$ref` in `replacement` to the corresponding
    /// capture group, and writes them to the `dst` buffer given. A `ref` can
    /// be a capture group index or a name. If `ref` doesn't refer to a
    /// capture group that participated in the match, then it is replaced
    /// with the empty string.
    ///
    /// # Format
    ///
    /// The format of the replacement string supports two different kinds of
    /// capture references: unbraced and braced.
    ///
    /// For the unbraced format, the format supported is `$ref` where `ref`
    /// can be any sequence of characters drawn from the class
    /// `[0-9A-Za-z_]`. `ref` is always the longest possible parse. So for
    /// example, `$1a` corresponds to the capture group named `1a` and not
    /// the capture group at index `1`. If `ref` matches `^[0-9]+$`, then it
    /// is treated as a capture group index itself and not a name.
    ///
    /// For the braced format, the format supported is `${ref}` where `ref`
    /// can be any sequence of bytes except for `}`. If no closing brace
    /// occurs, then it is not considered a capture reference. As with the
    /// unbraced format, if `ref` matches `^[0-9]+$`, then it is treated as a
    /// capture group index and not a name.
    ///
    /// The braced format is useful for exerting precise control over the
    /// name of the capture reference. For example, `${1}a` corresponds to
    /// the capture group reference `1` followed by the letter `a`, whereas
    /// `$1a` (as mentioned above) corresponds to the capture group reference
    /// `1a`. The braced format is also useful for expressing capture group
    /// names that use characters not supported by the unbraced format. For
    /// example, `${foo[bar].baz}` refers to the capture group named
    /// `foo[bar].baz`.
    ///
    /// If a capture group reference is found and it does not refer to a
    /// valid capture group, then it will be replaced with the empty string.
    ///
    /// To write a literal `$`, use `$$`.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(
    ///     r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
    /// ).unwrap();
    /// let hay = "On 14-03-2010, I became a Tennessee lamb.";
    /// let caps = re.captures(hay).unwrap();
    ///
    /// let mut dst = String::new();
    /// caps.expand("year=$year, month=$month, day=$day", &mut dst);
    /// assert_eq!(dst, "year=2010, month=03, day=14");
    /// ```
    #[inline]
    pub fn expand(&self, replacement: &str, dst: &mut String) {
        interpolate::string(
            replacement,
            |index, dst| {
                let m = match self.get(index) {
                    None => return,
                    Some(m) => m,
                };
                dst.push_str(&self.haystack[m.range()]);
            },
            |name| self.pikevm.nfa().to_index(name),
            dst,
        );
    }

    /// Returns an iterator over all capture groups. This includes both
    /// matching and non-matching groups.
    ///
    /// The iterator always yields at least one matching group: the first
    /// group (at index `0`) with no name. Subsequent groups are returned in
    /// the order of their opening parenthesis in the regex.
    ///
    /// The elements yielded have type `Option<Match<'h>>`, where a non-`None`
    /// value is present if the capture group matches.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
    /// let caps = re.captures("AZ").unwrap();
    ///
    /// let mut it = caps.iter();
    /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("AZ"));
    /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("A"));
    /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), None);
    /// assert_eq!(it.next().unwrap().map(|m| m.as_str()), Some("Z"));
    /// assert_eq!(it.next(), None);
    /// ```
    #[inline]
    pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 'h> {
        SubCaptureMatches {
            caps: self,
            it: self.pikevm.nfa().capture_names().enumerate(),
        }
    }

    /// Returns the total number of capture groups. This includes both
    /// matching and non-matching groups.
    ///
    /// The length returned is always equivalent to the number of elements
    /// yielded by [`Captures::iter`]. Consequently, the length is always
    /// greater than zero since every `Captures` value always includes the
    /// match for the entire regex.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"(\w)(\d)?(\w)").unwrap();
    /// let caps = re.captures("AZ").unwrap();
    /// assert_eq!(caps.len(), 4);
    /// ```
    #[inline]
    pub fn len(&self) -> usize {
        self.pikevm.nfa().group_len()
    }
}

impl<'h> core::fmt::Debug for Captures<'h> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        /// A little helper type to provide a nice map-like debug
        /// representation for our capturing group spans.
        ///
        /// regex-automata has something similar, but it includes the pattern
        /// ID in its debug output, which is confusing. It also doesn't
        /// include the strings that match because a regex-automata
        /// `Captures` doesn't borrow the haystack.
        struct CapturesDebugMap<'a> {
            caps: &'a Captures<'a>,
        }

        impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                let mut map = f.debug_map();
                let names = self.caps.pikevm.nfa().capture_names();
                for (group_index, maybe_name) in names.enumerate() {
                    let key = Key(group_index, maybe_name);
                    match self.caps.get(group_index) {
                        None => map.entry(&key, &None::<()>),
                        Some(mat) => map.entry(&key, &Value(mat)),
                    };
                }
                map.finish()
            }
        }

        struct Key<'a>(usize, Option<&'a str>);

        impl<'a> core::fmt::Debug for Key<'a> {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                write!(f, "{}", self.0)?;
                if let Some(name) = self.1 {
                    write!(f, "/{:?}", name)?;
                }
                Ok(())
            }
        }

        struct Value<'a>(Match<'a>);

        impl<'a> core::fmt::Debug for Value<'a> {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                write!(
                    f,
                    "{}..{}/{:?}",
                    self.0.start(),
                    self.0.end(),
                    self.0.as_str()
                )
            }
        }

        f.debug_tuple("Captures")
            .field(&CapturesDebugMap { caps: self })
            .finish()
    }
}

/// Get a matching capture group's haystack substring by index.
///
/// The haystack substring returned can't outlive the `Captures` object if
/// this method is used, because of how `Index` is defined (normally `a[i]`
/// is part of `a` and can't outlive it). To work around this limitation, use
/// [`Captures::get`] instead.
///
/// `'h` is the lifetime of the matched haystack, but the lifetime of the
/// `&str` returned by this implementation is the lifetime of the `Captures`
/// value itself.
///
/// # Panics
///
/// If there is no matching group at the given index.
impl<'h> core::ops::Index<usize> for Captures<'h> {
    type Output = str;

    // The lifetime is written out to make it clear that the &str returned
    // does NOT have a lifetime equivalent to 'h.
    fn index<'a>(&'a self, i: usize) -> &'a str {
        self.get(i)
            .map(|m| m.as_str())
            .unwrap_or_else(|| panic!("no group at index '{}'", i))
    }
}

/// Get a matching capture group's haystack substring by name.
///
/// The haystack substring returned can't outlive the `Captures` object if
/// this method is used, because of how `Index` is defined (normally `a[i]`
/// is part of `a` and can't outlive it). To work around this limitation, use
/// [`Captures::get`] instead.
///
/// `'h` is the lifetime of the matched haystack, but the lifetime of the
/// `&str` returned by this implementation is the lifetime of the `Captures`
/// value itself.
///
/// `'n` is the lifetime of the group name used to index the `Captures`
/// value.
///
/// # Panics
///
/// If there is no matching group with the given name.
impl<'h, 'n> core::ops::Index<&'n str> for Captures<'h> {
    type Output = str;

    fn index<'a>(&'a self, name: &'n str) -> &'a str {
        self.name(name)
            .map(|m| m.as_str())
            .unwrap_or_else(|| panic!("no group named '{}'", name))
    }
}

/// A low level representation of the byte offsets of each capture group.
///
/// You can think of this as a lower level [`Captures`], where this type does
/// not support named capturing groups directly and it does not borrow the
/// haystack that these offsets were matched on.
///
/// Primarily, this type is useful when using the lower level `Regex` APIs
/// such as [`Regex::captures_read`], which permits amortizing the allocation
/// in which capture match offsets are stored.
///
/// In order to build a value of this type, you'll need to call the
/// [`Regex::capture_locations`] method. The value returned can then be
/// reused in subsequent searches for that regex. Using it for other regexes
/// may result in a panic or otherwise incorrect results.
///
/// # Example
///
/// This example shows how to create and use `CaptureLocations` in a search.
///
/// ```
/// use regex_lite::Regex;
///
/// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
/// let mut locs = re.capture_locations();
/// let m = re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
/// assert_eq!(0..17, m.range());
/// assert_eq!(Some((0, 17)), locs.get(0));
/// assert_eq!(Some((0, 5)), locs.get(1));
/// assert_eq!(Some((6, 17)), locs.get(2));
///
/// // Asking for an invalid capture group always returns None.
/// assert_eq!(None, locs.get(3));
/// # // literals are too big for 32-bit usize: #1041
/// # #[cfg(target_pointer_width = "64")]
/// assert_eq!(None, locs.get(34973498648));
/// # #[cfg(target_pointer_width = "64")]
/// assert_eq!(None, locs.get(9944060567225171988));
/// ```
#[derive(Clone, Debug)]
pub struct CaptureLocations(Vec<Option<NonMaxUsize>>);

impl CaptureLocations {
    /// Returns the start and end byte offsets of the capture group at index
    /// `i`. This returns `None` if `i` is not a valid capture group or if
    /// the capture group did not match.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
    /// let mut locs = re.capture_locations();
    /// re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
    /// assert_eq!(Some((0, 17)), locs.get(0));
    /// assert_eq!(Some((0, 5)), locs.get(1));
    /// assert_eq!(Some((6, 17)), locs.get(2));
    /// ```
    #[inline]
    pub fn get(&self, i: usize) -> Option<(usize, usize)> {
        let slot = i.checked_mul(2)?;
        let start = self.0.get(slot).copied()??.get();
        let slot = slot.checked_add(1)?;
        let end = self.0.get(slot).copied()??.get();
        Some((start, end))
    }

    /// Returns the total number of capture groups (even if they didn't
    /// match). That is, the length returned is unaffected by the result of
    /// a search.
    ///
    /// This is always at least `1` since every regex has at least `1`
    /// capturing group that corresponds to the entire match.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"(?<first>\w+)\s+(?<last>\w+)").unwrap();
    /// let mut locs = re.capture_locations();
    /// assert_eq!(3, locs.len());
    /// re.captures_read(&mut locs, "Bruce Springsteen").unwrap();
    /// assert_eq!(3, locs.len());
    /// ```
    ///
    /// Notice that the length is always at least `1`, regardless of the
    /// regex:
    ///
    /// ```
    /// use regex_lite::Regex;
    ///
    /// let re = Regex::new(r"").unwrap();
    /// let locs = re.capture_locations();
    /// assert_eq!(1, locs.len());
    ///
    /// // `[^\s\S]` is a regex that never matches anything.
    /// let re = Regex::new(r"[^\s\S]").unwrap();
    /// let locs = re.capture_locations();
    /// assert_eq!(1, locs.len());
    /// ```
    #[inline]
    pub fn len(&self) -> usize {
        // We always have twice as many slots as groups.
        self.0.len().checked_shr(1).unwrap()
    }
}

/// An iterator over all non-overlapping matches in a haystack.
///
/// This iterator yields [`Match`] values. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'h` is the
/// lifetime of the haystack.
///
/// This iterator is created by [`Regex::find_iter`].
///
/// # Time complexity
///
/// Note that since an iterator runs potentially many searches on the
/// haystack and since each search has worst case `O(m * n)` time complexity,
/// the overall worst case time complexity for iteration is `O(m * n^2)`.
#[derive(Debug)]
pub struct Matches<'r, 'h> {
    haystack: &'h str,
    it: pikevm::FindMatches<'r, 'h>,
}

impl<'r, 'h> Iterator for Matches<'r, 'h> {
    type Item = Match<'h>;

    #[inline]
    fn next(&mut self) -> Option<Match<'h>> {
        self.it.next().map(|(s, e)| Match::new(self.haystack, s, e))
    }

    #[inline]
    fn count(self) -> usize {
        self.it.count()
    }
}

impl<'r, 'h> core::iter::FusedIterator for Matches<'r, 'h> {}

/// An iterator over all non-overlapping capture matches in a haystack.
///
/// This iterator yields [`Captures`] values. The iterator stops when no more
/// matches can be found.
///
/// `'r` is the lifetime of the compiled regular expression and `'h` is the
/// lifetime of the matched string.
///
/// This iterator is created by [`Regex::captures_iter`].
///
/// # Time complexity
///
/// Note that since an iterator runs potentially many searches on the
/// haystack and since each search has worst case `O(m * n)` time complexity,
/// the overall worst case time complexity for iteration is `O(m * n^2)`.
#[derive(Debug)]
pub struct CaptureMatches<'r, 'h> {
    haystack: &'h str,
    re: &'r Regex,
    it: pikevm::CapturesMatches<'r, 'h>,
}

impl<'r, 'h> Iterator for CaptureMatches<'r, 'h> {
    type Item = Captures<'h>;

    #[inline]
    fn next(&mut self) -> Option<Captures<'h>> {
        self.it.next().map(|slots| Captures {
            haystack: self.haystack,
            slots: CaptureLocations(slots),
            pikevm: Arc::clone(&self.re.pikevm),
        })
    }

    #[inline]
    fn count(self) -> usize {
        self.it.count()
    }
}

impl<'r, 'h> core::iter::FusedIterator for CaptureMatches<'r, 'h> {}

/// An iterator over all substrings delimited by a regex match.
///
/// `'r` is the lifetime of the compiled regular expression and `'h` is the
/// lifetime of the string being split.
///
/// This iterator is created by [`Regex::split`].
///
/// # Time complexity
///
/// Note that since an iterator runs potentially many searches on the
/// haystack and since each search has worst case `O(m * n)` time complexity,
/// the overall worst case time complexity for iteration is `O(m * n^2)`.
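///
/// # Example
///
/// A small sketch of how a `Split` value, as returned by [`Regex::split`],
/// might be consumed manually rather than collected:
///
/// ```
/// use regex_lite::Regex;
///
/// let re = Regex::new(r",\s*").unwrap();
/// let mut it = re.split("a, b,c");
/// assert_eq!(it.next(), Some("a"));
/// assert_eq!(it.next(), Some("b"));
/// assert_eq!(it.next(), Some("c"));
/// assert_eq!(it.next(), None);
/// ```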
#[derive(Debug)]
pub struct Split<'r, 'h> {
    haystack: &'h str,
    finder: Matches<'r, 'h>,
    last: usize,
}

impl<'r, 'h> Iterator for Split<'r, 'h> {
    type Item = &'h str;

    #[inline]
    fn next(&mut self) -> Option<&'h str> {
        match self.finder.next() {
            None => {
                let len = self.haystack.len();
                if self.last > len {
                    None
                } else {
                    let range = self.last..len;
                    self.last = len + 1; // Next call will return None
                    Some(&self.haystack[range])
                }
            }
            Some(m) => {
                let range = self.last..m.start();
                self.last = m.end();
                Some(&self.haystack[range])
            }
        }
    }
}

impl<'r, 't> core::iter::FusedIterator for Split<'r, 't> {}

/// An iterator over at most `N` substrings delimited by a regex match.
///
/// The last substring yielded by this iterator will be whatever remains
/// after `N-1` splits.
///
/// `'r` is the lifetime of the compiled regular expression and `'h` is the
/// lifetime of the string being split.
///
/// This iterator is created by [`Regex::splitn`].
///
/// # Time complexity
///
/// Note that since an iterator runs potentially many searches on the
/// haystack and since each search has worst case `O(m * n)` time complexity,
/// the overall worst case time complexity for iteration is `O(m * n^2)`.
///
/// Although note that the worst case time here has an upper bound given
/// by the `limit` parameter to [`Regex::splitn`].
#[derive(Debug)]
pub struct SplitN<'r, 'h> {
    splits: Split<'r, 'h>,
    limit: usize,
}

impl<'r, 'h> Iterator for SplitN<'r, 'h> {
    type Item = &'h str;

    #[inline]
    fn next(&mut self) -> Option<&'h str> {
        if self.limit == 0 {
            return None;
        }
        self.limit -= 1;
        if self.limit > 0 {
            return self.splits.next();
        }
        let len = self.splits.haystack.len();
        if self.splits.last > len {
            // We've already returned all substrings.
            None
        } else {
            // self.limit == 0, so future calls will return None immediately
            Some(&self.splits.haystack[self.splits.last..len])
        }
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.splits.size_hint()
    }
}

impl<'r, 't> core::iter::FusedIterator for SplitN<'r, 't> {}

/// An iterator over the names of all capture groups in a regex.
///
/// This iterator yields values of type `Option<&str>` in order of the
/// opening capture group parenthesis in the regex pattern. `None` is yielded
/// for groups with no name. The first element always corresponds to the
/// implicit and unnamed group for the overall match.
///
/// `'r` is the lifetime of the compiled regular expression.
///
/// This iterator is created by [`Regex::capture_names`].
#[derive(Clone, Debug)]
pub struct CaptureNames<'r>(nfa::CaptureNames<'r>);

impl<'r> Iterator for CaptureNames<'r> {
    type Item = Option<&'r str>;

    #[inline]
    fn next(&mut self) -> Option<Option<&'r str>> {
        self.0.next()
    }

    #[inline]
    fn size_hint(&self) -> (usize, Option<usize>) {
        self.0.size_hint()
    }

    #[inline]
    fn count(self) -> usize {
        self.0.count()
    }
}

impl<'r> ExactSizeIterator for CaptureNames<'r> {}
impl<'r> core::iter::FusedIterator for CaptureNames<'r> {}

/// An iterator over all group matches in a [`Captures`] value.
///
/// This iterator yields values of type `Option<Match<'h>>`, where `'h` is
/// the lifetime of the haystack that the matches are for. The order of
/// elements yielded corresponds to the order of the opening parenthesis for
/// the group in the regex pattern. `None` is yielded for groups that did not
/// participate in the match.
///
/// The first element always corresponds to the implicit group for the
/// overall match.
Since this iterator is created by a [`Captures`] value, and a /// `Captures` value is only created when a match occurs, it follows that the /// first element yielded by this iterator is guaranteed to be non-`None`. /// /// The lifetime `'c` corresponds to the lifetime of the `Captures` value that /// created this iterator, and the lifetime `'h` corresponds to the originally /// matched haystack. #[derive(Clone, Debug)] pub struct SubCaptureMatches<'c, 'h> { caps: &'c Captures<'h>, it: core::iter::Enumerate<nfa::CaptureNames<'c>>, } impl<'c, 'h> Iterator for SubCaptureMatches<'c, 'h> { type Item = Option<Match<'h>>; #[inline] fn next(&mut self) -> Option<Option<Match<'h>>> { let (group_index, _) = self.it.next()?; Some(self.caps.get(group_index)) } #[inline] fn size_hint(&self) -> (usize, Option<usize>) { self.it.size_hint() } #[inline] fn count(self) -> usize { self.it.count() } } impl<'c, 'h> ExactSizeIterator for SubCaptureMatches<'c, 'h> {} impl<'c, 'h> core::iter::FusedIterator for SubCaptureMatches<'c, 'h> {} /// A trait for types that can be used to replace matches in a haystack. /// /// In general, users of this crate shouldn't need to implement this trait, /// since implementations are already provided for `&str` along with other /// variants of string types, as well as `FnMut(&Captures) -> String` (or any /// `FnMut(&Captures) -> T` where `T: AsRef<str>`). Those cover most use cases, /// but callers can implement this trait directly if necessary. /// /// # Example /// /// This example shows a basic implementation of the `Replacer` trait. This /// can be done much more simply using the replacement string interpolation /// support (e.g., `$first $last`), but this approach avoids needing to parse /// the replacement string at all. /// /// ``` /// use regex_lite::{Captures, Regex, Replacer}; /// /// struct NameSwapper; /// /// impl Replacer for NameSwapper { /// fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { /// dst.push_str(&caps["first"]); /// dst.push_str(" "); /// dst.push_str(&caps["last"]); /// } /// } /// /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(?<first>\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", NameSwapper); /// assert_eq!(result, "Bruce Springsteen"); /// ``` pub trait Replacer { /// Appends possibly empty data to `dst` to replace the current match. /// /// The current match is represented by `caps`, which is guaranteed to /// have a match at capture group `0`. /// /// For example, a no-op replacement would be `dst.push_str(&caps[0])`. fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); /// Return a fixed unchanging replacement string. /// /// When doing replacements, if access to [`Captures`] is not needed (e.g., /// the replacement string does not need `$` expansion), then it can be /// beneficial to avoid finding sub-captures. /// /// In general, this is called once for every call to a replacement routine /// such as [`Regex::replace_all`]. fn no_expansion<'r>(&'r mut self) -> Option<Cow<'r, str>> { None } /// Returns a type that implements `Replacer`, but that borrows and wraps /// this `Replacer`. /// /// This is useful when you want to take a generic `Replacer` (which might /// not be cloneable) and use it without consuming it, so it can be used /// more than once. 
/// /// # Example /// /// ``` /// use regex_lite::{Regex, Replacer}; /// /// fn replace_all_twice<R: Replacer>( /// re: Regex, /// src: &str, /// mut rep: R, /// ) -> String { /// let dst = re.replace_all(src, rep.by_ref()); /// let dst = re.replace_all(&dst, rep.by_ref()); /// dst.into_owned() /// } /// ``` fn by_ref<'r>(&'r mut self) -> ReplacerRef<'r, Self> { ReplacerRef(self) } } impl<'a> Replacer for &'a str { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { caps.expand(*self, dst); } fn no_expansion(&mut self) -> Option<Cow<'_, str>> { no_expansion(self) } } impl<'a> Replacer for &'a String { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.as_str().replace_append(caps, dst) } fn no_expansion(&mut self) -> Option<Cow<'_, str>> { no_expansion(self) } } impl Replacer for String { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.as_str().replace_append(caps, dst) } fn no_expansion(&mut self) -> Option<Cow<'_, str>> { no_expansion(self) } } impl<'a> Replacer for Cow<'a, str> { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.as_ref().replace_append(caps, dst) } fn no_expansion(&mut self) -> Option<Cow<'_, str>> { no_expansion(self) } } impl<'a> Replacer for &'a Cow<'a, str> { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.as_ref().replace_append(caps, dst) } fn no_expansion(&mut self) -> Option<Cow<'_, str>> { no_expansion(self) } } impl<F, T> Replacer for F where F: FnMut(&Captures<'_>) -> T, T: AsRef<str>, { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { dst.push_str((*self)(caps).as_ref()); } } /// A by-reference adaptor for a [`Replacer`]. /// /// This permits reusing the same `Replacer` value in multiple calls to a /// replacement routine like [`Regex::replace_all`]. /// /// This type is created by [`Replacer::by_ref`]. #[derive(Debug)] pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.0.replace_append(caps, dst) } fn no_expansion(&mut self) -> Option<Cow<'_, str>> { self.0.no_expansion() } } /// A helper type for forcing literal string replacement. /// /// It can be used with routines like [`Regex::replace`] and /// [`Regex::replace_all`] to do a literal string replacement without expanding /// occurrences of `$name` to their corresponding capture groups. This can be /// both convenient (to avoid escaping `$`, for example) and faster (since /// capture groups don't need to be found). /// /// `'t` is the lifetime of the literal string to use. /// /// # Example /// /// ``` /// use regex_lite::{NoExpand, Regex}; /// /// let re = Regex::new(r"(?<last>[^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); /// assert_eq!(result, "$2 $last"); /// ``` #[derive(Clone, Debug)] pub struct NoExpand<'t>(pub &'t str); impl<'t> Replacer for NoExpand<'t> { fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) { dst.push_str(self.0); } fn no_expansion(&mut self) -> Option<Cow<'_, str>> { Some(Cow::Borrowed(self.0)) } } /// Quickly checks the given replacement string for whether interpolation /// should be done on it. It returns `None` if a `$` was found anywhere in the /// given string, which suggests interpolation needs to be done. But if there's /// no `$` anywhere, then interpolation definitely does not need to be done.
In /// that case, the given string is returned as a borrowed `Cow`. /// /// This is meant to be used to implement the `Replacer::no_expansion` method /// in its various trait impls. fn no_expansion<T: AsRef<str>>(t: &T) -> Option<Cow<'_, str>> { let s = t.as_ref(); match s.find('$') { Some(_) => None, None => Some(Cow::Borrowed(s)), } } /// A configurable builder for a [`Regex`]. /// /// This builder can be used to programmatically set flags such as `i` (case /// insensitive) and `x` (for verbose mode). This builder can also be used to /// configure things like a size limit on the compiled regular expression. #[derive(Debug)] pub struct RegexBuilder { pattern: String, hir_config: hir::Config, nfa_config: nfa::Config, } impl RegexBuilder { /// Create a new builder with a default configuration for the given /// pattern. /// /// If the pattern is invalid or exceeds the configured size limits, then /// an error will be returned when [`RegexBuilder::build`] is called. pub fn new(pattern: &str) -> RegexBuilder { RegexBuilder { pattern: pattern.to_string(), hir_config: hir::Config::default(), nfa_config: nfa::Config::default(), } } /// Compiles the pattern given to `RegexBuilder::new` with the /// configuration set on this builder. /// /// If the pattern isn't a valid regex or if a configured size limit was /// exceeded, then an error is returned. pub fn build(&self) -> Result<Regex, Error> { let hir = Hir::parse(self.hir_config, &self.pattern)?; let nfa = NFA::new(self.nfa_config, self.pattern.clone(), &hir)?; let pikevm = Arc::new(PikeVM::new(nfa)); let pool = { let pikevm = Arc::clone(&pikevm); let create = Box::new(move || Cache::new(&pikevm)); CachePool::new(create) }; Ok(Regex { pikevm, pool }) } /// This configures whether to enable ASCII case insensitive matching for /// the entire pattern. /// /// This setting can also be configured using the inline flag `i` /// in the pattern. For example, `(?i:foo)` matches `foo` case /// insensitively while `(?-i:foo)` matches `foo` case sensitively. /// /// The default for this is `false`. /// /// # Example /// /// ``` /// use regex_lite::RegexBuilder; /// /// let re = RegexBuilder::new(r"foo(?-i:bar)quux") /// .case_insensitive(true) /// .build() /// .unwrap(); /// assert!(re.is_match("FoObarQuUx")); /// // Even though case insensitive matching is enabled in the builder, /// // it can be locally disabled within the pattern. In this case, /// // `bar` is matched case sensitively. /// assert!(!re.is_match("fooBARquux")); /// ``` pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.case_insensitive = yes; self } /// This configures multi-line mode for the entire pattern. /// /// Enabling multi-line mode changes the behavior of the `^` and `$` anchor /// assertions. Instead of only matching at the beginning and end of a /// haystack, respectively, multi-line mode causes them to match at the /// beginning and end of a line *in addition* to the beginning and end of /// a haystack. More precisely, `^` will match at the position immediately /// following a `\n` and `$` will match at the position immediately /// preceding a `\n`. /// /// The behavior of this option is impacted by the [`RegexBuilder::crlf`] /// setting. Namely, CRLF mode changes the line terminator to be either /// `\r` or `\n`, but `^` and `$` will never match at the position between /// a `\r` and a `\n`. /// /// This setting can also be configured using the inline flag `m` in the /// pattern. /// /// The default for this is `false`.
/// /// # Example /// /// ``` /// use regex_lite::RegexBuilder; /// /// let re = RegexBuilder::new(r"^foo$") /// .multi_line(true) /// .build() /// .unwrap(); /// assert_eq!(Some(1..4), re.find("\nfoo\n").map(|m| m.range())); /// ``` pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.multi_line = yes; self } /// This configures dot-matches-new-line mode for the entire pattern. /// /// Perhaps surprisingly, the default behavior for `.` is not to match /// any character, but rather, to match any character except for the line /// terminator (which is `\n` by default). When this mode is enabled, the /// behavior changes such that `.` truly matches any character. /// /// This setting can also be configured using the inline flag `s` in the /// pattern. /// /// The default for this is `false`. /// /// # Example /// /// ``` /// use regex_lite::RegexBuilder; /// /// let re = RegexBuilder::new(r"foo.bar") /// .dot_matches_new_line(true) /// .build() /// .unwrap(); /// let hay = "foo\nbar"; /// assert_eq!(Some("foo\nbar"), re.find(hay).map(|m| m.as_str())); /// ``` pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.dot_matches_new_line = yes; self } /// This configures CRLF mode for the entire pattern. /// /// When CRLF mode is enabled, both `\r` ("carriage return" or CR for /// short) and `\n` ("line feed" or LF for short) are treated as line /// terminators. This results in the following: /// /// * Unless dot-matches-new-line mode is enabled, `.` will now match any /// character except for `\n` and `\r`. /// * When multi-line mode is enabled, `^` will match immediately /// following a `\n` or a `\r`. Similarly, `$` will match immediately /// preceding a `\n` or a `\r`. Neither `^` nor `$` will ever match between /// `\r` and `\n`. /// /// This setting can also be configured using the inline flag `R` in /// the pattern. /// /// The default for this is `false`. /// /// # Example /// /// ``` /// use regex_lite::RegexBuilder; /// /// let re = RegexBuilder::new(r"^foo$") /// .multi_line(true) /// .crlf(true) /// .build() /// .unwrap(); /// let hay = "\r\nfoo\r\n"; /// // If CRLF mode weren't enabled here, then '$' wouldn't match /// // immediately after 'foo', and thus no match would be found. /// assert_eq!(Some("foo"), re.find(hay).map(|m| m.as_str())); /// ``` /// /// This example demonstrates that `^` will never match at a position /// between `\r` and `\n`. (`$` will similarly not match between a `\r` /// and a `\n`.) /// /// ``` /// use regex_lite::RegexBuilder; /// /// let re = RegexBuilder::new(r"^") /// .multi_line(true) /// .crlf(true) /// .build() /// .unwrap(); /// let hay = "\r\n\r\n"; /// let ranges: Vec<_> = re.find_iter(hay).map(|m| m.range()).collect(); /// assert_eq!(ranges, vec![0..0, 2..2, 4..4]); /// ``` pub fn crlf(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.crlf = yes; self } /// This configures swap-greed mode for the entire pattern. /// /// When swap-greed mode is enabled, patterns like `a+` will become /// non-greedy and patterns like `a+?` will become greedy. In other words, /// the meanings of `a+` and `a+?` are switched. /// /// This setting can also be configured using the inline flag `U` in the /// pattern. /// /// The default for this is `false`. 
/// /// # Example /// /// ``` /// use regex_lite::RegexBuilder; /// /// let re = RegexBuilder::new(r"a+") /// .swap_greed(true) /// .build() /// .unwrap(); /// assert_eq!(Some("a"), re.find("aaa").map(|m| m.as_str())); /// ``` pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.swap_greed = yes; self } /// This configures verbose mode for the entire pattern. /// /// When enabled, whitespace will be treated as insignificant in the /// pattern and `#` can be used to start a comment until the next new line. /// /// Normally, in most places in a pattern, whitespace is treated literally. /// For example, ` +` will match one or more ASCII whitespace characters. /// /// When verbose mode is enabled, `\#` can be used to match a literal `#` /// and `\ ` can be used to match a literal ASCII whitespace character. /// /// Verbose mode is useful for permitting regexes to be formatted and /// broken up more nicely. This may make them more easily readable. /// /// This setting can also be configured using the inline flag `x` in the /// pattern. /// /// The default for this is `false`. /// /// # Example /// /// ``` /// use regex_lite::RegexBuilder; /// /// let pat = r" /// \b /// (?<first>[A-Z]\w*) # always start with uppercase letter /// \s+ # whitespace should separate names /// (?: # middle name can be an initial! /// (?:(?<initial>[A-Z])\.|(?<middle>[A-Z]\w*)) /// \s+ /// )? /// (?<last>[A-Z]\w*) /// \b /// "; /// let re = RegexBuilder::new(pat) /// .ignore_whitespace(true) /// .build() /// .unwrap(); /// /// let caps = re.captures("Harry Potter").unwrap(); /// assert_eq!("Harry", &caps["first"]); /// assert_eq!("Potter", &caps["last"]); /// /// let caps = re.captures("Harry J. Potter").unwrap(); /// assert_eq!("Harry", &caps["first"]); /// // Since a middle name/initial isn't required for an overall match, /// // we can't assume that 'initial' or 'middle' will be populated! /// assert_eq!(Some("J"), caps.name("initial").map(|m| m.as_str())); /// assert_eq!(None, caps.name("middle").map(|m| m.as_str())); /// assert_eq!("Potter", &caps["last"]); /// /// let caps = re.captures("Harry James Potter").unwrap(); /// assert_eq!("Harry", &caps["first"]); /// // Since a middle name/initial isn't required for an overall match, /// // we can't assume that 'initial' or 'middle' will be populated! /// assert_eq!(None, caps.name("initial").map(|m| m.as_str())); /// assert_eq!(Some("James"), caps.name("middle").map(|m| m.as_str())); /// assert_eq!("Potter", &caps["last"]); /// ``` pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { self.hir_config.flags.ignore_whitespace = yes; self } /// Sets the approximate size limit, in bytes, of the compiled regex. /// /// This roughly corresponds to the amount of heap memory, in bytes, /// occupied by a single regex. If the regex would otherwise approximately /// exceed this limit, then compiling that regex will fail. /// /// The main utility of a method like this is to avoid compiling regexes /// that use an unexpected amount of resources, such as time and memory. /// Even if the memory usage of a large regex is acceptable, its search /// time may not be. Namely, worst case time complexity for search is `O(m /// * n)`, where `m ~ len(pattern)` and `n ~ len(haystack)`. That is, /// search time depends, in part, on the size of the compiled regex. This /// means that putting a limit on the size of the regex limits how much a /// regex can impact search time.
/// /// The default for this is some reasonable number that permits most /// patterns to compile successfully. /// /// # Example /// /// ``` /// use regex_lite::RegexBuilder; /// /// assert!(RegexBuilder::new(r"\w").size_limit(100).build().is_err()); /// ``` pub fn size_limit(&mut self, limit: usize) -> &mut RegexBuilder { self.nfa_config.size_limit = Some(limit); self } /// Set the nesting limit for this parser. /// /// The nesting limit controls how deep the abstract syntax tree is allowed /// to be. If the AST exceeds the given limit (e.g., with too many nested /// groups), then an error is returned by the parser. /// /// The purpose of this limit is to act as a heuristic to prevent stack /// overflow for consumers that do structural induction on an AST using /// explicit recursion. While this crate never does this (instead using /// constant stack space and moving the call stack to the heap), other /// crates may. /// /// This limit is not checked until the entire AST is parsed. Therefore, if /// callers want to put a limit on the amount of heap space used, then they /// should impose a limit on the length, in bytes, of the concrete pattern /// string. In particular, this is viable since this parser implementation /// will limit itself to heap space proportional to the length of the /// pattern string. See also the [untrusted inputs](crate#untrusted-input) /// section in the top-level crate documentation for more information about /// this. /// /// Note that a nest limit of `0` will return a nest limit error for most /// patterns but not all. For example, a nest limit of `0` permits `a` but /// not `ab`, since `ab` requires an explicit concatenation, which results /// in a nest depth of `1`. In general, a nest limit is not something that /// manifests in an obvious way in the concrete syntax, therefore, it /// should not be used in a granular way. /// /// # Example /// /// ``` /// use regex_lite::RegexBuilder; /// /// assert!(RegexBuilder::new(r"").nest_limit(0).build().is_ok()); /// assert!(RegexBuilder::new(r"a").nest_limit(0).build().is_ok()); /// assert!(RegexBuilder::new(r"(a)").nest_limit(0).build().is_err()); /// ``` pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { self.hir_config.nest_limit = limit; self } }
regex-lite-0.1.6/src/utf8.rs
/// Returns true if and only if the given byte is considered a word character. /// This only applies to ASCII. pub(crate) fn is_word_byte(b: u8) -> bool { const fn mkwordset() -> [bool; 256] { // FIXME: Use as_usize() once const functions in traits are stable.
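// What follows builds a 256-entry lookup table in which an index is `true`
// precisely for the ASCII word bytes: `_`, `0-9`, `A-Z` and `a-z`. Doing
// this in a `const fn` means the table is computed once at compile time
// rather than on every call.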
let mut set = [false; 256]; set[b'_' as usize] = true; let mut byte = b'0'; while byte <= b'9' { set[byte as usize] = true; byte += 1; } byte = b'A'; while byte <= b'Z' { set[byte as usize] = true; byte += 1; } byte = b'a'; while byte <= b'z' { set[byte as usize] = true; byte += 1; } set } const WORD: [bool; 256] = mkwordset(); WORD[b as usize] } /// The accept state index. When we enter this state, we know we've found a /// valid Unicode scalar value. const ACCEPT: usize = 12; /// The reject state index. When we enter this state, we know that we've found /// invalid UTF-8. const REJECT: usize = 0; /// Like `decode`, but automatically converts the `None` case to the /// replacement codepoint. pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) { match decode(slice) { (Some(ch), size) => (ch, size), (None, size) => ('\u{FFFD}', size), } } /// UTF-8 decode a single Unicode scalar value from the beginning of a slice. /// /// When successful, the corresponding Unicode scalar value is returned along /// with the number of bytes it was encoded with. The number of bytes consumed /// for a successful decode is always between 1 and 4, inclusive. /// /// When unsuccessful, `None` is returned along with the number of bytes that /// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case, /// the number of bytes consumed is always between 0 and 3, inclusive, where /// 0 is only returned when `slice` is empty. pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) { let slice = slice.as_ref(); match slice.get(0) { None => return (None, 0), Some(&b) if b <= 0x7F => return (Some(b as char), 1), _ => {} } let (mut state, mut cp, mut i) = (ACCEPT, 0, 0); while i < slice.len() { decode_step(&mut state, &mut cp, slice[i]); i += 1; if state == ACCEPT { // OK since `decode_step` guarantees that `cp` is a valid Unicode // scalar value in an ACCEPT state. // // We don't have to use safe code here, but do so because perf // isn't our primary objective in regex-lite. let ch = char::from_u32(cp).unwrap(); return (Some(ch), i); } else if state == REJECT { // At this point, we always want to advance at least one byte. return (None, core::cmp::max(1, i.saturating_sub(1))); } } (None, i) } /// Transitions to the next state and updates `cp` while it does. fn decode_step(state: &mut usize, cp: &mut u32, b: u8) { // Splits the space of all bytes into equivalence classes, such that // any byte in the same class can never discriminate between whether a // particular sequence is valid UTF-8 or not. 
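// Concretely: class 0 is ASCII, classes 1, 9 and 7 are the three ranges of
// continuation bytes (0x80-0x8F, 0x90-0x9F, 0xA0-0xBF), class 8 marks bytes
// that can never appear in valid UTF-8, and the remaining classes
// distinguish the lead bytes of 2-, 3- and 4-byte sequences. The bytes
// 0xE0, 0xED, 0xF0 and 0xF4 get classes of their own so that overlong
// forms and surrogate encodings are rejected.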
#[rustfmt::skip] const CLASSES: [u8; 256] = [ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, ]; // A state machine taken from `bstr` which was in turn adapted from: // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/ #[rustfmt::skip] const STATES_FORWARD: &'static [u8] = &[ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72, 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0, 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ]; let class = CLASSES[usize::from(b)]; if *state == ACCEPT { *cp = (0xFF >> class) & (b as u32); } else { *cp = (b as u32 & 0b111111) | (*cp << 6); } *state = usize::from(STATES_FORWARD[*state + usize::from(class)]); } #[cfg(test)] mod tests { use alloc::{vec, vec::Vec}; use super::*; #[test] fn decode_valid() { fn d(mut s: &str) -> Vec<char> { let mut chars = vec![]; while !s.is_empty() { let (ch, size) = decode(s.as_bytes()); s = &s[size..]; chars.push(ch.unwrap()); } chars } assert_eq!(vec!['☃'], d("☃")); assert_eq!(vec!['☃', '☃'], d("☃☃")); assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε")); assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇")); assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲")); } #[test] fn decode_invalid() { let (ch, size) = decode(b""); assert_eq!(None, ch); assert_eq!(0, size); let (ch, size) = decode(b"\xFF"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = decode(b"\xCE\xF0"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = decode(b"\xE2\x98\xF0"); assert_eq!(None, ch); assert_eq!(2, size); let (ch, size) = decode(b"\xF0\x9D\x9D"); assert_eq!(None, ch); assert_eq!(3, size); let (ch, size) = decode(b"\xF0\x9D\x9D\xF0"); assert_eq!(None, ch); assert_eq!(3, size); let (ch, size) = decode(b"\xF0\x82\x82\xAC"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = decode(b"\xED\xA0\x80"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = decode(b"\xCEa"); assert_eq!(None, ch); assert_eq!(1, size); let (ch, size) = decode(b"\xE2\x98a"); assert_eq!(None, ch); assert_eq!(2, size); let (ch, size) = decode(b"\xF0\x9D\x9Ca"); assert_eq!(None, ch); assert_eq!(3, size); } #[test] fn decode_lossily() { let (ch, size) = decode_lossy(b""); assert_eq!('\u{FFFD}', ch); assert_eq!(0, size); let (ch, size) = decode_lossy(b"\xFF"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = decode_lossy(b"\xCE\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = decode_lossy(b"\xE2\x98\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(2, size); let (ch, size) = decode_lossy(b"\xF0\x9D\x9D\xF0"); assert_eq!('\u{FFFD}', ch); assert_eq!(3, size); let (ch, size) = decode_lossy(b"\xF0\x82\x82\xAC"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = decode_lossy(b"\xED\xA0\x80"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = decode_lossy(b"\xCEa"); assert_eq!('\u{FFFD}', ch); assert_eq!(1, size); let (ch, size) = 
decode_lossy(b"\xE2\x98a"); assert_eq!('\u{FFFD}', ch); assert_eq!(2, size); let (ch, size) = decode_lossy(b"\xF0\x9D\x9Ca"); assert_eq!('\u{FFFD}', ch); assert_eq!(3, size); } } ������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������������regex-lite-0.1.6/tests/fuzz/mod.rs������������������������������������������������������������������0000644�0000000�0000000�00000003355�10461020230�0015170�0����������������������������������������������������������������������������������������������������ustar �����������������������������������������������������������������0000000�0000000������������������������������������������������������������������������������������������������������������������������������������������������������������������������#[test] fn captures_wrong_order() { let data = include_bytes!( "testdata/crash-a886ce2b0d64963f1232f9b08b8c9ad4740c26f5" ); let _ = run(data); } #[test] fn captures_wrong_order_min() { let data = include_bytes!( "testdata/minimized-from-298f84f9dbb2589cb9938a63334fa4083b609f34" ); let _ = run(data); } // Simpler regression test from a failure found by OSS-fuzz[1]. This test, // when it failed, caused a stack overflow. We fixed it by adding another nest // check on the Hir value itself, since the Hir type can have depth added to // it without recursive calls in the parser (which is where the existing nest // check was). // // Many thanks to Addison Crump for coming up with this test case[2]. // // [1]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608 // [2]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608#c1 #[test] fn many_zero_to_many_reps() { let pat = format!(".{}", "*".repeat(1 << 15)); let Ok(re) = regex_lite::Regex::new(&pat) else { return }; re.is_match(""); } // This is the fuzz target function. We duplicate it here since this is the // thing we use to interpret the data. It is ultimately what we want to // succeed. fn run(data: &[u8]) -> Option<()> { if data.len() < 2 { return None; } let mut split_at = usize::from(data[0]); let data = std::str::from_utf8(&data[1..]).ok()?; // Split data into a regex and haystack to search. 
let len = usize::try_from(data.chars().count()).ok()?; split_at = std::cmp::max(split_at, 1) % len; let char_index = data.char_indices().nth(split_at)?.0; let (pattern, input) = data.split_at(char_index); let re = regex_lite::Regex::new(pattern).ok()?; re.is_match(input); Some(()) }
regex-lite-0.1.6/tests/fuzz/testdata/crash-a886ce2b0d64963f1232f9b08b8c9ad4740c26f5 [binary fuzz input]
regex-lite-0.1.6/tests/fuzz/testdata/minimized-from-298f84f9dbb2589cb9938a63334fa4083b609f34 [binary fuzz input]
regex-lite-0.1.6/tests/lib.rs
mod fuzz; mod string; const BLACKLIST: &[&str] = &[ // Custom line terminators aren't supported in regex-lite. We could add it, // but it didn't seem worth it. "line-terminator", ]; fn suite() -> anyhow::Result<regex_test::RegexTests> { let mut tests = regex_test::RegexTests::new(); macro_rules!
load { ($name:expr) => {{ const DATA: &[u8] = include_bytes!(concat!("../../testdata/", $name, ".toml")); tests.load_slice($name, DATA)?; }}; } load!("anchored"); load!("bytes"); load!("crazy"); load!("crlf"); load!("earliest"); load!("empty"); load!("expensive"); load!("flags"); load!("iter"); load!("leftmost-all"); load!("line-terminator"); load!("misc"); load!("multiline"); load!("no-unicode"); load!("overlapping"); load!("regression"); load!("set"); load!("substring"); load!("unicode"); load!("utf8"); load!("word-boundary"); load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); // Special tests for regex-lite specifically. load!("regex-lite"); Ok(tests) }
regex-lite-0.1.6/tests/string.rs
use { anyhow::Result, regex_lite::{Regex, RegexBuilder}, regex_test::{ CompiledRegex, Match, RegexTest, Span, TestResult, TestRunner, }, }; /// Tests the default configuration of regex-lite's `Regex`. #[test] fn default() -> Result<()> { let mut runner = TestRunner::new()?; runner .expand(&["is_match", "find", "captures"], |test| test.compiles()) .blacklist_iter(super::BLACKLIST) .test_iter(crate::suite()?.iter(), compiler) .assert(); Ok(()) } fn run_test(re: &Regex, test: &RegexTest) -> TestResult { let hay = match std::str::from_utf8(test.haystack()) { Ok(hay) => hay, Err(err) => { return TestResult::fail(&format!( "haystack is not valid UTF-8: {}", err )); } }; match test.additional_name() { "is_match" => TestResult::matched(re.is_match(hay)), "find" => TestResult::matches( re.find_iter(hay) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: 0, span: Span { start: m.start(), end: m.end() }, }), ), "captures" => { let it = re .captures_iter(hay) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|caps| testify_captures(&caps)); TestResult::captures(it) } name => TestResult::fail(&format!("unrecognized test name: {}", name)), } } /// Converts the given regex test to a closure that searches with a /// `Regex`. If the test configuration is unsupported, then a /// `CompiledRegex` that skips the test is returned. fn compiler( test: &RegexTest, _patterns: &[String], ) -> anyhow::Result<CompiledRegex> { let Some(pattern) = skip_or_get_pattern(test) else { return Ok(CompiledRegex::skip()); }; let re = RegexBuilder::new(pattern) .case_insensitive(test.case_insensitive()) .build()?; Ok(CompiledRegex::compiled(move |test| run_test(&re, test))) } /// Whether we should skip the given test or not. If not, return the single /// pattern from the given test. fn skip_or_get_pattern(test: &RegexTest) -> Option<&str> { // We're only testing Regex here, which supports one pattern only.
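// Every `return None` below causes the harness to skip the test (via
// `CompiledRegex::skip` in `compiler` above) rather than fail it, since
// regex-lite deliberately implements a subset of the full regex crate.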
let pattern = match test.regexes().len() { 1 => &test.regexes()[0], _ => return None, }; // If the test name contains 'regex-lite', then we ALWAYS run it. Because // those tests are specifically designed for regex-lite. So if they fail, // then something needs attention. if test.full_name().contains("regex-lite/") { return Some(pattern); } // If the pattern has a \p in it, then we almost certainly don't support // it. This probably skips more than we intend, but there are likely very // few tests that contain a \p that isn't also a Unicode class. if pattern.contains(r"\p") || pattern.contains(r"\P") { return None; } // Similar deal for Perl classes, but we can abide them if the haystack // is ASCII-only. if !test.haystack().is_ascii() { if pattern.contains(r"\d") || pattern.contains(r"\D") { return None; } if pattern.contains(r"\s") || pattern.contains(r"\S") { return None; } if pattern.contains(r"\w") || pattern.contains(r"\W") { return None; } } // And also same deal for word boundaries. if !test.haystack().is_ascii() { if pattern.contains(r"\b") || pattern.contains(r"\B") { return None; } } // We only test is_match, find_iter and captures_iter. All of those are // leftmost searches. if !matches!(test.search_kind(), regex_test::SearchKind::Leftmost) { return None; } // The top-level single-pattern regex API always uses leftmost-first. if !matches!(test.match_kind(), regex_test::MatchKind::LeftmostFirst) { return None; } // The top-level regex API always runs unanchored searches. ... But we can // handle tests that are anchored but have only one match. if test.anchored() && test.match_limit() != Some(1) { return None; } // We don't support tests with explicit search bounds. We could probably // support this by using the 'find_at' (and such) APIs. let bounds = test.bounds(); if !(bounds.start == 0 && bounds.end == test.haystack().len()) { return None; } // The Regex API specifically does not support disabling UTF-8 mode because // it can only search &str which is always valid UTF-8. if !test.utf8() { return None; } // regex-lite doesn't support Unicode-aware case insensitive matching. if test.case_insensitive() && (!pattern.is_ascii() || !test.haystack().is_ascii()) { return None; } Some(pattern) } /// Convert `Captures` into the test suite's capture values. fn testify_captures(caps: &regex_lite::Captures<'_>) -> regex_test::Captures { let spans = caps.iter().map(|group| { group.map(|m| regex_test::Span { start: m.start(), end: m.end() }) }); // This unwrap is OK because we assume our 'caps' represents a match, and // a match always gives a non-zero number of groups with the first group // being non-None.
regex_test::Captures::new(0, spans).unwrap() }