regex-automata-0.4.9/.cargo_vcs_info.json0000644000000001540000000000100137710ustar { "git": { "sha1": "1a069b9232c607b34c4937122361aa075ef573fa" }, "path_in_vcs": "regex-automata" }regex-automata-0.4.9/Cargo.toml0000644000000072610000000000100117750ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.65" name = "regex-automata" version = "0.4.9" authors = [ "The Rust Project Developers", "Andrew Gallant ", ] build = false autolib = false autobins = false autoexamples = false autotests = false autobenches = false description = "Automata construction and matching using regular expressions." 
documentation = "https://docs.rs/regex-automata" readme = "README.md" keywords = [ "regex", "dfa", "automata", "automaton", "nfa", ] categories = ["text-processing"] license = "MIT OR Apache-2.0" repository = "https://github.com/rust-lang/regex/tree/master/regex-automata" [lib] name = "regex_automata" path = "src/lib.rs" bench = false [[test]] name = "integration" path = "tests/lib.rs" [dependencies.aho-corasick] version = "1.0.0" optional = true default-features = false [dependencies.log] version = "0.4.14" optional = true [dependencies.memchr] version = "2.6.0" optional = true default-features = false [dependencies.regex-syntax] version = "0.8.5" optional = true default-features = false [dev-dependencies.anyhow] version = "1.0.69" [dev-dependencies.bstr] version = "1.3.0" features = ["std"] default-features = false [dev-dependencies.doc-comment] version = "0.3.3" [dev-dependencies.env_logger] version = "0.9.3" features = [ "atty", "humantime", "termcolor", ] default-features = false [dev-dependencies.quickcheck] version = "1.0.3" default-features = false [dev-dependencies.regex-test] version = "0.1.0" [features] alloc = [] default = [ "std", "syntax", "perf", "unicode", "meta", "nfa", "dfa", "hybrid", ] dfa = [ "dfa-build", "dfa-search", "dfa-onepass", ] dfa-build = [ "nfa-thompson", "dfa-search", ] dfa-onepass = ["nfa-thompson"] dfa-search = [] hybrid = [ "alloc", "nfa-thompson", ] internal-instrument = ["internal-instrument-pikevm"] internal-instrument-pikevm = [ "logging", "std", ] logging = [ "dep:log", "aho-corasick?/logging", "memchr?/logging", ] meta = [ "syntax", "nfa-pikevm", ] nfa = [ "nfa-thompson", "nfa-pikevm", "nfa-backtrack", ] nfa-backtrack = ["nfa-thompson"] nfa-pikevm = ["nfa-thompson"] nfa-thompson = ["alloc"] perf = [ "perf-inline", "perf-literal", ] perf-inline = [] perf-literal = [ "perf-literal-substring", "perf-literal-multisubstring", ] perf-literal-multisubstring = [ "std", "dep:aho-corasick", ] perf-literal-substring = [ 
"aho-corasick?/perf-literal", "dep:memchr", ] std = [ "regex-syntax?/std", "memchr?/std", "aho-corasick?/std", "alloc", ] syntax = [ "dep:regex-syntax", "alloc", ] unicode = [ "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", "unicode-word-boundary", "regex-syntax?/unicode", ] unicode-age = ["regex-syntax?/unicode-age"] unicode-bool = ["regex-syntax?/unicode-bool"] unicode-case = ["regex-syntax?/unicode-case"] unicode-gencat = ["regex-syntax?/unicode-gencat"] unicode-perl = ["regex-syntax?/unicode-perl"] unicode-script = ["regex-syntax?/unicode-script"] unicode-segment = ["regex-syntax?/unicode-segment"] unicode-word-boundary = [] regex-automata-0.4.9/Cargo.toml.orig000064400000000000000000000101321046102023000154450ustar 00000000000000[package] name = "regex-automata" version = "0.4.9" #:version authors = ["The Rust Project Developers", "Andrew Gallant "] description = "Automata construction and matching using regular expressions." documentation = "https://docs.rs/regex-automata" repository = "https://github.com/rust-lang/regex/tree/master/regex-automata" readme = "README.md" keywords = ["regex", "dfa", "automata", "automaton", "nfa"] license = "MIT OR Apache-2.0" categories = ["text-processing"] edition = "2021" autoexamples = false rust-version = "1.65" [lib] bench = false # This crate has many many many features. See the crate docs for a description # of each and when you might want to use them. 
[features] default = ["std", "syntax", "perf", "unicode", "meta", "nfa", "dfa", "hybrid"] std = ["regex-syntax?/std", "memchr?/std", "aho-corasick?/std", "alloc"] alloc = [] logging = ["dep:log", "aho-corasick?/logging", "memchr?/logging"] syntax = ["dep:regex-syntax", "alloc"] meta = ["syntax", "nfa-pikevm"] nfa = ["nfa-thompson", "nfa-pikevm", "nfa-backtrack"] nfa-thompson = ["alloc"] nfa-pikevm = ["nfa-thompson"] nfa-backtrack = ["nfa-thompson"] dfa = ["dfa-build", "dfa-search", "dfa-onepass"] dfa-build = ["nfa-thompson", "dfa-search"] dfa-search = [] dfa-onepass = ["nfa-thompson"] hybrid = ["alloc", "nfa-thompson"] perf = ["perf-inline", "perf-literal"] perf-inline = [] perf-literal = ["perf-literal-substring", "perf-literal-multisubstring"] perf-literal-substring = ["aho-corasick?/perf-literal", "dep:memchr"] perf-literal-multisubstring = ["std", "dep:aho-corasick"] # Enables all Unicode features. This expands if new Unicode features are added. unicode = [ "unicode-age", "unicode-bool", "unicode-case", "unicode-gencat", "unicode-perl", "unicode-script", "unicode-segment", "unicode-word-boundary", "regex-syntax?/unicode", ] # Enables use of the `Age` property, e.g., `\p{Age:3.0}`. unicode-age = ["regex-syntax?/unicode-age"] # Enables use of a smattering of boolean properties, e.g., `\p{Emoji}`. unicode-bool = ["regex-syntax?/unicode-bool"] # Enables Unicode-aware case insensitive matching, e.g., `(?i)β`. unicode-case = ["regex-syntax?/unicode-case"] # Enables Unicode general categories, e.g., `\p{Letter}` or `\pL`. unicode-gencat = ["regex-syntax?/unicode-gencat"] # Enables Unicode-aware Perl classes corresponding to `\w`, `\s` and `\d`. unicode-perl = ["regex-syntax?/unicode-perl"] # Enables Unicode scripts and script extensions, e.g., `\p{Greek}`. unicode-script = ["regex-syntax?/unicode-script"] # Enables Unicode segmentation properties, e.g., `\p{gcb=Extend}`. unicode-segment = ["regex-syntax?/unicode-segment"] # Enables Unicode word boundary support. 
If this is enabled with unicode-perl, # then data tables from regex-syntax are used. Otherwise, a new data table # inside regex-automata will be included. unicode-word-boundary = [] # These are strictly internal features that may be removed or changed in # non-compatible ways. internal-instrument = ["internal-instrument-pikevm"] internal-instrument-pikevm = ["logging", "std"] [dependencies] aho-corasick = { version = "1.0.0", optional = true, default-features = false } log = { version = "0.4.14", optional = true } memchr = { version = "2.6.0", optional = true, default-features = false } regex-syntax = { path = "../regex-syntax", version = "0.8.5", optional = true, default-features = false } [dev-dependencies] anyhow = "1.0.69" bstr = { version = "1.3.0", default-features = false, features = ["std"] } doc-comment = "0.3.3" quickcheck = { version = "1.0.3", default-features = false } regex-test = { path = "../regex-test", version = "0.1.0" } [dev-dependencies.env_logger] version = "0.9.3" default-features = false features = ["atty", "humantime", "termcolor"] # We put these tests here because they are written primarily against the # regex-automata API, and in particular use regex-automata features for # conditional compilation. If we moved these up as tests on 'regex' proper, # then we'd need to duplicate regex-automata's complex features on 'regex' too, # which I really do not want to do. [[test]] path = "tests/lib.rs" name = "integration" regex-automata-0.4.9/LICENSE-APACHE000064400000000000000000000251371046102023000145150ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. regex-automata-0.4.9/LICENSE-MIT000064400000000000000000000020571046102023000142210ustar 00000000000000Copyright (c) 2014 The Rust Project Developers Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. regex-automata-0.4.9/README.md000064400000000000000000000123721046102023000140450ustar 00000000000000regex-automata ============== This crate exposes a variety of regex engines used by the `regex` crate. It provides a vast, sprawling and "expert" level API to each regex engine. The regex engines provided by this crate focus heavily on finite automata implementations and specifically guarantee worst case `O(m * n)` time complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.) [![Build status](https://github.com/rust-lang/regex/workflows/ci/badge.svg)](https://github.com/rust-lang/regex/actions) [![Crates.io](https://img.shields.io/crates/v/regex-automata.svg)](https://crates.io/crates/regex-automata) ### Documentation https://docs.rs/regex-automata ### Example This example shows how to search for matches of multiple regexes, where each regex uses the same capture group names to parse different key-value formats. ```rust use regex_automata::{meta::Regex, PatternID}; let re = Regex::new_many(&[ r#"(?m)^(?<key>[[:word:]]+)=(?<val>[[:word:]]+)$"#, r#"(?m)^(?<key>[[:word:]]+)="(?<val>[^"]+)"$"#, r#"(?m)^(?<key>[[:word:]]+)='(?<val>[^']+)'$"#, r#"(?m)^(?<key>[[:word:]]+):\s*(?<val>[[:word:]]+)$"#, ]).unwrap(); let hay = r#" best_album="Blow Your Face Out" best_quote='"then as it was, then again it will be"' best_year=1973 best_simpsons_episode: HOMR "#; let mut kvs = vec![]; for caps in re.captures_iter(hay) { // N.B. One could use capture indices '1' and '2' here // as well. Capture indices are local to each pattern. // (Just like names are.) 
let key = &hay[caps.get_group_by_name("key").unwrap()]; let val = &hay[caps.get_group_by_name("val").unwrap()]; kvs.push((key, val)); } assert_eq!(kvs, vec![ ("best_album", "Blow Your Face Out"), ("best_quote", "\"then as it was, then again it will be\""), ("best_year", "1973"), ("best_simpsons_episode", "HOMR"), ]); ``` ### Safety **I welcome audits of `unsafe` code.** This crate tries to be extremely conservative in its use of `unsafe`, but does use it in a few spots. In general, I am very open to removing uses of `unsafe` if it doesn't result in measurable performance regressions and doesn't result in significantly more complex code. Below is an outline of how `unsafe` is used in this crate. * `util::pool::Pool` makes use of `unsafe` to implement a fast path for accessing an element of the pool. The fast path applies to the first thread that uses the pool. In effect, the fast path is fast because it avoids a mutex lock. `unsafe` is also used in the no-std version of `Pool` to implement a spin lock for synchronization. * `util::lazy::Lazy` uses `unsafe` to implement a variant of `once_cell::sync::Lazy` that works in no-std environments. A no-std no-alloc implementation is also provided that requires use of `unsafe`. * The `dfa` module makes extensive use of `unsafe` to support zero-copy deserialization of DFAs. The high level problem is that you need to get from `&[u8]` to the internal representation of a DFA without doing any copies. This is required for support in no-std no-alloc environments. It also makes deserialization extremely cheap. * The `dfa` and `hybrid` modules use `unsafe` to explicitly elide bounds checks in the core search loops. This makes the codegen tighter and typically leads to consistent 5-10% performance improvements on some workloads. In general, the above reflect the only uses of `unsafe` throughout the entire `regex` crate. At present, there are no plans to meaningfully expand the use of `unsafe`. 
With that said, one thing folks have been asking for is cheap deserialization of a `regex::Regex`. My sense is that this feature will require a lot more `unsafe` in places to support zero-copy deserialization. It is unclear at this point whether this will be pursued. ### Motivation I started out building this crate because I wanted to re-work the `regex` crate internals to make it more amenable to optimizations. It turns out that there are a lot of different ways to build regex engines and even more ways to compose them. Moreover, heuristic literal optimizations are often tricky to get correct, but the fruit they bear is attractive. All of these things were difficult to expand upon without risking the introduction of more bugs. So I decided to tear things down and start fresh. In the course of doing so, I ended up designing strong boundaries between each component so that each component could be reasoned and tested independently. This also made it somewhat natural to expose the components as a library unto itself. Namely, folks have been asking for more capabilities in the regex crate for a long time, but these capabilities usually come with additional API complexity that I didn't want to introduce in the `regex` crate proper. But exposing them in an "expert" level crate like `regex-automata` seemed quite fine. In the end, I do still somewhat consider this crate an experiment. It is unclear whether the strong boundaries between components will be an impediment to ongoing development or not. De-coupling tends to lead to slower development in my experience, and when you mix in the added cost of not introducing breaking changes all of the time, things can get quite complicated. But, I don't think anyone has ever released the internals of a regex engine as a library before. So it will be interesting to see how it plays out! 
regex-automata-0.4.9/src/dfa/accel.rs000064400000000000000000000472201046102023000155240ustar 00000000000000// This module defines some core types for dealing with accelerated DFA states. // Briefly, a DFA state can be "accelerated" if all of its transitions except // for a few loop back to itself. This directly implies that the only way out // of such a state is if a byte corresponding to one of those non-loopback // transitions is found. Such states are often found in simple repetitions in // non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its // DFA with regex-cli: // // $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table // D 000000: // Q 000001: // *000002: // A 000003: \x00-` => 3, a => 8, b-\xFF => 3 // A 000004: \x00-` => 4, a => 7, b-\xFF => 4 // 000005: \x00-` => 4, b-\xFF => 4 // 000006: \x00-` => 3, a => 6, b-\xFF => 3 // 000007: \x00-\xFF => 2, EOI => 2 // 000008: \x00-\xFF => 2, EOI => 2 // // In particular, state 3 is accelerated (shown via the 'A' indicator) since // the only way to leave that state once entered is to see an 'a' byte. If // there is a long run of non-'a' bytes, then using something like 'memchr' // to find the next 'a' byte can be significantly faster than just using the // standard byte-at-a-time state machine. // // Unfortunately, this optimization rarely applies when Unicode is enabled. // For example, patterns like '[^a]' don't actually match any byte that isn't // 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't // 'a'. This makes the state machine much more complex---far beyond a single // state---and removes the ability to easily accelerate it. (Because if the // machine sees a non-UTF-8 sequence, then the machine won't match through it.) // // In practice, we only consider accelerating states that have 3 or fewer // non-loop transitions. At a certain point, you get diminishing returns, but // also because that's what the memchr crate supports. 
The structures below // hard-code this assumption and provide (de)serialization APIs for use inside // a DFA. // // And finally, note that there is some trickery involved in making it very // fast to not only check whether a state is accelerated at search time, but // also to access the bytes to search for to implement the acceleration itself. // dfa/special.rs provides more detail, but the short story is that all // accelerated states appear contiguously in a DFA. This means we can represent // the ID space of all accelerated DFA states with a single range. So given // a state ID, we can determine whether it's accelerated via // // min_accel_id <= id <= max_accel_id // // And find its corresponding accelerator with: // // accels.get((id - min_accel_id) / dfa_stride) #[cfg(feature = "dfa-build")] use alloc::{vec, vec::Vec}; use crate::util::{ int::Pointer, memchr, wire::{self, DeserializeError, Endian, SerializeError}, }; /// The base type used to represent a collection of accelerators. /// /// While an `Accel` is represented as a fixed size array of bytes, a /// *collection* of `Accel`s (called `Accels`) is represented internally as a /// slice of u32. While it's a bit unnatural to do this and costs us a bit of /// fairly low-risk not-safe code, it lets us remove the need for a second type /// parameter in the definition of dense::DFA. (Which really wants everything /// to be a slice of u32.) type AccelTy = u32; /// The size of the unit of representation for accelerators. /// /// ACCEL_CAP *must* be a multiple of this size. const ACCEL_TY_SIZE: usize = core::mem::size_of::(); /// The maximum length in bytes that a single Accel can be. This is distinct /// from the capacity of an accelerator in that the length represents only the /// bytes that should be read. const ACCEL_LEN: usize = 4; /// The capacity of each accelerator, in bytes. 
We set this to 8 since it's a /// multiple of 4 (our ID size) and because it gives us a little wiggle room /// if we want to support more accel bytes in the future without a breaking /// change. /// /// This MUST be a multiple of ACCEL_TY_SIZE. const ACCEL_CAP: usize = 8; /// Search for between 1 and 3 needle bytes in the given haystack, starting the /// search at the given position. If `needles` has a length other than 1-3, /// then this panics. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn find_fwd( needles: &[u8], haystack: &[u8], at: usize, ) -> Option { let bs = needles; let i = match needles.len() { 1 => memchr::memchr(bs[0], &haystack[at..])?, 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?, 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?, 0 => panic!("cannot find with empty needles"), n => panic!("invalid needles length: {}", n), }; Some(at + i) } /// Search for between 1 and 3 needle bytes in the given haystack in reverse, /// starting the search at the given position. If `needles` has a length other /// than 1-3, then this panics. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn find_rev( needles: &[u8], haystack: &[u8], at: usize, ) -> Option { let bs = needles; match needles.len() { 1 => memchr::memrchr(bs[0], &haystack[..at]), 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]), 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]), 0 => panic!("cannot find with empty needles"), n => panic!("invalid needles length: {}", n), } } /// Represents the accelerators for all accelerated states in a dense DFA. /// /// The `A` type parameter represents the type of the underlying bytes. /// Generally, this is either `&[AccelTy]` or `Vec`. #[derive(Clone)] pub(crate) struct Accels { /// A length prefixed slice of contiguous accelerators. See the top comment /// in this module for more details on how we can jump from a DFA's state /// ID to an accelerator in this list. 
/// /// The first 4 bytes always correspond to the number of accelerators /// that follow. accels: A, } #[cfg(feature = "dfa-build")] impl Accels> { /// Create an empty sequence of accelerators for a DFA. pub fn empty() -> Accels> { Accels { accels: vec![0] } } /// Add an accelerator to this sequence. /// /// This adds to the accelerator to the end of the sequence and therefore /// should be done in correspondence with its state in the DFA. /// /// This panics if this results in more accelerators than AccelTy::MAX. pub fn add(&mut self, accel: Accel) { self.accels.extend_from_slice(&accel.as_accel_tys()); let len = self.len(); self.set_len(len + 1); } /// Set the number of accelerators in this sequence, which is encoded in /// the first 4 bytes of the underlying bytes. fn set_len(&mut self, new_len: usize) { // The only way an accelerator gets added is if a state exists for // it, and if a state exists, then its index is guaranteed to be // representable by a AccelTy by virtue of the guarantees provided by // StateID. let new_len = AccelTy::try_from(new_len).unwrap(); self.accels[0] = new_len; } } impl<'a> Accels<&'a [AccelTy]> { /// Deserialize a sequence of accelerators from the given bytes. If there /// was a problem deserializing, then an error is returned. /// /// This is guaranteed to run in constant time. This does not guarantee /// that every accelerator in the returned collection is valid. Thus, /// accessing one may panic, or not-safe code that relies on accelerators /// being correct my result in UB. /// /// Callers may check the validity of every accelerator with the `validate` /// method. pub fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> { let slice_start = slice.as_ptr().as_usize(); let (accel_len, _) = wire::try_read_u32_as_usize(slice, "accelerators length")?; // The accelerator length is part of the accel_tys slice that // we deserialize. This is perhaps a bit idiosyncratic. 
It would // probably be better to split out the length into a real field. let accel_tys_len = wire::add( wire::mul(accel_len, 2, "total number of accelerator accel_tys")?, 1, "total number of accel_tys", )?; let accel_tys_bytes_len = wire::mul( ACCEL_TY_SIZE, accel_tys_len, "total number of bytes in accelerators", )?; wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?; wire::check_alignment::(slice)?; let accel_tys = &slice[..accel_tys_bytes_len]; slice = &slice[accel_tys_bytes_len..]; // SAFETY: We've checked the length and alignment above, and since // slice is just bytes and AccelTy is just a u32, we can safely cast to // a slice of &[AccelTy]. let accels = unsafe { core::slice::from_raw_parts( accel_tys.as_ptr().cast::(), accel_tys_len, ) }; Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start)) } } impl> Accels { /// Return an owned version of the accelerators. #[cfg(feature = "alloc")] pub fn to_owned(&self) -> Accels> { Accels { accels: self.accels.as_ref().to_vec() } } /// Return a borrowed version of the accelerators. pub fn as_ref(&self) -> Accels<&[AccelTy]> { Accels { accels: self.accels.as_ref() } } /// Return the bytes representing the serialization of the accelerators. pub fn as_bytes(&self) -> &[u8] { let accels = self.accels.as_ref(); // SAFETY: This is safe because accels is a just a slice of AccelTy, // and u8 always has a smaller alignment. unsafe { core::slice::from_raw_parts( accels.as_ptr().cast::(), accels.len() * ACCEL_TY_SIZE, ) } } /// Returns the memory usage, in bytes, of these accelerators. /// /// The memory usage is computed based on the number of bytes used to /// represent all of the accelerators. /// /// This does **not** include the stack size used by this value. pub fn memory_usage(&self) -> usize { self.as_bytes().len() } /// Return the bytes to search for corresponding to the accelerator in this /// sequence at index `i`. If no such accelerator exists, then this panics. 
/// /// The significance of the index is that it should be in correspondence /// with the index of the corresponding DFA. That is, accelerated DFA /// states are stored contiguously in the DFA and have an ordering implied /// by their respective state IDs. The state's index in that sequence /// corresponds to the index of its corresponding accelerator. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn needles(&self, i: usize) -> &[u8] { if i >= self.len() { panic!("invalid accelerator index {}", i); } let bytes = self.as_bytes(); let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; let len = usize::from(bytes[offset]); &bytes[offset + 1..offset + 1 + len] } /// Return the total number of accelerators in this sequence. pub fn len(&self) -> usize { // This should never panic since deserialization checks that the // length can fit into a usize. usize::try_from(self.accels.as_ref()[0]).unwrap() } /// Return the accelerator in this sequence at index `i`. If no such /// accelerator exists, then this returns None. /// /// See the docs for `needles` on the significance of the index. fn get(&self, i: usize) -> Option { if i >= self.len() { return None; } let offset = ACCEL_TY_SIZE + i * ACCEL_CAP; let accel = Accel::from_slice(&self.as_bytes()[offset..]) .expect("Accels must contain valid accelerators"); Some(accel) } /// Returns an iterator of accelerators in this sequence. fn iter(&self) -> IterAccels<'_, A> { IterAccels { accels: self, i: 0 } } /// Writes these accelerators to the given byte buffer using the indicated /// endianness. If the given buffer is too small, then an error is /// returned. Upon success, the total number of bytes written is returned. /// The number of bytes written is guaranteed to be a multiple of 8. 
pub fn write_to( &self, dst: &mut [u8], ) -> Result { let nwrite = self.write_to_len(); assert_eq!( nwrite % ACCEL_TY_SIZE, 0, "expected accelerator bytes written to be a multiple of {}", ACCEL_TY_SIZE, ); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small("accelerators")); } // The number of accelerators can never exceed AccelTy::MAX. E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst); // The actual accelerators are just raw bytes and thus their endianness // is irrelevant. So we can copy them as bytes. dst[ACCEL_TY_SIZE..nwrite] .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]); Ok(nwrite) } /// Validates that every accelerator in this collection can be successfully /// deserialized as a valid accelerator. pub fn validate(&self) -> Result<(), DeserializeError> { for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) { let _ = Accel::from_slice(chunk)?; } Ok(()) } /// Returns the total number of bytes written by `write_to`. pub fn write_to_len(&self) -> usize { self.as_bytes().len() } } impl> core::fmt::Debug for Accels { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "Accels(")?; let mut list = f.debug_list(); for a in self.iter() { list.entry(&a); } list.finish()?; write!(f, ")") } } #[derive(Debug)] struct IterAccels<'a, A: AsRef<[AccelTy]>> { accels: &'a Accels, i: usize, } impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> { type Item = Accel; fn next(&mut self) -> Option { let accel = self.accels.get(self.i)?; self.i += 1; Some(accel) } } /// Accel represents a structure for determining how to "accelerate" a DFA /// state. /// /// Namely, it contains zero or more bytes that must be seen in order for the /// DFA to leave the state it is associated with. In practice, the actual range /// is 1 to 3 bytes. /// /// The purpose of acceleration is to identify states whose vast majority /// of transitions are just loops back to the same state. 
For example, /// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state /// (corresponding to `[^a]+`) where all transitions *except* for `a` and /// `b` loop back to itself. Thus, this state can be "accelerated" by simply /// looking for the next occurrence of either `a` or `b` instead of explicitly /// following transitions. (In this case, `b` transitions to the next state /// where as `a` would transition to the dead state.) #[derive(Clone)] pub(crate) struct Accel { /// The first byte is the length. Subsequent bytes are the accelerated /// bytes. /// /// Note that we make every accelerator 8 bytes as a slightly wasteful /// way of making sure alignment is always correct for state ID sizes of /// 1, 2, 4 and 8. This should be okay since accelerated states aren't /// particularly common, especially when Unicode is enabled. bytes: [u8; ACCEL_CAP], } impl Accel { /// Returns an empty accel, where no bytes are accelerated. #[cfg(feature = "dfa-build")] pub fn new() -> Accel { Accel { bytes: [0; ACCEL_CAP] } } /// Returns a verified accelerator derived from the beginning of the given /// slice. /// /// If the slice is not long enough or contains invalid bytes for an /// accelerator, then this returns an error. pub fn from_slice(mut slice: &[u8]) -> Result { slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())]; let bytes = slice .try_into() .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?; Accel::from_bytes(bytes) } /// Returns a verified accelerator derived from raw bytes. /// /// If the given bytes are invalid, then this returns an error. fn from_bytes(bytes: [u8; 4]) -> Result { if usize::from(bytes[0]) >= ACCEL_LEN { return Err(DeserializeError::generic( "accelerator bytes cannot have length more than 3", )); } Ok(Accel::from_bytes_unchecked(bytes)) } /// Returns an accelerator derived from raw bytes. /// /// This does not check whether the given bytes are valid. 
Invalid bytes /// cannot sacrifice memory safety, but may result in panics or silent /// logic bugs. fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel { Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] } } /// Attempts to add the given byte to this accelerator. If the accelerator /// is already full or thinks the byte is a poor accelerator, then this /// returns false. Otherwise, returns true. /// /// If the given byte is already in this accelerator, then it panics. #[cfg(feature = "dfa-build")] pub fn add(&mut self, byte: u8) -> bool { if self.len() >= 3 { return false; } // As a special case, we totally reject trying to accelerate a state // with an ASCII space. In most cases, it occurs very frequently, and // tends to result in worse overall performance. if byte == b' ' { return false; } assert!( !self.contains(byte), "accelerator already contains {:?}", crate::util::escape::DebugByte(byte) ); self.bytes[self.len() + 1] = byte; self.bytes[0] += 1; true } /// Return the number of bytes in this accelerator. pub fn len(&self) -> usize { usize::from(self.bytes[0]) } /// Returns true if and only if there are no bytes in this accelerator. #[cfg(feature = "dfa-build")] pub fn is_empty(&self) -> bool { self.len() == 0 } /// Returns the slice of bytes to accelerate. /// /// If this accelerator is empty, then this returns an empty slice. fn needles(&self) -> &[u8] { &self.bytes[1..1 + self.len()] } /// Returns true if and only if this accelerator will accelerate the given /// byte. #[cfg(feature = "dfa-build")] fn contains(&self, byte: u8) -> bool { self.needles().iter().position(|&b| b == byte).is_some() } /// Returns the accelerator bytes as an array of AccelTys. #[cfg(feature = "dfa-build")] fn as_accel_tys(&self) -> [AccelTy; 2] { assert_eq!(ACCEL_CAP, 8); // These unwraps are OK since ACCEL_CAP is set to 8. 
let first = AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap()); let second = AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap()); [first, second] } } impl core::fmt::Debug for Accel { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "Accel(")?; let mut set = f.debug_set(); for &b in self.needles() { set.entry(&crate::util::escape::DebugByte(b)); } set.finish()?; write!(f, ")") } } regex-automata-0.4.9/src/dfa/automaton.rs000064400000000000000000002745211046102023000164720ustar 00000000000000#[cfg(feature = "alloc")] use crate::util::search::PatternSet; use crate::{ dfa::search, util::{ empty, prefilter::Prefilter, primitives::{PatternID, StateID}, search::{Anchored, HalfMatch, Input, MatchError}, start, }, }; /// A trait describing the interface of a deterministic finite automaton (DFA). /// /// The complexity of this trait probably means that it's unlikely for others /// to implement it. The primary purpose of the trait is to provide for a way /// of abstracting over different types of DFAs. In this crate, that means /// dense DFAs and sparse DFAs. (Dense DFAs are fast but memory hungry, where /// as sparse DFAs are slower but come with a smaller memory footprint. But /// they otherwise provide exactly equivalent expressive power.) For example, a /// [`dfa::regex::Regex`](crate::dfa::regex::Regex) is generic over this trait. /// /// Normally, a DFA's execution model is very simple. You might have a single /// start state, zero or more final or "match" states and a function that /// transitions from one state to the next given the next byte of input. /// Unfortunately, the interface described by this trait is significantly /// more complicated than this. The complexity has a number of different /// reasons, mostly motivated by performance, functionality or space savings: /// /// * A DFA can search for multiple patterns simultaneously. This /// means extra information is returned when a match occurs. 
Namely, /// a match is not just an offset, but an offset plus a pattern ID. /// [`Automaton::pattern_len`] returns the number of patterns compiled into /// the DFA, [`Automaton::match_len`] returns the total number of patterns /// that match in a particular state and [`Automaton::match_pattern`] permits /// iterating over the patterns that match in a particular state. /// * A DFA can have multiple start states, and the choice of which start /// state to use depends on the content of the string being searched and /// position of the search, as well as whether the search is an anchored /// search for a specific pattern in the DFA. Moreover, computing the start /// state also depends on whether you're doing a forward or a reverse search. /// [`Automaton::start_state_forward`] and [`Automaton::start_state_reverse`] /// are used to compute the start state for forward and reverse searches, /// respectively. /// * All matches are delayed by one byte to support things like `$` and `\b` /// at the end of a pattern. Therefore, every use of a DFA is required to use /// [`Automaton::next_eoi_state`] /// at the end of the search to compute the final transition. /// * For optimization reasons, some states are treated specially. Every /// state is either special or not, which can be determined via the /// [`Automaton::is_special_state`] method. If it's special, then the state /// must be at least one of a few possible types of states. (Note that some /// types can overlap, for example, a match state can also be an accel state. /// But some types can't. If a state is a dead state, then it can never be any /// other type of state.) Those types are: /// * A dead state. A dead state means the DFA will never enter a match /// state. This can be queried via the [`Automaton::is_dead_state`] method. /// * A quit state. A quit state occurs if the DFA had to stop the search /// prematurely for some reason. This can be queried via the /// [`Automaton::is_quit_state`] method. 
/// * A match state. A match state occurs when a match is found. When a DFA /// enters a match state, the search may stop immediately (when looking /// for the earliest match), or it may continue to find the leftmost-first /// match. This can be queried via the [`Automaton::is_match_state`] /// method. /// * A start state. A start state is where a search begins. For every /// search, there is exactly one start state that is used, however, a /// DFA may contain many start states. When the search is in a start /// state, it may use a prefilter to quickly skip to candidate matches /// without executing the DFA on every byte. This can be queried via the /// [`Automaton::is_start_state`] method. /// * An accel state. An accel state is a state that is accelerated. /// That is, it is a state where _most_ of its transitions loop back to /// itself and only a small number of transitions lead to other states. /// This kind of state is said to be accelerated because a search routine /// can quickly look for the bytes leading out of the state instead of /// continuing to execute the DFA on each byte. This can be queried via the /// [`Automaton::is_accel_state`] method. And the bytes that lead out of /// the state can be queried via the [`Automaton::accelerator`] method. /// /// There are a number of provided methods on this trait that implement /// efficient searching (for forwards and backwards) with a DFA using /// all of the above features of this trait. In particular, given the /// complexity of all these features, implementing a search routine in /// this trait can be a little subtle. With that said, it is possible to /// somewhat simplify the search routine. For example, handling accelerated /// states is strictly optional, since it is always correct to assume that /// `Automaton::is_accel_state` returns false. However, one complex part of /// writing a search routine using this trait is handling the 1-byte delay of a /// match. That is not optional. 
/// /// # Safety /// /// This trait is not safe to implement so that code may rely on the /// correctness of implementations of this trait to avoid undefined behavior. /// The primary correctness guarantees are: /// /// * `Automaton::start_state` always returns a valid state ID or an error or /// panics. /// * `Automaton::next_state`, when given a valid state ID, always returns /// a valid state ID for all values of `anchored` and `byte`, or otherwise /// panics. /// /// In general, the rest of the methods on `Automaton` need to uphold their /// contracts as well. For example, `Automaton::is_dead` should only returns /// true if the given state ID is actually a dead state. pub unsafe trait Automaton { /// Transitions from the current state to the next state, given the next /// byte of input. /// /// Implementations must guarantee that the returned ID is always a valid /// ID when `current` refers to a valid ID. Moreover, the transition /// function must be defined for all possible values of `input`. /// /// # Panics /// /// If the given ID does not refer to a valid state, then this routine /// may panic but it also may not panic and instead return an invalid ID. /// However, if the caller provides an invalid ID then this must never /// sacrifice memory safety. /// /// # Example /// /// This shows a simplistic example for walking a DFA for a given haystack /// by using the `next_state` method. /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, Input}; /// /// let dfa = dense::DFA::new(r"[a-z]+r")?; /// let haystack = "bar".as_bytes(); /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// state = dfa.next_state(state, b); /// } /// // Matches are always delayed by 1 byte, so we must explicitly walk the /// // special "EOI" transition at the end of the search. 
/// state = dfa.next_eoi_state(state); /// assert!(dfa.is_match_state(state)); /// /// # Ok::<(), Box>(()) /// ``` fn next_state(&self, current: StateID, input: u8) -> StateID; /// Transitions from the current state to the next state, given the next /// byte of input. /// /// Unlike [`Automaton::next_state`], implementations may implement this /// more efficiently by assuming that the `current` state ID is valid. /// Typically, this manifests by eliding bounds checks. /// /// # Safety /// /// Callers of this method must guarantee that `current` refers to a valid /// state ID. If `current` is not a valid state ID for this automaton, then /// calling this routine may result in undefined behavior. /// /// If `current` is valid, then implementations must guarantee that the ID /// returned is valid for all possible values of `input`. unsafe fn next_state_unchecked( &self, current: StateID, input: u8, ) -> StateID; /// Transitions from the current state to the next state for the special /// EOI symbol. /// /// Implementations must guarantee that the returned ID is always a valid /// ID when `current` refers to a valid ID. /// /// This routine must be called at the end of every search in a correct /// implementation of search. Namely, DFAs in this crate delay matches /// by one byte in order to support look-around operators. Thus, after /// reaching the end of a haystack, a search implementation must follow one /// last EOI transition. /// /// It is best to think of EOI as an additional symbol in the alphabet of /// a DFA that is distinct from every other symbol. That is, the alphabet /// of DFAs in this crate has a logical size of 257 instead of 256, where /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the /// physical alphabet size may be smaller because of alphabet compression /// via equivalence classes, but EOI is always represented somehow in the /// alphabet.) 
/// /// # Panics /// /// If the given ID does not refer to a valid state, then this routine /// may panic but it also may not panic and instead return an invalid ID. /// However, if the caller provides an invalid ID then this must never /// sacrifice memory safety. /// /// # Example /// /// This shows a simplistic example for walking a DFA for a given haystack, /// and then finishing the search with the final EOI transition. /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, Input}; /// /// let dfa = dense::DFA::new(r"[a-z]+r")?; /// let haystack = "bar".as_bytes(); /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// // /// // The unwrap is OK because we aren't requesting a start state for a /// // specific pattern. /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// state = dfa.next_state(state, b); /// } /// // Matches are always delayed by 1 byte, so we must explicitly walk /// // the special "EOI" transition at the end of the search. Without this /// // final transition, the assert below will fail since the DFA will not /// // have entered a match state yet! /// state = dfa.next_eoi_state(state); /// assert!(dfa.is_match_state(state)); /// /// # Ok::<(), Box>(()) /// ``` fn next_eoi_state(&self, current: StateID) -> StateID; /// Return the ID of the start state for this DFA for the given starting /// configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: /// /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. /// * Whether a "look-behind" byte exists. For example, the `^` anchor /// matches if and only if there is no look-behind byte. /// * The specific value of that look-behind byte. 
For example, a `(?m:^)` /// assertion only matches when there is either no look-behind byte, or /// when the look-behind byte is a line terminator. /// /// The [starting configuration](start::Config) provides the above /// information. /// /// This routine can be used for either forward or reverse searches. /// Although, as a convenience, if you have an [`Input`], then it may /// be more succinct to use [`Automaton::start_state_forward`] or /// [`Automaton::start_state_reverse`]. Note, for example, that the /// convenience routines return a [`MatchError`] on failure where as this /// routine returns a [`StartError`]. /// /// # Errors /// /// This may return a [`StartError`] if the search needs to give up when /// determining the start state (for example, if it sees a "quit" byte). /// This can also return an error if the given configuration contains an /// unsupported [`Anchored`] configuration. fn start_state( &self, config: &start::Config, ) -> Result; /// Return the ID of the start state for this DFA when executing a forward /// search. /// /// This is a convenience routine for calling [`Automaton::start_state`] /// that converts the given [`Input`] to a [start /// configuration](start::Config). Additionally, if an error occurs, it is /// converted from a [`StartError`] to a [`MatchError`] using the offset /// information in the given [`Input`]. /// /// # Errors /// /// This may return a [`MatchError`] if the search needs to give up /// when determining the start state (for example, if it sees a "quit" /// byte). This can also return an error if the given `Input` contains an /// unsupported [`Anchored`] configuration. 
fn start_state_forward( &self, input: &Input<'_>, ) -> Result { let config = start::Config::from_input_forward(input); self.start_state(&config).map_err(|err| match err { StartError::Quit { byte } => { let offset = input .start() .checked_sub(1) .expect("no quit in start without look-behind"); MatchError::quit(byte, offset) } StartError::UnsupportedAnchored { mode } => { MatchError::unsupported_anchored(mode) } }) } /// Return the ID of the start state for this DFA when executing a reverse /// search. /// /// This is a convenience routine for calling [`Automaton::start_state`] /// that converts the given [`Input`] to a [start /// configuration](start::Config). Additionally, if an error occurs, it is /// converted from a [`StartError`] to a [`MatchError`] using the offset /// information in the given [`Input`]. /// /// # Errors /// /// This may return a [`MatchError`] if the search needs to give up /// when determining the start state (for example, if it sees a "quit" /// byte). This can also return an error if the given `Input` contains an /// unsupported [`Anchored`] configuration. fn start_state_reverse( &self, input: &Input<'_>, ) -> Result { let config = start::Config::from_input_reverse(input); self.start_state(&config).map_err(|err| match err { StartError::Quit { byte } => { let offset = input.end(); MatchError::quit(byte, offset) } StartError::UnsupportedAnchored { mode } => { MatchError::unsupported_anchored(mode) } }) } /// If this DFA has a universal starting state for the given anchor mode /// and the DFA supports universal starting states, then this returns that /// state's identifier. /// /// A DFA is said to have a universal starting state when the starting /// state is invariant with respect to the haystack. Usually, the starting /// state is chosen depending on the bytes immediately surrounding the /// starting position of a search. 
However, the starting state only differs /// when one or more of the patterns in the DFA have look-around assertions /// in its prefix. /// /// Stated differently, if none of the patterns in a DFA have look-around /// assertions in their prefix, then the DFA has a universal starting state /// and _may_ be returned by this method. /// /// It is always correct for implementations to return `None`, and indeed, /// this is what the default implementation does. When this returns `None`, /// callers must use either `start_state_forward` or `start_state_reverse` /// to get the starting state. /// /// # Use case /// /// There are a few reasons why one might want to use this: /// /// * If you know your regex patterns have no look-around assertions in /// their prefix, then calling this routine is likely cheaper and perhaps /// more semantically meaningful. /// * When implementing prefilter support in a DFA regex implementation, /// it is necessary to re-compute the start state after a candidate /// is returned from the prefilter. However, this is only needed when /// there isn't a universal start state. When one exists, one can avoid /// re-computing the start state. /// /// # Example /// /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense::DFA}, /// Anchored, /// }; /// /// // There are no look-around assertions in the prefixes of any of the /// // patterns, so we get a universal start state. /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+$", "[A-Z]+"])?; /// assert!(dfa.universal_start_state(Anchored::No).is_some()); /// assert!(dfa.universal_start_state(Anchored::Yes).is_some()); /// /// // One of the patterns has a look-around assertion in its prefix, /// // so this means there is no longer a universal start state. 
/// let dfa = DFA::new_many(&["[0-9]+", "^[a-z]+$", "[A-Z]+"])?;
/// assert!(!dfa.universal_start_state(Anchored::No).is_some());
/// assert!(!dfa.universal_start_state(Anchored::Yes).is_some());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
fn universal_start_state(&self, _mode: Anchored) -> Option<StateID> {
    None
}

/// Returns true if and only if the given identifier corresponds to a
/// "special" state. A special state is one or more of the following:
/// a dead state, a quit state, a match state, a start state or an
/// accelerated state.
///
/// A correct implementation _may_ always return false for states that
/// are either start states or accelerated states, since that information
/// is only intended to be used for optimization purposes. Correct
/// implementations must return true if the state is a dead, quit or match
/// state. This is because search routines using this trait must be able
/// to rely on `is_special_state` as an indicator that a state may need
/// special treatment. (For example, when a search routine sees a dead
/// state, it must terminate.)
///
/// This routine permits search implementations to use a single branch to
/// check whether a state needs special attention before executing the next
/// transition. The example below shows how to do this.
///
/// # Example
///
/// This example shows how `is_special_state` can be used to implement a
/// correct search routine with minimal branching. In particular, this
/// search routine implements "leftmost" matching, which means that it
/// doesn't immediately stop once a match is found. Instead, it continues
/// until it reaches a dead state.
///
/// ```
/// use regex_automata::{
///     dfa::{Automaton, dense},
///     HalfMatch, MatchError, Input,
/// };
///
/// fn find<A: Automaton>(
///     dfa: &A,
///     haystack: &[u8],
/// ) -> Result<Option<HalfMatch>, MatchError> {
///     // The start state is determined by inspecting the position and the
///     // initial bytes of the haystack.
Note that start states can never /// // be match states (since DFAs in this crate delay matches by 1 /// // byte), so we don't need to check if the start state is a match. /// let mut state = dfa.start_state_forward(&Input::new(haystack))?; /// let mut last_match = None; /// // Walk all the bytes in the haystack. We can quit early if we see /// // a dead or a quit state. The former means the automaton will /// // never transition to any other state. The latter means that the /// // automaton entered a condition in which its search failed. /// for (i, &b) in haystack.iter().enumerate() { /// state = dfa.next_state(state, b); /// if dfa.is_special_state(state) { /// if dfa.is_match_state(state) { /// last_match = Some(HalfMatch::new( /// dfa.match_pattern(state, 0), /// i, /// )); /// } else if dfa.is_dead_state(state) { /// return Ok(last_match); /// } else if dfa.is_quit_state(state) { /// // It is possible to enter into a quit state after /// // observing a match has occurred. In that case, we /// // should return the match instead of an error. /// if last_match.is_some() { /// return Ok(last_match); /// } /// return Err(MatchError::quit(b, i)); /// } /// // Implementors may also want to check for start or accel /// // states and handle them differently for performance /// // reasons. But it is not necessary for correctness. /// } /// } /// // Matches are always delayed by 1 byte, so we must explicitly walk /// // the special "EOI" transition at the end of the search. /// state = dfa.next_eoi_state(state); /// if dfa.is_match_state(state) { /// last_match = Some(HalfMatch::new( /// dfa.match_pattern(state, 0), /// haystack.len(), /// )); /// } /// Ok(last_match) /// } /// /// // We use a greedy '+' operator to show how the search doesn't just /// // stop once a match is detected. It continues extending the match. /// // Using '[a-z]+?' would also work as expected and stop the search /// // early. Greediness is built into the automaton. 
/// let dfa = dense::DFA::new(r"[a-z]+")?; /// let haystack = "123 foobar 4567".as_bytes(); /// let mat = find(&dfa, haystack)?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 0); /// assert_eq!(mat.offset(), 10); /// /// // Here's another example that tests our handling of the special EOI /// // transition. This will fail to find a match if we don't call /// // 'next_eoi_state' at the end of the search since the match isn't /// // found until the final byte in the haystack. /// let dfa = dense::DFA::new(r"[0-9]{4}")?; /// let haystack = "123 foobar 4567".as_bytes(); /// let mat = find(&dfa, haystack)?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 0); /// assert_eq!(mat.offset(), 15); /// /// // And note that our search implementation above automatically works /// // with multi-DFAs. Namely, `dfa.match_pattern(match_state, 0)` selects /// // the appropriate pattern ID for us. /// let dfa = dense::DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?; /// let haystack = "123 foobar 4567".as_bytes(); /// let mat = find(&dfa, haystack)?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 1); /// assert_eq!(mat.offset(), 3); /// let mat = find(&dfa, &haystack[3..])?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 0); /// assert_eq!(mat.offset(), 7); /// let mat = find(&dfa, &haystack[10..])?.unwrap(); /// assert_eq!(mat.pattern().as_usize(), 1); /// assert_eq!(mat.offset(), 5); /// /// # Ok::<(), Box>(()) /// ``` fn is_special_state(&self, id: StateID) -> bool; /// Returns true if and only if the given identifier corresponds to a dead /// state. When a DFA enters a dead state, it is impossible to leave. That /// is, every transition on a dead state by definition leads back to the /// same dead state. /// /// In practice, the dead state always corresponds to the identifier `0`. /// Moreover, in practice, there is only one dead state. 
///
/// The existence of a dead state is not strictly required in the classical
/// model of finite state machines, where one generally only cares about
/// the question of whether an input sequence matches or not. Dead states
/// are not needed to answer that question, since one can immediately quit
/// as soon as one enters a final or "match" state. However, we don't just
/// care about matches but also care about the location of matches, and
/// more specifically, care about semantics like "greedy" matching.
///
/// For example, given the pattern `a+` and the input `aaaz`, the dead
/// state won't be entered until the state machine reaches `z` in the
/// input, at which point, the search routine can quit. But without the
/// dead state, the search routine wouldn't know when to quit. In a
/// classical representation, the search routine would stop after seeing
/// the first `a` (which is when the search would enter a match state). But
/// this wouldn't implement "greedy" matching where `a+` matches as many
/// `a`'s as possible.
///
/// # Example
///
/// See the example for [`Automaton::is_special_state`] for how to use this
/// method correctly.
fn is_dead_state(&self, id: StateID) -> bool;

/// Returns true if and only if the given identifier corresponds to a quit
/// state. A quit state is like a dead state (it has no transitions other
/// than to itself), except it indicates that the DFA failed to complete
/// the search. When this occurs, callers can neither accept nor reject that
/// a match occurred.
///
/// In practice, the quit state always corresponds to the state immediately
/// following the dead state. (Which is not usually represented by `1`,
/// since state identifiers are pre-multiplied by the state machine's
/// alphabet stride, and the alphabet stride varies between DFAs.)
///
/// The typical way in which a quit state can occur is when heuristic
/// support for Unicode word boundaries is enabled via the
/// [`dense::Config::unicode_word_boundary`](crate::dfa::dense::Config::unicode_word_boundary)
/// option. But other options, like the lower-level
/// [`dense::Config::quit`](crate::dfa::dense::Config::quit)
/// configuration, can also result in a quit state being entered. The
/// purpose of the quit state is to provide a way to execute a fast DFA
/// in common cases while delegating to slower routines when the DFA quits.
///
/// The default search implementations provided by this crate will return a
/// [`MatchError::quit`] error when a quit state is entered.
///
/// # Example
///
/// See the example for [`Automaton::is_special_state`] for how to use this
/// method correctly.
fn is_quit_state(&self, id: StateID) -> bool;

/// Returns true if and only if the given identifier corresponds to a
/// match state. A match state is also referred to as a "final" state and
/// indicates that a match has been found.
///
/// If all you care about is whether a particular pattern matches in the
/// input sequence, then a search routine can quit early as soon as the
/// machine enters a match state. However, if you're looking for the
/// standard "leftmost-first" match location, then search _must_ continue
/// until either the end of the input or until the machine enters a dead
/// state. (Since either condition implies that no other useful work can
/// be done.) Namely, when looking for the location of a match, then
/// search implementations should record the most recent location in
/// which a match state was entered, but otherwise continue executing the
/// search as normal. (The search may even leave the match state.) Once
/// the termination condition is reached, the most recently recorded match
/// location should be returned.
///
/// Finally, one additional power given to match states in this crate
/// is that they are always associated with a specific pattern in order
/// to support multi-DFAs. See [`Automaton::match_pattern`] for more
/// details and an example for how to query the pattern associated with a
/// particular match state.
///
/// # Example
///
/// See the example for [`Automaton::is_special_state`] for how to use this
/// method correctly.
fn is_match_state(&self, id: StateID) -> bool;

/// Returns true only if the given identifier corresponds to a start
/// state.
///
/// A start state is a state in which a DFA begins a search.
/// All searches begin in a start state. Moreover, since all matches are
/// delayed by one byte, a start state can never be a match state.
///
/// The main role of a start state is, as mentioned, to be a starting
/// point for a DFA. This starting point is determined via one of
/// [`Automaton::start_state_forward`] or
/// [`Automaton::start_state_reverse`], depending on whether one is doing
/// a forward or a reverse search, respectively.
///
/// A secondary use of start states is for prefix acceleration. Namely,
/// while executing a search, if one detects that you're in a start state,
/// then it may be faster to look for the next match of a prefix of the
/// pattern, if one exists. If a prefix exists and since all matches must
/// begin with that prefix, then skipping ahead to occurrences of that
/// prefix may be much faster than executing the DFA.
///
/// As mentioned in the documentation for
/// [`is_special_state`](Automaton::is_special_state), implementations
/// _may_ always return false, even if the given identifier is a start
/// state. This is because knowing whether a state is a start state or not
/// is not necessary for correctness and is only treated as a potential
/// performance optimization. (For example, the implementations of this
/// trait in this crate will only return true when the given identifier
/// corresponds to a start state and when [specialization of start
/// states](crate::dfa::dense::Config::specialize_start_states) was enabled
/// during DFA construction. If start state specialization is disabled
/// (which is the default), then this method will always return false.)
///
/// # Example
///
/// This example shows how to implement your own search routine that does
/// a prefix search whenever the search enters a start state.
///
/// Note that you do not need to implement your own search routine
/// to make use of prefilters like this. The search routines
/// provided by this crate already implement prefilter support via
/// the [`Prefilter`](crate::util::prefilter::Prefilter) trait.
/// A prefilter can be added to your search configuration with
/// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter) for
/// dense and sparse DFAs in this crate.
///
/// This example is meant to show how you might deal with prefilters in a
/// simplified case if you are implementing your own search routine.
///
/// ```
/// use regex_automata::{
///     dfa::{Automaton, dense},
///     HalfMatch, MatchError, Input,
/// };
///
/// fn find_byte(slice: &[u8], at: usize, byte: u8) -> Option<usize> {
///     // Would be faster to use the memchr crate, but this is still
///     // faster than running through the DFA.
///     slice[at..].iter().position(|&b| b == byte).map(|i| at + i)
/// }
///
/// fn find<A: Automaton>(
///     dfa: &A,
///     haystack: &[u8],
///     prefix_byte: Option<u8>,
/// ) -> Result<Option<HalfMatch>, MatchError> {
///     // See the Automaton::is_special_state example for similar code
///     // with more comments.
///
///     let mut state = dfa.start_state_forward(&Input::new(haystack))?;
///     let mut last_match = None;
///     let mut pos = 0;
///     while pos < haystack.len() {
///         let b = haystack[pos];
///         state = dfa.next_state(state, b);
///         pos += 1;
///         if dfa.is_special_state(state) {
///             if dfa.is_match_state(state) {
///                 last_match = Some(HalfMatch::new(
///                     dfa.match_pattern(state, 0),
///                     pos - 1,
///                 ));
///             } else if dfa.is_dead_state(state) {
///                 return Ok(last_match);
///             } else if dfa.is_quit_state(state) {
///                 // It is possible to enter into a quit state after
///                 // observing a match has occurred. In that case, we
///                 // should return the match instead of an error.
///                 if last_match.is_some() {
///                     return Ok(last_match);
///                 }
///                 return Err(MatchError::quit(b, pos - 1));
///             } else if dfa.is_start_state(state) {
///                 // If we're in a start state and know all matches begin
///                 // with a particular byte, then we can quickly skip to
///                 // candidate matches without running the DFA through
///                 // every byte in between.
///                 if let Some(prefix_byte) = prefix_byte {
///                     pos = match find_byte(haystack, pos, prefix_byte) {
///                         Some(pos) => pos,
///                         None => break,
///                     };
///                 }
///             }
///         }
///     }
///     // Matches are always delayed by 1 byte, so we must explicitly walk
///     // the special "EOI" transition at the end of the search.
///     state = dfa.next_eoi_state(state);
///     if dfa.is_match_state(state) {
///         last_match = Some(HalfMatch::new(
///             dfa.match_pattern(state, 0),
///             haystack.len(),
///         ));
///     }
///     Ok(last_match)
/// }
///
/// // In this example, it's obvious that all occurrences of our pattern
/// // begin with 'Z', so we pass in 'Z'. Note also that we need to
/// // enable start state specialization, or else it won't be possible to
/// // detect start states during a search. ('is_start_state' would always
/// // return false.)
/// let dfa = dense::DFA::builder()
///     .configure(dense::DFA::config().specialize_start_states(true))
///     .build(r"Z[a-z]+")?;
/// let haystack = "123 foobar Zbaz quux".as_bytes();
/// let mat = find(&dfa, haystack, Some(b'Z'))?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
/// // But note that we don't need to pass in a prefix byte. If we don't,
/// // then the search routine does no acceleration.
/// let mat = find(&dfa, haystack, None)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
/// // However, if we pass an incorrect byte, then the prefix search will
/// // result in incorrect results.
/// assert_eq!(find(&dfa, haystack, Some(b'X'))?, None);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
fn is_start_state(&self, id: StateID) -> bool;

/// Returns true if and only if the given identifier corresponds to an
/// accelerated state.
///
/// An accelerated state is a special optimization
/// trick implemented by this crate. Namely, if
/// [`dense::Config::accelerate`](crate::dfa::dense::Config::accelerate) is
/// enabled (and it is by default), then DFAs generated by this crate will
/// tag states meeting certain characteristics as accelerated. States meet
/// these criteria whenever most of their transitions are self-transitions.
/// That is, transitions that loop back to the same state. When a small
/// number of transitions aren't self-transitions, then it follows that
/// there are only a small number of bytes that can cause the DFA to leave
/// that state. Thus, there is an opportunity to look for those bytes
/// using more optimized routines rather than continuing to run through
/// the DFA. This trick is similar to the prefilter idea described in
/// the documentation of [`Automaton::is_start_state`] with two main
/// differences:
///
/// 1. It is more limited since acceleration only applies to single bytes.
/// This means states are rarely accelerated when Unicode mode is enabled
/// (which is enabled by default).
/// 2. It can occur anywhere in the DFA, which increases optimization
/// opportunities.
///
/// Like the prefilter idea, the main downside (and a possible reason to
/// disable it) is that it can lead to worse performance in some cases.
/// Namely, if a state is accelerated for very common bytes, then the
/// overhead of checking for acceleration and using the more optimized
/// routines to look for those bytes can cause overall performance to be
/// worse than if acceleration wasn't enabled at all.
///
/// A simple example of a regex that has an accelerated state is
/// `(?-u)[^a]+a`. Namely, the `[^a]+` sub-expression gets compiled down
/// into a single state where all transitions except for `a` loop back to
/// itself, and where `a` is the only transition (other than the special
/// EOI transition) that goes to some other state. Thus, this state can
/// be accelerated and implemented more efficiently by calling an
/// optimized routine like `memchr` with `a` as the needle. Notice that
/// the `(?-u)` to disable Unicode is necessary here, as without it,
/// `[^a]` will match any UTF-8 encoding of any Unicode scalar value other
/// than `a`. This more complicated expression compiles down to many DFA
/// states and the simple acceleration optimization is no longer available.
///
/// Typically, this routine is used to guard calls to
/// [`Automaton::accelerator`], which returns the accelerated bytes for
/// the specified state.
fn is_accel_state(&self, id: StateID) -> bool;

/// Returns the total number of patterns compiled into this DFA.
///
/// In the case of a DFA that contains no patterns, this must return `0`.
///
/// # Example
///
/// This example shows the pattern length for a DFA that never matches:
///
/// ```
/// use regex_automata::dfa::{Automaton, dense::DFA};
///
/// let dfa: DFA<Vec<u32>> = DFA::never_match()?;
/// assert_eq!(dfa.pattern_len(), 0);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// And another example for a DFA that matches at every position:
///
/// ```
/// use regex_automata::dfa::{Automaton, dense::DFA};
///
/// let dfa: DFA<Vec<u32>> = DFA::always_match()?;
/// assert_eq!(dfa.pattern_len(), 1);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// And finally, a DFA that was constructed from multiple patterns:
///
/// ```
/// use regex_automata::dfa::{Automaton, dense::DFA};
///
/// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
/// assert_eq!(dfa.pattern_len(), 3);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
fn pattern_len(&self) -> usize;

/// Returns the total number of patterns that match in this state.
///
/// If the given state is not a match state, then implementations may
/// panic.
///
/// If the DFA was compiled with one pattern, then this must necessarily
/// always return `1` for all match states.
///
/// Implementations must guarantee that [`Automaton::match_pattern`] can be
/// called with indices up to (but not including) the length returned by
/// this routine without panicking.
///
/// # Panics
///
/// Implementations are permitted to panic if the provided state ID does
/// not correspond to a match state.
///
/// # Example
///
/// This example shows a simple instance of implementing overlapping
/// matches. In particular, it shows not only how to determine how many
/// patterns have matched in a particular state, but also how to access
/// which specific patterns have matched.
///
/// Notice that we must use
/// [`MatchKind::All`](crate::MatchKind::All)
/// when building the DFA. If we used
/// [`MatchKind::LeftmostFirst`](crate::MatchKind::LeftmostFirst)
/// instead, then the DFA would not be constructed in a way that
/// supports overlapping matches. (It would only report a single pattern
/// that matches at any particular point in time.)
///
/// Another thing to take note of is the patterns used and the order in
/// which the pattern IDs are reported. In the example below, pattern `3`
/// is yielded first. Why? Because it corresponds to the match that
/// appears first. Namely, the `@` symbol is part of `\S+` but not part
/// of any of the other patterns. Since the `\S+` pattern has a match that
/// starts to the left of any other pattern, its ID is returned before any
/// other.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::{Automaton, dense}, Input, MatchKind};
///
/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new().match_kind(MatchKind::All))
///     .build_many(&[
///         r"[[:word:]]+", r"[a-z]+", r"[A-Z]+", r"[[:^space:]]+",
///     ])?;
/// let haystack = "@bar".as_bytes();
///
/// // The start state is determined by inspecting the position and the
/// // initial bytes of the haystack.
/// let mut state = dfa.start_state_forward(&Input::new(haystack))?;
/// // Walk all the bytes in the haystack.
/// for &b in haystack {
///     state = dfa.next_state(state, b);
/// }
/// state = dfa.next_eoi_state(state);
///
/// assert!(dfa.is_match_state(state));
/// assert_eq!(dfa.match_len(state), 3);
/// // The following calls are guaranteed to not panic since `match_len`
/// // returned `3` above.
/// assert_eq!(dfa.match_pattern(state, 0).as_usize(), 3);
/// assert_eq!(dfa.match_pattern(state, 1).as_usize(), 0);
/// assert_eq!(dfa.match_pattern(state, 2).as_usize(), 1);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
fn match_len(&self, id: StateID) -> usize;

/// Returns the pattern ID corresponding to the given match index in the
/// given state.
///
/// See [`Automaton::match_len`] for an example of how to use this
/// method correctly. Note that if you know your DFA is compiled with a
/// single pattern, then this routine is never necessary since it will
/// always return a pattern ID of `0` for an index of `0` when `id`
/// corresponds to a match state.
///
/// Typically, this routine is used when implementing an overlapping
/// search, as the example for `Automaton::match_len` does.
///
/// # Panics
///
/// If the state ID is not a match state or if the match index is out
/// of bounds for the given state, then this routine may either panic
/// or produce an incorrect result. If the state ID is correct and the
/// match index is correct, then this routine must always produce a valid
/// `PatternID`.
fn match_pattern(&self, id: StateID, index: usize) -> PatternID;

/// Returns true if and only if this automaton can match the empty string.
/// When it returns false, all possible matches are guaranteed to have a
/// non-zero length.
///
/// This is useful as a cheap way to know whether code needs to handle the
/// case of a zero length match. This is particularly important when UTF-8
/// modes are enabled, as when UTF-8 mode is enabled, empty matches that
/// split a codepoint must never be reported. This extra handling can
/// sometimes be costly, and since regexes matching an empty string are
/// somewhat rare, it can be beneficial to treat such regexes specially.
///
/// # Example
///
/// This example shows a few different DFAs and whether they match the
/// empty string or not. Notice the empty string isn't merely a matter
/// of a string of length literally `0`, but rather, whether a match can
/// occur between specific pairs of bytes.
///
/// ```
/// use regex_automata::{dfa::{dense::DFA, Automaton}, util::syntax};
///
/// // The empty regex matches the empty string.
/// let dfa = DFA::new("")?;
/// assert!(dfa.has_empty(), "empty matches empty");
/// // The '+' repetition operator requires at least one match, and so
/// // does not match the empty string.
/// let dfa = DFA::new("a+")?;
/// assert!(!dfa.has_empty(), "+ does not match empty");
/// // But the '*' repetition operator does.
/// let dfa = DFA::new("a*")?;
/// assert!(dfa.has_empty(), "* does match empty");
/// // And wrapping '+' in an operator that can match an empty string also
/// // causes it to match the empty string too.
/// let dfa = DFA::new("(a+)*")?;
/// assert!(dfa.has_empty(), "+ inside of * matches empty");
///
/// // If a regex is just made of a look-around assertion, even if the
/// // assertion requires some kind of non-empty string around it (such as
/// // \b), then it is still treated as if it matches the empty string.
/// // Namely, if a match occurs of just a look-around assertion, then the
/// // match returned is empty.
/// let dfa = DFA::builder()
///     .configure(DFA::config().unicode_word_boundary(true))
///     .syntax(syntax::Config::new().utf8(false))
///     .build(r"^$\A\z\b\B(?-u:\b\B)")?;
/// assert!(dfa.has_empty(), "assertions match empty");
/// // Even when an assertion is wrapped in a '+', it still matches the
/// // empty string.
/// let dfa = DFA::new(r"^+")?;
/// assert!(dfa.has_empty(), "+ of an assertion matches empty");
///
/// // An alternation with even one branch that can match the empty string
/// // is also said to match the empty string overall.
/// let dfa = DFA::new("foo|(bar)?|quux")?;
/// assert!(dfa.has_empty(), "alternations can match empty");
///
/// // An NFA that matches nothing does not match the empty string.
/// let dfa = DFA::new("[a&&b]")?;
/// assert!(!dfa.has_empty(), "never matching means not matching empty");
/// // But if it's wrapped in something that doesn't require a match at
/// // all, then it can match the empty string!
/// let dfa = DFA::new("[a&&b]*")?;
/// assert!(dfa.has_empty(), "* on never-match still matches empty");
/// // Since a '+' requires a match, using it on something that can never
/// // match will itself produce a regex that can never match anything,
/// // and thus does not match the empty string.
/// let dfa = DFA::new("[a&&b]+")?;
/// assert!(!dfa.has_empty(), "+ on never-match still matches nothing");
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
fn has_empty(&self) -> bool;

/// Whether UTF-8 mode is enabled for this DFA or not.
///
/// When UTF-8 mode is enabled, all matches reported by a DFA are
/// guaranteed to correspond to spans of valid UTF-8. This includes
/// zero-width matches. For example, the DFA must guarantee that the empty
/// regex will not match at the positions between code units in the UTF-8
/// encoding of a single codepoint.
///
/// See [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) for
/// more information.
///
/// # Example
///
/// This example shows how UTF-8 mode can impact the match spans that may
/// be reported in certain cases.
///
/// ```
/// use regex_automata::{
///     dfa::{dense::DFA, Automaton},
///     nfa::thompson,
///     HalfMatch, Input,
/// };
///
/// // UTF-8 mode is enabled by default.
/// let re = DFA::new("")?;
/// assert!(re.is_utf8());
/// let mut input = Input::new("☃");
/// let got = re.try_search_fwd(&input)?;
/// assert_eq!(Some(HalfMatch::must(0, 0)), got);
///
/// // Even though an empty regex matches at 1..1, our next match is
/// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
/// // three bytes long).
/// input.set_start(1);
/// let got = re.try_search_fwd(&input)?;
/// assert_eq!(Some(HalfMatch::must(0, 3)), got);
///
/// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
/// let re = DFA::builder()
///     .thompson(thompson::Config::new().utf8(false))
///     .build("")?;
/// assert!(!re.is_utf8());
/// let got = re.try_search_fwd(&input)?;
/// assert_eq!(Some(HalfMatch::must(0, 1)), got);
///
/// input.set_start(2);
/// let got = re.try_search_fwd(&input)?;
/// assert_eq!(Some(HalfMatch::must(0, 2)), got);
///
/// input.set_start(3);
/// let got = re.try_search_fwd(&input)?;
/// assert_eq!(Some(HalfMatch::must(0, 3)), got);
///
/// input.set_start(4);
/// let got = re.try_search_fwd(&input)?;
/// assert_eq!(None, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
fn is_utf8(&self) -> bool;

/// Returns true if and only if this DFA is limited to returning matches
/// whose start position is `0`.
///
/// Note that if you're using DFAs provided by
/// this crate, then this is _orthogonal_ to
/// [`Config::start_kind`](crate::dfa::dense::Config::start_kind).
///
/// This is useful in some cases because if a DFA is limited to producing
/// matches that start at offset `0`, then a reverse search is never
/// required for finding the start of a match.
///
/// # Example
///
/// ```
/// use regex_automata::dfa::{dense::DFA, Automaton};
///
/// // The empty regex matches anywhere
/// let dfa = DFA::new("")?;
/// assert!(!dfa.is_always_start_anchored(), "empty matches anywhere");
/// // 'a' matches anywhere.
/// let dfa = DFA::new("a")?;
/// assert!(!dfa.is_always_start_anchored(), "'a' matches anywhere");
/// // '^' only matches at offset 0!
/// let dfa = DFA::new("^a")?;
/// assert!(dfa.is_always_start_anchored(), "'^a' matches only at 0");
/// // But '(?m:^)' matches at 0 but at other offsets too.
/// let dfa = DFA::new("(?m:^)a")?;
/// assert!(!dfa.is_always_start_anchored(), "'(?m:^)a' matches anywhere");
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
fn is_always_start_anchored(&self) -> bool;

/// Return a slice of bytes to accelerate for the given state, if possible.
///
/// If the given state has no accelerator, then an empty slice must be
/// returned. If `Automaton::is_accel_state` returns true for the given ID,
/// then this routine _must_ return a non-empty slice. But note that it is
/// not required for an implementation of this trait to ever return `true`
/// for `is_accel_state`, even if the state _could_ be accelerated. That
/// is, acceleration is an optional optimization. But the return values of
/// `is_accel_state` and `accelerator` must be in sync.
///
/// If the given ID is not a valid state ID for this automaton, then
/// implementations may panic or produce incorrect results.
///
/// See [`Automaton::is_accel_state`] for more details on state
/// acceleration.
///
/// By default, this method will always return an empty slice.
///
/// # Example
///
/// This example shows a contrived case in which we build a regex that we
/// know is accelerated and extract the accelerator from a state.
///
/// ```
/// use regex_automata::{
///     dfa::{Automaton, dense},
///     util::{primitives::StateID, syntax},
/// };
///
/// let dfa = dense::Builder::new()
///     // We disable Unicode everywhere and permit the regex to match
///     // invalid UTF-8. e.g., [^abc] matches \xFF, which is not valid
///     // UTF-8. If we left Unicode enabled, [^abc] would match any UTF-8
///     // encoding of any Unicode scalar value except for 'a', 'b' or 'c'.
///     // That translates to a much more complicated DFA, and also
///     // inhibits the 'accelerator' optimization that we are trying to
///     // demonstrate in this example.
///     .syntax(syntax::Config::new().unicode(false).utf8(false))
///     .build("[^abc]+a")?;
///
/// // Here we just pluck out the state that we know is accelerated.
/// // While the stride calculations are something that can be relied
/// // on by callers, the specific position of the accelerated state is
/// // implementation defined.
/// //
/// // N.B. We get '3' by inspecting the state machine using 'regex-cli'.
/// // e.g., try `regex-cli debug dense dfa -p '[^abc]+a' -BbUC`.
/// let id = StateID::new(3 * dfa.stride()).unwrap();
/// let accelerator = dfa.accelerator(id);
/// // The `[^abc]+` sub-expression permits [a, b, c] to be accelerated.
/// assert_eq!(accelerator, &[b'a', b'b', b'c']);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
fn accelerator(&self, _id: StateID) -> &[u8] {
    &[]
}

/// Returns the prefilter associated with a DFA, if one exists.
///
/// The default implementation of this trait always returns `None`. And
/// indeed, it is always correct to return `None`.
///
/// For DFAs in this crate, a prefilter can be attached to a DFA via
/// [`dense::Config::prefilter`](crate::dfa::dense::Config::prefilter).
///
/// Do note that prefilters are not serialized by DFAs in this crate.
/// So if you deserialize a DFA that had a prefilter attached to it
/// at serialization time, then it will not have a prefilter after
/// deserialization.
#[inline]
fn get_prefilter(&self) -> Option<&Prefilter> {
    None
}

/// Executes a forward search and returns the end position of the leftmost
/// match that is found. If no match exists, then `None` is returned.
///
/// In particular, this method continues searching even after it enters
/// a match state. The search only terminates once it has reached the
/// end of the input or when it has entered a dead or quit state. Upon
/// termination, the position of the last byte seen while still in a match
/// state is returned.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the DFA quitting. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Notes for implementors /// /// Implementors of this trait are not required to implement any particular /// match semantics (such as leftmost-first), which are instead manifest in /// the DFA's transitions. But this search routine should behave as a /// general "leftmost" search. /// /// In particular, this method must continue searching even after it enters /// a match state. The search should only terminate once it has reached /// the end of the input or when it has entered a dead or quit state. Upon /// termination, the position of the last byte seen while still in a match /// state is returned. /// /// Since this trait provides an implementation for this method by default, /// it's unlikely that one will need to implement this. /// /// # Example /// /// This example shows how to use this method with a /// [`dense::DFA`](crate::dfa::dense::DFA). /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::new("foo[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"foo12345"))?); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. 
/// let dfa = dense::DFA::new("abc|a")?; /// let expected = Some(HalfMatch::must(0, 3)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"abc"))?); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specific pattern search /// /// This example shows how to build a multi-DFA that permits searching for /// specific patterns. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::{Automaton, dense}, /// Anchored, HalfMatch, PatternID, Input, /// }; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().starts_for_each_pattern(true)) /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; /// let haystack = "foo123".as_bytes(); /// /// // Since we are using the default leftmost-first match and both /// // patterns match at the same starting position, only the first pattern /// // will be returned in this case when doing a search for any of the /// // patterns. /// let expected = Some(HalfMatch::must(0, 6)); /// let got = dfa.try_search_fwd(&Input::new(haystack))?; /// assert_eq!(expected, got); /// /// // But if we want to check whether some other pattern matches, then we /// // can provide its pattern ID. /// let input = Input::new(haystack) /// .anchored(Anchored::Pattern(PatternID::must(1))); /// let expected = Some(HalfMatch::must(1, 6)); /// let got = dfa.try_search_fwd(&input)?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specifying the bounds of a search /// /// This example shows how providing the bounds of a search can produce /// different results than simply sub-slicing the haystack. /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// // N.B. We disable Unicode here so that we use a simple ASCII word /// // boundary. Alternatively, we could enable heuristic support for /// // Unicode word boundaries. 
/// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?; /// let haystack = "foo123bar".as_bytes(); /// /// // Since we sub-slice the haystack, the search doesn't know about the /// // larger context and assumes that `123` is surrounded by word /// // boundaries. And of course, the match position is reported relative /// // to the sub-slice as well, which means we get `3` instead of `6`. /// let input = Input::new(&haystack[3..6]); /// let expected = Some(HalfMatch::must(0, 3)); /// let got = dfa.try_search_fwd(&input)?; /// assert_eq!(expected, got); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. (And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let input = Input::new(haystack).range(3..6); /// let expected = None; /// let got = dfa.try_search_fwd(&input)?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[inline] fn try_search_fwd( &self, input: &Input<'_>, ) -> Result, MatchError> { let utf8empty = self.has_empty() && self.is_utf8(); let hm = match search::find_fwd(&self, input)? { None => return Ok(None), Some(hm) if !utf8empty => return Ok(Some(hm)), Some(hm) => hm, }; // We get to this point when we know our DFA can match the empty string // AND when UTF-8 mode is enabled. In this case, we skip any matches // whose offset splits a codepoint. Such a match is necessarily a // zero-width match, because UTF-8 mode requires the underlying NFA // to be built such that all non-empty matches span valid UTF-8. // Therefore, any match that ends in the middle of a codepoint cannot // be part of a span of valid UTF-8 and thus must be an empty match. // In such cases, we skip it, so as not to report matches that split a // codepoint. // // Note that this is not a checked assumption. 
Callers *can* provide an // NFA with UTF-8 mode enabled but produces non-empty matches that span // invalid UTF-8. But doing so is documented to result in unspecified // behavior. empty::skip_splits_fwd(input, hm, hm.offset(), |input| { let got = search::find_fwd(&self, input)?; Ok(got.map(|hm| (hm, hm.offset()))) }) } /// Executes a reverse search and returns the start of the position of the /// leftmost match that is found. If no match exists, then `None` is /// returned. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the DFA quitting. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example /// /// This example shows how to use this method with a /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this /// routine is principally useful when used in conjunction with the /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) /// configuration. In general, it's unlikely to be correct to use /// both `try_search_fwd` and `try_search_rev` with the same DFA since /// any particular DFA will only support searching in one direction with /// respect to the pattern. 
/// /// ``` /// use regex_automata::{ /// nfa::thompson, /// dfa::{Automaton, dense}, /// HalfMatch, Input, /// }; /// /// let dfa = dense::Builder::new() /// .thompson(thompson::Config::new().reverse(true)) /// .build("foo[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 0)); /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"foo12345"))?); /// /// // Even though a match is found after reading the last byte (`c`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. /// let dfa = dense::Builder::new() /// .thompson(thompson::Config::new().reverse(true)) /// .build("abc|c")?; /// let expected = Some(HalfMatch::must(0, 0)); /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"abc"))?); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: UTF-8 mode /// /// This examples demonstrates that UTF-8 mode applies to reverse /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all /// matches reported must correspond to valid UTF-8 spans. This includes /// prohibiting zero-width matches that split a codepoint. /// /// UTF-8 mode is enabled by default. Notice below how the only zero-width /// matches reported are those at UTF-8 boundaries: /// /// ``` /// use regex_automata::{ /// dfa::{dense::DFA, Automaton}, /// nfa::thompson, /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() /// .thompson(thompson::Config::new().reverse(true)) /// .build(r"")?; /// /// // Run the reverse DFA to collect all matches. /// let mut input = Input::new("☃"); /// let mut matches = vec![]; /// loop { /// match dfa.try_search_rev(&input)? { /// None => break, /// Some(hm) => { /// matches.push(hm); /// if hm.offset() == 0 || input.end() == 0 { /// break; /// } else if hm.offset() < input.end() { /// input.set_end(hm.offset()); /// } else { /// // This is only necessary to handle zero-width /// // matches, which of course occur in this example. 
/// // Without this, the search would never advance /// // backwards beyond the initial match. /// input.set_end(input.end() - 1); /// } /// } /// } /// } /// /// // No matches split a codepoint. /// let expected = vec![ /// HalfMatch::must(0, 3), /// HalfMatch::must(0, 0), /// ]; /// assert_eq!(expected, matches); /// /// # Ok::<(), Box>(()) /// ``` /// /// Now let's look at the same example, but with UTF-8 mode on the /// original NFA disabled (which results in disabling UTF-8 mode on the /// DFA): /// /// ``` /// use regex_automata::{ /// dfa::{dense::DFA, Automaton}, /// nfa::thompson, /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() /// .thompson(thompson::Config::new().reverse(true).utf8(false)) /// .build(r"")?; /// /// // Run the reverse DFA to collect all matches. /// let mut input = Input::new("☃"); /// let mut matches = vec![]; /// loop { /// match dfa.try_search_rev(&input)? { /// None => break, /// Some(hm) => { /// matches.push(hm); /// if hm.offset() == 0 || input.end() == 0 { /// break; /// } else if hm.offset() < input.end() { /// input.set_end(hm.offset()); /// } else { /// // This is only necessary to handle zero-width /// // matches, which of course occur in this example. /// // Without this, the search would never advance /// // backwards beyond the initial match. /// input.set_end(input.end() - 1); /// } /// } /// } /// } /// /// // No matches split a codepoint. /// let expected = vec![ /// HalfMatch::must(0, 3), /// HalfMatch::must(0, 2), /// HalfMatch::must(0, 1), /// HalfMatch::must(0, 0), /// ]; /// assert_eq!(expected, matches); /// /// # Ok::<(), Box>(()) /// ``` #[inline] fn try_search_rev( &self, input: &Input<'_>, ) -> Result, MatchError> { let utf8empty = self.has_empty() && self.is_utf8(); let hm = match search::find_rev(self, input)? 
{ None => return Ok(None), Some(hm) if !utf8empty => return Ok(Some(hm)), Some(hm) => hm, }; empty::skip_splits_rev(input, hm, hm.offset(), |input| { let got = search::find_rev(self, input)?; Ok(got.map(|hm| (hm, hm.offset()))) }) } /// Executes an overlapping forward search. Matches, if one exists, can be /// obtained via the [`OverlappingState::get_match`] method. /// /// This routine is principally only useful when searching for multiple /// patterns on inputs where multiple patterns may match the same regions /// of text. In particular, callers must preserve the automaton's search /// state from prior calls so that the implementation knows where the last /// match occurred. /// /// When using this routine to implement an iterator of overlapping /// matches, the `start` of the search should always be set to the end /// of the last match. If more patterns match at the previous location, /// then they will be immediately returned. (This is tracked by the given /// overlapping state.) Otherwise, the search continues at the starting /// position given. /// /// If for some reason you want the search to forget about its previous /// state and restart the search at a particular position, then setting the /// state to [`OverlappingState::start`] will accomplish that. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the DFA quitting. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. 
///
    /// # Example
    ///
    /// This example shows how to run a basic overlapping search with a
    /// [`dense::DFA`](crate::dfa::dense::DFA). Notice that we build the
    /// automaton with a `MatchKind::All` configuration. Overlapping searches
    /// are unlikely to work as one would expect when using the default
    /// `MatchKind::LeftmostFirst` match semantics, since leftmost-first
    /// matching is fundamentally incompatible with overlapping searches.
    /// Namely, overlapping searches need to report matches as they are seen,
    /// whereas leftmost-first searches will continue searching even after a
    /// match has been observed in order to find the conventional end position
    /// of the match. More concretely, leftmost-first searches use dead states
    /// to terminate a search after a specific match can no longer be extended.
    /// Overlapping searches instead do the opposite by continuing the search
    /// to find totally new matches (potentially of other patterns).
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     dfa::{Automaton, OverlappingState, dense},
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = dense::Builder::new()
    ///     .configure(dense::Config::new().match_kind(MatchKind::All))
    ///     .build_many(&[r"[[:word:]]+$", r"[[:^space:]]+$"])?;
    /// let haystack = "@foo";
    /// let mut state = OverlappingState::start();
    ///
    /// let expected = Some(HalfMatch::must(1, 4));
    /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?;
    /// assert_eq!(expected, state.get_match());
    ///
    /// // The first pattern also matches at the same position, so re-running
    /// // the search will yield another match. Notice also that the first
    /// // pattern is returned after the second. This is because the second
    /// // pattern begins its match before the first, is therefore an earlier
    /// // match and is thus reported first.
    /// let expected = Some(HalfMatch::must(0, 4));
    /// dfa.try_search_overlapping_fwd(&Input::new(haystack), &mut state)?;
    /// assert_eq!(expected, state.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    fn try_search_overlapping_fwd(
        &self,
        input: &Input<'_>,
        state: &mut OverlappingState,
    ) -> Result<(), MatchError> {
        let utf8empty = self.has_empty() && self.is_utf8();
        search::find_overlapping_fwd(self, input, state)?;
        // Nothing more to do unless a match was found AND the DFA both
        // matches the empty string and has UTF-8 mode enabled.
        if !utf8empty || state.get_match().is_none() {
            return Ok(());
        }
        skip_empty_utf8_splits_overlapping(input, state, |input, state| {
            search::find_overlapping_fwd(self, input, state)
        })
    }

    /// Executes a reverse overlapping search. Matches, if one exists,
    /// can be obtained via the [`OverlappingState::get_match`] method.
    ///
    /// When using this routine to implement an iterator of overlapping
    /// matches, the `start` of the search should remain invariant throughout
    /// iteration. The `OverlappingState` given to the search will keep track
    /// of the current position of the search. (This is because multiple
    /// matches may be reported at the same position, so only the search
    /// implementation itself knows when to advance the position.)
    ///
    /// If for some reason you want the search to forget about its previous
    /// state and restart the search at a particular position, then setting the
    /// state to [`OverlappingState::start`] will accomplish that.
    ///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in a number of circumstances:
    ///
    /// * The configuration of the DFA may permit it to "quit" the search.
    /// For example, setting quit bytes or enabling heuristic support for
    /// Unicode word boundaries. The default configuration does not enable any
    /// option that could result in the DFA quitting.
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode.
///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    ///
    /// # Example: UTF-8 mode
    ///
    /// This example demonstrates that UTF-8 mode applies to reverse
    /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
    /// matches reported must correspond to valid UTF-8 spans. This includes
    /// prohibiting zero-width matches that split a codepoint.
    ///
    /// UTF-8 mode is enabled by default. Notice below how the only zero-width
    /// matches reported are those at UTF-8 boundaries:
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{dense::DFA, Automaton, OverlappingState},
    ///     nfa::thompson,
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .thompson(thompson::Config::new().reverse(true))
    ///     .build_many(&[r"", r"☃"])?;
    ///
    /// // Run the reverse DFA to collect all matches.
    /// let input = Input::new("☃");
    /// let mut state = OverlappingState::start();
    /// let mut matches = vec![];
    /// loop {
    ///     dfa.try_search_overlapping_rev(&input, &mut state)?;
    ///     match state.get_match() {
    ///         None => break,
    ///         Some(hm) => matches.push(hm),
    ///     }
    /// }
    ///
    /// // No matches split a codepoint.
    /// let expected = vec![
    ///     HalfMatch::must(0, 3),
    ///     HalfMatch::must(1, 0),
    ///     HalfMatch::must(0, 0),
    /// ];
    /// assert_eq!(expected, matches);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Now let's look at the same example, but with UTF-8 mode on the
    /// original NFA disabled (which results in disabling UTF-8 mode on the
    /// DFA):
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{dense::DFA, Automaton, OverlappingState},
    ///     nfa::thompson,
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .thompson(thompson::Config::new().reverse(true).utf8(false))
    ///     .build_many(&[r"", r"☃"])?;
    ///
    /// // Run the reverse DFA to collect all matches.
    /// let input = Input::new("☃");
    /// let mut state = OverlappingState::start();
    /// let mut matches = vec![];
    /// loop {
    ///     dfa.try_search_overlapping_rev(&input, &mut state)?;
    ///     match state.get_match() {
    ///         None => break,
    ///         Some(hm) => matches.push(hm),
    ///     }
    /// }
    ///
    /// // Now *all* positions match, even within a codepoint,
    /// // because we lifted the requirement that matches
    /// // correspond to valid UTF-8 spans.
    /// let expected = vec![
    ///     HalfMatch::must(0, 3),
    ///     HalfMatch::must(0, 2),
    ///     HalfMatch::must(0, 1),
    ///     HalfMatch::must(1, 0),
    ///     HalfMatch::must(0, 0),
    /// ];
    /// assert_eq!(expected, matches);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    fn try_search_overlapping_rev(
        &self,
        input: &Input<'_>,
        state: &mut OverlappingState,
    ) -> Result<(), MatchError> {
        let utf8empty = self.has_empty() && self.is_utf8();
        search::find_overlapping_rev(self, input, state)?;
        // Nothing more to do unless a match was found AND the DFA both
        // matches the empty string and has UTF-8 mode enabled.
        if !utf8empty || state.get_match().is_none() {
            return Ok(());
        }
        skip_empty_utf8_splits_overlapping(input, state, |input, state| {
            search::find_overlapping_rev(self, input, state)
        })
    }

    /// Writes the set of patterns that match anywhere in the given search
    /// configuration to `patset`. If multiple patterns match at the same
    /// position and the underlying DFA supports overlapping matches, then all
    /// matching patterns are written to the given set.
    ///
    /// Unless all of the patterns in this DFA are anchored, then generally
    /// speaking, this will visit every byte in the haystack.
    ///
    /// This search routine *does not* clear the pattern set. This gives some
    /// flexibility to the caller (e.g., running multiple searches with the
    /// same pattern set), but does make the API bug-prone if you're reusing
    /// the same pattern set for multiple searches but intended them to be
    /// independent.
    ///
    /// If a pattern ID matched but the given `PatternSet` does not have
    /// sufficient capacity to store it, then it is not inserted and silently
    /// dropped.
///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in a number of circumstances:
    ///
    /// * The configuration of the DFA may permit it to "quit" the search.
    /// For example, setting quit bytes or enabling heuristic support for
    /// Unicode word boundaries. The default configuration does not enable any
    /// option that could result in the DFA quitting.
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode.
    ///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    ///
    /// # Example
    ///
    /// This example shows how to find all matching patterns in a haystack,
    /// even when some patterns match at the same position as other patterns.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     dfa::{Automaton, dense::DFA},
    ///     Input, MatchKind, PatternSet,
    /// };
    ///
    /// let patterns = &[
    ///     r"[[:word:]]+",
    ///     r"[0-9]+",
    ///     r"[[:alpha:]]+",
    ///     r"foo",
    ///     r"bar",
    ///     r"barfoo",
    ///     r"foobar",
    /// ];
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .build_many(patterns)?;
    ///
    /// let input = Input::new("foobar");
    /// let mut patset = PatternSet::new(dfa.pattern_len());
    /// dfa.try_which_overlapping_matches(&input, &mut patset)?;
    /// let expected = vec![0, 2, 3, 4, 6];
    /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "alloc")]
    #[inline]
    fn try_which_overlapping_matches(
        &self,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) -> Result<(), MatchError> {
        let mut state = OverlappingState::start();
        // Each iteration advances the overlapping search by one match; the
        // loop ends as soon as the search reports no further match.
        while let Some(m) = {
            self.try_search_overlapping_fwd(input, &mut state)?;
            state.get_match()
        } {
            // Insertion failure is deliberately ignored: the documented
            // contract is that such pattern IDs are silently dropped.
            let _ = patset.insert(m.pattern());
            // There's nothing left to find, so we can stop. Or the caller
            // asked us to.
            if patset.is_full() || input.get_earliest() {
                break;
            }
        }
        Ok(())
    }
}

// Forwarding implementation: a shared reference to an automaton is itself an
// automaton. Every method simply delegates to the referent via `(**self)`.
unsafe impl<'a, A: Automaton + ?Sized> Automaton for &'a A {
    #[inline]
    fn next_state(&self, current: StateID, input: u8) -> StateID {
        (**self).next_state(current, input)
    }

    #[inline]
    unsafe fn next_state_unchecked(
        &self,
        current: StateID,
        input: u8,
    ) -> StateID {
        (**self).next_state_unchecked(current, input)
    }

    #[inline]
    fn next_eoi_state(&self, current: StateID) -> StateID {
        (**self).next_eoi_state(current)
    }

    #[inline]
    fn start_state(
        &self,
        config: &start::Config,
    ) -> Result<StateID, StartError> {
        (**self).start_state(config)
    }

    #[inline]
    fn start_state_forward(
        &self,
        input: &Input<'_>,
    ) -> Result<StateID, MatchError> {
        (**self).start_state_forward(input)
    }

    #[inline]
    fn start_state_reverse(
        &self,
        input: &Input<'_>,
    ) -> Result<StateID, MatchError> {
        (**self).start_state_reverse(input)
    }

    #[inline]
    fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
        (**self).universal_start_state(mode)
    }

    #[inline]
    fn is_special_state(&self, id: StateID) -> bool {
        (**self).is_special_state(id)
    }

    #[inline]
    fn is_dead_state(&self, id: StateID) -> bool {
        (**self).is_dead_state(id)
    }

    #[inline]
    fn is_quit_state(&self, id: StateID) -> bool {
        (**self).is_quit_state(id)
    }

    #[inline]
    fn is_match_state(&self, id: StateID) -> bool {
        (**self).is_match_state(id)
    }

    #[inline]
    fn is_start_state(&self, id: StateID) -> bool {
        (**self).is_start_state(id)
    }

    #[inline]
    fn is_accel_state(&self, id: StateID) -> bool {
        (**self).is_accel_state(id)
    }

    #[inline]
    fn pattern_len(&self) -> usize {
        (**self).pattern_len()
    }

    #[inline]
    fn match_len(&self, id: StateID) -> usize {
        (**self).match_len(id)
    }

    #[inline]
    fn match_pattern(&self, id: StateID, index: usize) -> PatternID {
        (**self).match_pattern(id, index)
    }

    #[inline]
    fn has_empty(&self) -> bool {
        (**self).has_empty()
    }

    #[inline]
    fn is_utf8(&self) -> bool {
        (**self).is_utf8()
    }

    #[inline]
    fn is_always_start_anchored(&self) -> bool {
        (**self).is_always_start_anchored()
    }

    #[inline]
    fn accelerator(&self, id: StateID) -> &[u8] {
        (**self).accelerator(id)
    }

    #[inline]
    fn get_prefilter(&self) -> Option<&Prefilter> {
        (**self).get_prefilter()
    }

    #[inline]
    fn try_search_fwd(
        &self,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, MatchError> {
        (**self).try_search_fwd(input)
    }

    #[inline]
    fn try_search_rev(
        &self,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, MatchError> {
        (**self).try_search_rev(input)
    }

    #[inline]
    fn try_search_overlapping_fwd(
        &self,
        input: &Input<'_>,
        state: &mut OverlappingState,
    ) -> Result<(), MatchError> {
        (**self).try_search_overlapping_fwd(input, state)
    }

    #[inline]
    fn try_search_overlapping_rev(
        &self,
        input: &Input<'_>,
        state: &mut OverlappingState,
    ) -> Result<(), MatchError> {
        (**self).try_search_overlapping_rev(input, state)
    }

    #[cfg(feature = "alloc")]
    #[inline]
    fn try_which_overlapping_matches(
        &self,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) -> Result<(), MatchError> {
        (**self).try_which_overlapping_matches(input, patset)
    }
}

/// Represents the current state of an overlapping search.
///
/// This is used for overlapping searches since they need to know something
/// about the previous search. For example, when multiple patterns match at the
/// same position, this state tracks the last reported pattern so that the next
/// search knows whether to report another matching pattern or continue with
/// the search at the next position. Additionally, it also tracks which state
/// the last search call terminated in.
///
/// This type provides little introspection capabilities. The only thing a
/// caller can do is construct it and pass it around to permit search routines
/// to use it to track state, and also ask whether a match has been found.
///
/// Callers should always provide a fresh state constructed via
/// [`OverlappingState::start`] when starting a new search. Reusing state from
/// a previous search may result in incorrect results.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct OverlappingState {
    /// The match reported by the most recent overlapping search to use this
    /// state.
/// /// If a search does not find any matches, then it is expected to clear /// this value. pub(crate) mat: Option, /// The state ID of the state at which the search was in when the call /// terminated. When this is a match state, `last_match` must be set to a /// non-None value. /// /// A `None` value indicates the start state of the corresponding /// automaton. We cannot use the actual ID, since any one automaton may /// have many start states, and which one is in use depends on several /// search-time factors. pub(crate) id: Option, /// The position of the search. /// /// When `id` is None (i.e., we are starting a search), this is set to /// the beginning of the search as given by the caller regardless of its /// current value. Subsequent calls to an overlapping search pick up at /// this offset. pub(crate) at: usize, /// The index into the matching patterns of the next match to report if the /// current state is a match state. Note that this may be 1 greater than /// the total number of matches to report for the current match state. (In /// which case, no more matches should be reported at the current position /// and the search should advance to the next position.) pub(crate) next_match_index: Option, /// This is set to true when a reverse overlapping search has entered its /// EOI transitions. /// /// This isn't used in a forward search because it knows to stop once the /// position exceeds the end of the search range. In a reverse search, /// since we use unsigned offsets, we don't "know" once we've gone past /// `0`. So the only way to detect it is with this extra flag. The reverse /// overlapping search knows to terminate specifically after it has /// reported all matches after following the EOI transition. pub(crate) rev_eoi: bool, } impl OverlappingState { /// Create a new overlapping state that begins at the start state of any /// automaton. 
pub fn start() -> OverlappingState { OverlappingState { mat: None, id: None, at: 0, next_match_index: None, rev_eoi: false, } } /// Return the match result of the most recent search to execute with this /// state. /// /// A searches will clear this result automatically, such that if no /// match is found, this will correctly report `None`. pub fn get_match(&self) -> Option { self.mat } } /// An error that can occur when computing the start state for a search. /// /// Computing a start state can fail for a few reasons, either based on /// incorrect configuration or even based on whether the look-behind byte /// triggers a quit state. Typically one does not need to handle this error /// if you're using [`Automaton::start_state_forward`] (or its reverse /// counterpart), as that routine automatically converts `StartError` to a /// [`MatchError`] for you. /// /// This error may be returned by the [`Automaton::start_state`] routine. /// /// This error implements the `std::error::Error` trait when the `std` feature /// is enabled. /// /// This error is marked as non-exhaustive. New variants may be added in a /// semver compatible release. #[non_exhaustive] #[derive(Clone, Debug)] pub enum StartError { /// An error that occurs when a starting configuration's look-behind byte /// is in this DFA's quit set. Quit { /// The quit byte that was found. byte: u8, }, /// An error that occurs when the caller requests an anchored mode that /// isn't supported by the DFA. UnsupportedAnchored { /// The anchored mode given that is unsupported. 
mode: Anchored,
    },
}

impl StartError {
    /// Builds the error reported when the look-behind `byte` is a quit byte.
    pub(crate) fn quit(byte: u8) -> StartError {
        StartError::Quit { byte }
    }

    /// Builds the error reported when the anchored `mode` is unsupported.
    pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError {
        StartError::UnsupportedAnchored { mode }
    }
}

#[cfg(feature = "std")]
impl std::error::Error for StartError {}

impl core::fmt::Display for StartError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match *self {
            StartError::Quit { byte } => write!(
                f,
                "error computing start state because the look-behind byte \
                 {:?} triggered a quit state",
                crate::util::escape::DebugByte(byte),
            ),
            StartError::UnsupportedAnchored { mode: Anchored::Yes } => {
                write!(
                    f,
                    "error computing start state because \
                     anchored searches are not supported or enabled"
                )
            }
            StartError::UnsupportedAnchored { mode: Anchored::No } => {
                write!(
                    f,
                    "error computing start state because \
                     unanchored searches are not supported or enabled"
                )
            }
            StartError::UnsupportedAnchored {
                mode: Anchored::Pattern(pid),
            } => {
                write!(
                    f,
                    "error computing start state because \
                     anchored searches for a specific pattern ({}) \
                     are not supported or enabled",
                    pid.as_usize(),
                )
            }
        }
    }
}

/// Runs the given overlapping `search` function (forwards or backwards) until
/// a match is found whose offset does not split a codepoint.
///
/// This is *not* always correct to call. It should only be called when the DFA
/// has UTF-8 mode enabled *and* it can produce zero-width matches. Calling
/// this when both of those things aren't true might result in legitimate
/// matches getting skipped.
#[cold]
#[inline(never)]
fn skip_empty_utf8_splits_overlapping<F>(
    input: &Input<'_>,
    state: &mut OverlappingState,
    mut search: F,
) -> Result<(), MatchError>
where
    F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>,
{
    // Note that this routine works for forwards and reverse searches
    // even though there's no code here to handle those cases. That's
    // because overlapping searches drive themselves to completion via
    // `OverlappingState`. So all we have to do is push it until no matches are
    // found.

    let mut hm = match state.get_match() {
        None => return Ok(()),
        Some(hm) => hm,
    };
    // An anchored search is not re-run: a match that splits a codepoint is
    // simply discarded by clearing the recorded match.
    if input.get_anchored().is_anchored() {
        if !input.is_char_boundary(hm.offset()) {
            state.mat = None;
        }
        return Ok(());
    }
    // Otherwise keep driving the search until it either runs out of matches
    // or reports one that lands on a codepoint boundary.
    while !input.is_char_boundary(hm.offset()) {
        search(input, state)?;
        hm = match state.get_match() {
            None => return Ok(()),
            Some(hm) => hm,
        };
    }
    Ok(())
}

/// Write a prefix "state" indicator for fmt::Debug impls.
///
/// Specifically, this tries to succinctly distinguish the different types of
/// states: dead states, quit states, accelerated states, start states and
/// match states. It even accounts for the possible overlappings of different
/// state types.
///
/// Every branch writes exactly two characters so that the indicator column
/// has a fixed width.
pub(crate) fn fmt_state_indicator<A: Automaton>(
    f: &mut core::fmt::Formatter<'_>,
    dfa: A,
    id: StateID,
) -> core::fmt::Result {
    if dfa.is_dead_state(id) {
        write!(f, "D")?;
        if dfa.is_start_state(id) {
            write!(f, ">")?;
        } else {
            write!(f, " ")?;
        }
    } else if dfa.is_quit_state(id) {
        write!(f, "Q ")?;
    } else if dfa.is_start_state(id) {
        if dfa.is_accel_state(id) {
            write!(f, "A>")?;
        } else {
            write!(f, " >")?;
        }
    } else if dfa.is_match_state(id) {
        if dfa.is_accel_state(id) {
            write!(f, "A*")?;
        } else {
            write!(f, " *")?;
        }
    } else if dfa.is_accel_state(id) {
        write!(f, "A ")?;
    } else {
        // Two spaces, matching the two-character width of every other
        // indicator above.
        write!(f, "  ")?;
    }
    Ok(())
}

#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
    // A basic test ensuring that our Automaton trait is object safe. (This is
    // the main reason why we don't define the search routines as generic over
    // Into<Input>.)
    #[test]
    fn object_safe() {
        use crate::{
            dfa::{dense, Automaton},
            HalfMatch, Input,
        };

        let dfa = dense::DFA::new("abc").unwrap();
        let dfa: &dyn Automaton = &dfa;
        assert_eq!(
            Ok(Some(HalfMatch::must(0, 6))),
            dfa.try_search_fwd(&Input::new(b"xyzabcxyz")),
        );
    }
}
regex-automata-0.4.9/src/dfa/dense.rs000064400000000000000000006513471046102023000155660ustar 00000000000000/*! Types and routines specific to dense DFAs.
This module is the home of [`dense::DFA`](DFA). This module also contains a [`dense::Builder`](Builder) and a [`dense::Config`](Config) for building and configuring a dense DFA. */ #[cfg(feature = "dfa-build")] use core::cmp; use core::{fmt, iter, mem::size_of, slice}; #[cfg(feature = "dfa-build")] use alloc::{ collections::{BTreeMap, BTreeSet}, vec, vec::Vec, }; #[cfg(feature = "dfa-build")] use crate::{ dfa::{ accel::Accel, determinize, minimize::Minimizer, remapper::Remapper, sparse, }, nfa::thompson, util::{look::LookMatcher, search::MatchKind}, }; use crate::{ dfa::{ accel::Accels, automaton::{fmt_state_indicator, Automaton, StartError}, special::Special, start::StartKind, DEAD, }, util::{ alphabet::{self, ByteClasses, ByteSet}, int::{Pointer, Usize}, prefilter::Prefilter, primitives::{PatternID, StateID}, search::Anchored, start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; /// The label that is pre-pended to a serialized DFA. const LABEL: &str = "rust-regex-automata-dfa-dense"; /// The format version of dense regexes. This version gets incremented when a /// change occurs. A change may not necessarily be a breaking change, but the /// version does permit good error messages in the case where a breaking change /// is made. const VERSION: u32 = 2; /// The configuration used for compiling a dense DFA. /// /// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The /// advantage of the former is that it often lets you avoid importing the /// `Config` type directly. /// /// A dense DFA configuration is a simple data object that is typically used /// with [`dense::Builder::configure`](self::Builder::configure). 
/// /// The default configuration guarantees that a search will never return /// a "quit" error, although it is possible for a search to fail if /// [`Config::starts_for_each_pattern`] wasn't enabled (which it is /// not by default) and an [`Anchored::Pattern`] mode is requested via /// [`Input`](crate::Input). #[cfg(feature = "dfa-build")] #[derive(Clone, Debug, Default)] pub struct Config { // As with other configuration types in this crate, we put all our knobs // in options so that we can distinguish between "default" and "not set." // This makes it possible to easily combine multiple configurations // without default values overwriting explicitly specified values. See the // 'overwrite' method. // // For docs on the fields below, see the corresponding method setters. accelerate: Option, pre: Option>, minimize: Option, match_kind: Option, start_kind: Option, starts_for_each_pattern: Option, byte_classes: Option, unicode_word_boundary: Option, quitset: Option, specialize_start_states: Option, dfa_size_limit: Option>, determinize_size_limit: Option>, } #[cfg(feature = "dfa-build")] impl Config { /// Return a new default dense DFA compiler configuration. pub fn new() -> Config { Config::default() } /// Enable state acceleration. /// /// When enabled, DFA construction will analyze each state to determine /// whether it is eligible for simple acceleration. Acceleration typically /// occurs when most of a state's transitions loop back to itself, leaving /// only a select few bytes that will exit the state. When this occurs, /// other routines like `memchr` can be used to look for those bytes which /// may be much faster than traversing the DFA. /// /// Callers may elect to disable this if consistent performance is more /// desirable than variable performance. Namely, acceleration can sometimes /// make searching slower than it otherwise would be if the transitions /// that leave accelerated states are traversed frequently. 
///
/// See [`Automaton::accelerator`] for an example.
///
/// This is enabled by default.
pub fn accelerate(self, yes: bool) -> Config {
    // Rebuild the configuration with only this knob replaced.
    Config { accelerate: Some(yes), ..self }
}

/// Set a prefilter to be used whenever a start state is entered.
///
/// A [`Prefilter`] in this context is meant to accelerate searches by
/// looking for literal prefixes that every match for the corresponding
/// pattern (or patterns) must start with. Once a prefilter produces a
/// match, the underlying search routine continues on to try and confirm
/// the match.
///
/// Be warned that setting a prefilter does not guarantee that the search
/// will be faster. While it's usually a good bet, if the prefilter
/// produces a lot of false positive candidates (i.e., positions matched
/// by the prefilter but not by the regex), then the overall result can
/// be slower than if you had just executed the regex engine without any
/// prefilters.
///
/// Note that unless [`Config::specialize_start_states`] has been
/// explicitly set, then setting this will also enable (when `pre` is
/// `Some`) or disable (when `pre` is `None`) start state specialization.
/// This occurs because without start state specialization, a prefilter
/// is likely to be less effective. And without a prefilter, start state
/// specialization is usually pointless.
///
/// **WARNING:** Note that prefilters are not preserved as part of
/// serialization. Serializing a DFA will drop its prefilter.
///
/// By default no prefilter is set.
///
/// # Example
///
/// ```
/// use regex_automata::{
///     dfa::{dense::DFA, Automaton},
///     util::prefilter::Prefilter,
///     Input, HalfMatch, MatchKind,
/// };
///
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
/// let re = DFA::builder()
///     .configure(DFA::config().prefilter(pre))
///     .build(r"(foo|bar)[a-z]+")?;
/// let input = Input::new("foo1 barfox bar");
/// assert_eq!(
///     Some(HalfMatch::must(0, 11)),
///     re.try_search_fwd(&input)?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Be warned though that an incorrect prefilter can lead to incorrect
/// results!
///
/// ```
/// use regex_automata::{
///     dfa::{dense::DFA, Automaton},
///     util::prefilter::Prefilter,
///     Input, HalfMatch, MatchKind,
/// };
///
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
/// let re = DFA::builder()
///     .configure(DFA::config().prefilter(pre))
///     .build(r"(foo|bar)[a-z]+")?;
/// let input = Input::new("foo1 barfox bar");
/// assert_eq!(
///     // No match reported even though there clearly is one!
///     None,
///     re.try_search_fwd(&input)?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
    self.pre = Some(pre);
    // Only pick a default for start state specialization when the caller
    // has not already made an explicit choice.
    if self.specialize_start_states.is_none() {
        self.specialize_start_states =
            Some(self.get_prefilter().is_some());
    }
    self
}

/// Minimize the DFA.
///
/// When enabled, the DFA built will be minimized such that it is as small
/// as possible.
///
/// Whether one enables minimization or not depends on the types of costs
/// you're willing to pay and how much you care about its benefits. In
/// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)`
/// space, where `n` is the number of DFA states and `k` is the alphabet
/// size. In practice, minimization can be quite costly in terms of both
/// space and time, so it should only be done if you're willing to wait
/// longer to produce a DFA.
In general, you might want a minimal DFA in /// the following circumstances: /// /// 1. You would like to optimize for the size of the automaton. This can /// manifest in one of two ways. Firstly, if you're converting the /// DFA into Rust code (or a table embedded in the code), then a minimal /// DFA will translate into a corresponding reduction in code size, and /// thus, also the final compiled binary size. Secondly, if you are /// building many DFAs and putting them on the heap, you'll be able to /// fit more if they are smaller. Note though that building a minimal /// DFA itself requires additional space; you only realize the space /// savings once the minimal DFA is constructed (at which point, the /// space used for minimization is freed). /// 2. You've observed that a smaller DFA results in faster match /// performance. Naively, this isn't guaranteed since there is no /// inherent difference between matching with a bigger-than-minimal /// DFA and a minimal DFA. However, a smaller DFA may make use of your /// CPU's cache more efficiently. /// 3. You are trying to establish an equivalence between regular /// languages. The standard method for this is to build a minimal DFA /// for each language and then compare them. If the DFAs are equivalent /// (up to state renaming), then the languages are equivalent. /// /// Typically, minimization only makes sense as an offline process. That /// is, one might minimize a DFA before serializing it to persistent /// storage. In practical terms, minimization can take around an order of /// magnitude more time than compiling the initial DFA via determinization. /// /// This option is disabled by default. pub fn minimize(mut self, yes: bool) -> Config { self.minimize = Some(yes); self } /// Set the desired match semantics. /// /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the /// match semantics of Perl-like regex engines. 
That is, when multiple /// patterns would match at the same leftmost position, the pattern that /// appears first in the concrete syntax is chosen. /// /// Currently, the only other kind of match semantics supported is /// [`MatchKind::All`]. This corresponds to classical DFA construction /// where all possible matches are added to the DFA. /// /// Typically, `All` is used when one wants to execute an overlapping /// search and `LeftmostFirst` otherwise. In particular, it rarely makes /// sense to use `All` with the various "leftmost" find routines, since the /// leftmost routines depend on the `LeftmostFirst` automata construction /// strategy. Specifically, `LeftmostFirst` adds dead states to the DFA /// as a way to terminate the search and report a match. `LeftmostFirst` /// also supports non-greedy matches using this strategy where as `All` /// does not. /// /// # Example: overlapping search /// /// This example shows the typical use of `MatchKind::All`, which is to /// report overlapping matches. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::{Automaton, OverlappingState, dense}, /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().match_kind(MatchKind::All)) /// .build_many(&[r"\w+$", r"\S+$"])?; /// let input = Input::new("@foo"); /// let mut state = OverlappingState::start(); /// /// let expected = Some(HalfMatch::must(1, 4)); /// dfa.try_search_overlapping_fwd(&input, &mut state)?; /// assert_eq!(expected, state.get_match()); /// /// // The first pattern also matches at the same position, so re-running /// // the search will yield another match. Notice also that the first /// // pattern is returned after the second. This is because the second /// // pattern begins its match before the first, is therefore an earlier /// // match and is thus reported first. 
/// let expected = Some(HalfMatch::must(0, 4));
/// dfa.try_search_overlapping_fwd(&input, &mut state)?;
/// assert_eq!(expected, state.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: reverse automaton to find start of match
///
/// Another example for using `MatchKind::All` is for constructing a
/// reverse automaton to find the start of a match. `All` semantics are
/// used for this in order to find the longest possible match, which
/// corresponds to the leftmost starting position.
///
/// Note that if you need the starting position then
/// [`dfa::regex::Regex`](crate::dfa::regex::Regex) will handle this for
/// you, so it's usually not necessary to do this yourself.
///
/// ```
/// use regex_automata::{
///     dfa::{dense, Automaton, StartKind},
///     nfa::thompson::NFA,
///     Anchored, HalfMatch, Input, MatchKind,
/// };
///
/// let haystack = "123foobar456".as_bytes();
/// let pattern = r"[a-z]+r";
///
/// let dfa_fwd = dense::DFA::new(pattern)?;
/// let dfa_rev = dense::Builder::new()
///     .thompson(NFA::config().reverse(true))
///     .configure(dense::Config::new()
///         // This isn't strictly necessary since both anchored and
///         // unanchored searches are supported by default. But since
///         // finding the start-of-match only requires anchored searches,
///         // we can get rid of the unanchored configuration and possibly
///         // slim down our DFA considerably.
///         .start_kind(StartKind::Anchored)
///         .match_kind(MatchKind::All)
///     )
///     .build(pattern)?;
/// let expected_fwd = HalfMatch::must(0, 9);
/// let expected_rev = HalfMatch::must(0, 3);
/// let got_fwd = dfa_fwd.try_search_fwd(&Input::new(haystack))?.unwrap();
/// // Here we don't specify the pattern to search for since there's only
/// // one pattern and we're doing a leftmost search. But if this were an
/// // overlapping search, you'd need to specify the pattern that matched
/// // in the forward direction.
(Otherwise, you might wind up finding the /// // starting position of a match of some other pattern.) That in turn /// // requires building the reverse automaton with starts_for_each_pattern /// // enabled. Indeed, this is what Regex does internally. /// let input = Input::new(haystack) /// .range(..got_fwd.offset()) /// .anchored(Anchored::Yes); /// let got_rev = dfa_rev.try_search_rev(&input)?.unwrap(); /// assert_eq!(expected_fwd, got_fwd); /// assert_eq!(expected_rev, got_rev); /// /// # Ok::<(), Box>(()) /// ``` pub fn match_kind(mut self, kind: MatchKind) -> Config { self.match_kind = Some(kind); self } /// The type of starting state configuration to use for a DFA. /// /// By default, the starting state configuration is [`StartKind::Both`]. /// /// # Example /// /// ``` /// use regex_automata::{ /// dfa::{dense::DFA, Automaton, StartKind}, /// Anchored, HalfMatch, Input, /// }; /// /// let haystack = "quux foo123"; /// let expected = HalfMatch::must(0, 11); /// /// // By default, DFAs support both anchored and unanchored searches. /// let dfa = DFA::new(r"[0-9]+")?; /// let input = Input::new(haystack); /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); /// /// // But if we only need anchored searches, then we can build a DFA /// // that only supports anchored searches. This leads to a smaller DFA /// // (potentially significantly smaller in some cases), but a DFA that /// // will panic if you try to use it with an unanchored search. /// let dfa = DFA::builder() /// .configure(DFA::config().start_kind(StartKind::Anchored)) /// .build(r"[0-9]+")?; /// let input = Input::new(haystack) /// .range(8..) /// .anchored(Anchored::Yes); /// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?); /// /// # Ok::<(), Box>(()) /// ``` pub fn start_kind(mut self, kind: StartKind) -> Config { self.start_kind = Some(kind); self } /// Whether to compile a separate start state for each pattern in the /// automaton. 
/// /// When enabled, a separate **anchored** start state is added for each /// pattern in the DFA. When this start state is used, then the DFA will /// only search for matches for the pattern specified, even if there are /// other patterns in the DFA. /// /// The main downside of this option is that it can potentially increase /// the size of the DFA and/or increase the time it takes to build the DFA. /// /// There are a few reasons one might want to enable this (it's disabled /// by default): /// /// 1. When looking for the start of an overlapping match (using a /// reverse DFA), doing it correctly requires starting the reverse search /// using the starting state of the pattern that matched in the forward /// direction. Indeed, when building a [`Regex`](crate::dfa::regex::Regex), /// it will automatically enable this option when building the reverse DFA /// internally. /// 2. When you want to use a DFA with multiple patterns to both search /// for matches of any pattern or to search for anchored matches of one /// particular pattern while using the same DFA. (Otherwise, you would need /// to compile a new DFA for each pattern.) /// 3. Since the start states added for each pattern are anchored, if you /// compile an unanchored DFA with one pattern while also enabling this /// option, then you can use the same DFA to perform anchored or unanchored /// searches. The latter you get with the standard search APIs. The former /// you get from the various `_at` search methods that allow you specify a /// pattern ID to search for. /// /// By default this is disabled. /// /// # Example /// /// This example shows how to use this option to permit the same DFA to /// run both anchored and unanchored searches for a single pattern. 
///
/// ```
/// use regex_automata::{
///     dfa::{dense, Automaton},
///     Anchored, HalfMatch, PatternID, Input,
/// };
///
/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new().starts_for_each_pattern(true))
///     .build(r"foo[0-9]+")?;
/// let haystack = "quux foo123";
///
/// // Here's a normal unanchored search. Notice that we use 'None' for the
/// // pattern ID. Since the DFA was built as an unanchored machine, it
/// // use its default unanchored starting state.
/// let expected = HalfMatch::must(0, 11);
/// let input = Input::new(haystack);
/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
/// // But now if we explicitly specify the pattern to search ('0' being
/// // the only pattern in the DFA), then it will use the starting state
/// // for that specific pattern which is always anchored. Since the
/// // pattern doesn't have a match at the beginning of the haystack, we
/// // find nothing.
/// let input = Input::new(haystack)
///     .anchored(Anchored::Pattern(PatternID::must(0)));
/// assert_eq!(None, dfa.try_search_fwd(&input)?);
/// // And finally, an anchored search is not the same as putting a '^' at
/// // beginning of the pattern. An anchored search can only match at the
/// // beginning of the *search*, which we can change:
/// let input = Input::new(haystack)
///     .anchored(Anchored::Pattern(PatternID::must(0)))
///     .range(5..);
/// assert_eq!(Some(expected), dfa.try_search_fwd(&input)?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn starts_for_each_pattern(self, yes: bool) -> Config {
    // Rebuild the configuration with only this knob replaced.
    Config { starts_for_each_pattern: Some(yes), ..self }
}

/// Whether to attempt to shrink the size of the DFA's alphabet or not.
///
/// This option is enabled by default and should never be disabled unless
/// one is debugging a generated DFA.
///
/// When enabled, the DFA will use a map from all possible bytes to their
/// corresponding equivalence class.
Each equivalence class represents a /// set of bytes that does not discriminate between a match and a non-match /// in the DFA. For example, the pattern `[ab]+` has at least two /// equivalence classes: a set containing `a` and `b` and a set containing /// every byte except for `a` and `b`. `a` and `b` are in the same /// equivalence class because they never discriminate between a match and a /// non-match. /// /// The advantage of this map is that the size of the transition table /// can be reduced drastically from `#states * 256 * sizeof(StateID)` to /// `#states * k * sizeof(StateID)` where `k` is the number of equivalence /// classes (rounded up to the nearest power of 2). As a result, total /// space usage can decrease substantially. Moreover, since a smaller /// alphabet is used, DFA compilation becomes faster as well. /// /// **WARNING:** This is only useful for debugging DFAs. Disabling this /// does not yield any speed advantages. Namely, even when this is /// disabled, a byte class map is still used while searching. The only /// difference is that every byte will be forced into its own distinct /// equivalence class. This is useful for debugging the actual generated /// transitions because it lets one see the transitions defined on actual /// bytes instead of the equivalence classes. pub fn byte_classes(mut self, yes: bool) -> Config { self.byte_classes = Some(yes); self } /// Heuristically enable Unicode word boundaries. /// /// When set, this will attempt to implement Unicode word boundaries as if /// they were ASCII word boundaries. This only works when the search input /// is ASCII only. If a non-ASCII byte is observed while searching, then a /// [`MatchError::quit`](crate::MatchError::quit) error is returned. /// /// A possible alternative to enabling this option is to simply use an /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this /// option is if you absolutely need Unicode support. 
This option lets one /// use a fast search implementation (a DFA) for some potentially very /// common cases, while providing the option to fall back to some other /// regex engine to handle the general case when an error is returned. /// /// If the pattern provided has no Unicode word boundary in it, then this /// option has no effect. (That is, quitting on a non-ASCII byte only /// occurs when this option is enabled _and_ a Unicode word boundary is /// present in the pattern.) /// /// This is almost equivalent to setting all non-ASCII bytes to be quit /// bytes. The only difference is that this will cause non-ASCII bytes to /// be quit bytes _only_ when a Unicode word boundary is present in the /// pattern. /// /// When enabling this option, callers _must_ be prepared to handle /// a [`MatchError`](crate::MatchError) error during search. /// When using a [`Regex`](crate::dfa::regex::Regex), this corresponds /// to using the `try_` suite of methods. Alternatively, if /// callers can guarantee that their input is ASCII only, then a /// [`MatchError::quit`](crate::MatchError::quit) error will never be /// returned while searching. /// /// This is disabled by default. /// /// # Example /// /// This example shows how to heuristically enable Unicode word boundaries /// in a pattern. It also shows what happens when a search comes across a /// non-ASCII byte. /// /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense}, /// HalfMatch, Input, MatchError, /// }; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().unicode_word_boundary(true)) /// .build(r"\b[0-9]+\b")?; /// /// // The match occurs before the search ever observes the snowman /// // character, so no error occurs. 
/// let haystack = "foo 123 ☃".as_bytes(); /// let expected = Some(HalfMatch::must(0, 7)); /// let got = dfa.try_search_fwd(&Input::new(haystack))?; /// assert_eq!(expected, got); /// /// // Notice that this search fails, even though the snowman character /// // occurs after the ending match offset. This is because search /// // routines read one byte past the end of the search to account for /// // look-around, and indeed, this is required here to determine whether /// // the trailing \b matches. /// let haystack = "foo 123 ☃".as_bytes(); /// let expected = MatchError::quit(0xE2, 8); /// let got = dfa.try_search_fwd(&Input::new(haystack)); /// assert_eq!(Err(expected), got); /// /// // Another example is executing a search where the span of the haystack /// // we specify is all ASCII, but there is non-ASCII just before it. This /// // correctly also reports an error. /// let input = Input::new("β123").range(2..); /// let expected = MatchError::quit(0xB2, 1); /// let got = dfa.try_search_fwd(&input); /// assert_eq!(Err(expected), got); /// /// // And similarly for the trailing word boundary. /// let input = Input::new("123β").range(..3); /// let expected = MatchError::quit(0xCE, 3); /// let got = dfa.try_search_fwd(&input); /// assert_eq!(Err(expected), got); /// /// # Ok::<(), Box>(()) /// ``` pub fn unicode_word_boundary(mut self, yes: bool) -> Config { // We have a separate option for this instead of just setting the // appropriate quit bytes here because we don't want to set quit bytes // for every regex. We only want to set them when the regex contains a // Unicode word boundary. self.unicode_word_boundary = Some(yes); self } /// Add a "quit" byte to the DFA. /// /// When a quit byte is seen during search time, then search will return /// a [`MatchError::quit`](crate::MatchError::quit) error indicating the /// offset at which the search stopped. /// /// A quit byte will always overrule any other aspects of a regex. 
For /// example, if the `x` byte is added as a quit byte and the regex `\w` is /// used, then observing `x` will cause the search to quit immediately /// despite the fact that `x` is in the `\w` class. /// /// This mechanism is primarily useful for heuristically enabling certain /// features like Unicode word boundaries in a DFA. Namely, if the input /// to search is ASCII, then a Unicode word boundary can be implemented /// via an ASCII word boundary with no change in semantics. Thus, a DFA /// can attempt to match a Unicode word boundary but give up as soon as it /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes /// to be quit bytes, then Unicode word boundaries will be permitted when /// building DFAs. Of course, callers should enable /// [`Config::unicode_word_boundary`] if they want this behavior instead. /// (The advantage being that non-ASCII quit bytes will only be added if a /// Unicode word boundary is in the pattern.) /// /// When enabling this option, callers _must_ be prepared to handle a /// [`MatchError`](crate::MatchError) error during search. When using a /// [`Regex`](crate::dfa::regex::Regex), this corresponds to using the /// `try_` suite of methods. /// /// By default, there are no quit bytes set. /// /// # Panics /// /// This panics if heuristic Unicode word boundaries are enabled and any /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling /// Unicode word boundaries requires setting every non-ASCII byte to a quit /// byte. So if the caller attempts to undo any of that, then this will /// panic. /// /// # Example /// /// This example shows how to cause a search to terminate if it sees a /// `\n` byte. This could be useful if, for example, you wanted to prevent /// a user supplied pattern from matching across a line boundary. 
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::{Automaton, dense}, Input, MatchError};
///
/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new().quit(b'\n', true))
///     .build(r"foo\p{any}+bar")?;
///
/// let haystack = "foo\nbar".as_bytes();
/// // Normally this would produce a match, since \p{any} contains '\n'.
/// // But since we instructed the automaton to enter a quit state if a
/// // '\n' is observed, this produces a match error instead.
/// let expected = MatchError::quit(b'\n', 3);
/// let got = dfa.try_search_fwd(&Input::new(haystack)).unwrap_err();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn quit(mut self, byte: u8, yes: bool) -> Config {
    // Heuristic Unicode word boundaries rely on every non-ASCII byte being
    // a quit byte, so refuse to let the caller undo that.
    if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes {
        panic!(
            "cannot set non-ASCII byte to be non-quit when \
             Unicode word boundaries are enabled"
        );
    }
    // Lazily create the set on first use, then update it in place. This
    // avoids the separate `is_none` check and the `unwrap` calls.
    let quitset = self.quitset.get_or_insert_with(ByteSet::empty);
    if yes {
        quitset.add(byte);
    } else {
        quitset.remove(byte);
    }
    self
}

/// Enable specializing start states in the DFA.
///
/// When start states are specialized, an implementor of a search routine
/// using a lazy DFA can tell when the search has entered a starting state.
/// When start states aren't specialized, then it is impossible to know
/// whether the search has entered a start state.
///
/// Ideally, this option wouldn't need to exist and we could always
/// specialize start states. The problem is that start states can be quite
/// active. This in turn means that an efficient search routine is likely
/// to ping-pong between a heavily optimized hot loop that handles most
/// states and to a less optimized specialized handling of start states.
/// This causes branches to get heavily mispredicted and overall can
/// materially decrease throughput. Therefore, specializing start states
/// should only be enabled when it is needed.
/// /// Knowing whether a search is in a start state is typically useful when a /// prefilter is active for the search. A prefilter is typically only run /// when in a start state and a prefilter can greatly accelerate a search. /// Therefore, the possible cost of specializing start states is worth it /// in this case. Otherwise, if you have no prefilter, there is likely no /// reason to specialize start states. /// /// This is disabled by default, but note that it is automatically /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless /// `specialize_start_states` has already been set, [`Config::prefilter`] /// will automatically enable or disable it based on whether a prefilter /// is present or not, respectively. This is done because a prefilter's /// effectiveness is rooted in being executed whenever the DFA is in a /// start state, and that's only possible to do when they are specialized. /// /// Note that it is plausibly reasonable to _disable_ this option /// explicitly while _enabling_ a prefilter. In that case, a prefilter /// will still be run at the beginning of a search, but never again. This /// in theory could strike a good balance if you're in a situation where a /// prefilter is likely to produce many false positive candidates. /// /// # Example /// /// This example shows how to enable start state specialization and then /// shows how to check whether a state is a start state or not. /// /// ``` /// use regex_automata::{dfa::{Automaton, dense::DFA}, Input}; /// /// let dfa = DFA::builder() /// .configure(DFA::config().specialize_start_states(true)) /// .build(r"[a-z]+")?; /// /// let haystack = "123 foobar 4567".as_bytes(); /// let sid = dfa.start_state_forward(&Input::new(haystack))?; /// // The ID returned by 'start_state_forward' will always be tagged as /// // a start state when start state specialization is enabled. 
/// assert!(dfa.is_special_state(sid));
/// assert!(dfa.is_start_state(sid));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Compare the above with the default DFA configuration where start states
/// are _not_ specialized. In this case, the start state is not tagged at
/// all:
///
/// ```
/// use regex_automata::{dfa::{Automaton, dense::DFA}, Input};
///
/// let dfa = DFA::new(r"[a-z]+")?;
///
/// let haystack = "123 foobar 4567";
/// let sid = dfa.start_state_forward(&Input::new(haystack))?;
/// // Start states are not special in the default configuration!
/// assert!(!dfa.is_special_state(sid));
/// assert!(!dfa.is_start_state(sid));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn specialize_start_states(self, yes: bool) -> Config {
    // Rebuild the configuration with only this knob replaced.
    Config { specialize_start_states: Some(yes), ..self }
}

/// Set a size limit on the total heap used by a DFA.
///
/// This size limit is expressed in bytes and is applied during
/// determinization of an NFA into a DFA. If the DFA's heap usage, and only
/// the DFA, exceeds this configured limit, then determinization is stopped
/// and an error is returned.
///
/// This limit does not apply to auxiliary storage used during
/// determinization that isn't part of the generated DFA.
///
/// This limit is only applied during determinization. Currently, there is
/// no way to post-pone this check to after minimization if minimization
/// was enabled.
///
/// The total limit on heap used during determinization is the sum of the
/// DFA and determinization size limits.
///
/// The default is no limit.
///
/// # Example
///
/// This example shows a DFA that fails to build because of a configured
/// size limit. This particular example also serves as a cautionary tale
/// demonstrating just how big DFAs with large Unicode character classes
/// can get.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::{dense, Automaton}, Input};
///
/// // 6MB isn't enough!
/// dense::Builder::new()
///     .configure(dense::Config::new().dfa_size_limit(Some(6_000_000)))
///     .build(r"\w{20}")
///     .unwrap_err();
///
/// // ... but 7MB probably is!
/// // (Note that DFA sizes aren't necessarily stable between releases.)
/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new().dfa_size_limit(Some(7_000_000)))
///     .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
/// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// While one needs a little more than 6MB to represent `\w{20}`, it
/// turns out that you only need a little more than 6KB to represent
/// `(?-u:\w{20})`. So only use Unicode if you need it!
///
/// As with [`Config::determinize_size_limit`], the size of a DFA is
/// influenced by other factors, such as what start state configurations
/// to support. For example, if you only need unanchored searches and not
/// anchored searches, then configuring the DFA to only support unanchored
/// searches can reduce its size. By default, DFAs support both unanchored
/// and anchored searches.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::{dense, Automaton, StartKind}, Input};
///
/// // 3MB isn't enough!
/// dense::Builder::new()
///     .configure(dense::Config::new()
///         .dfa_size_limit(Some(3_000_000))
///         .start_kind(StartKind::Unanchored)
///     )
///     .build(r"\w{20}")
///     .unwrap_err();
///
/// // ... but 4MB probably is!
/// // (Note that DFA sizes aren't necessarily stable between releases.)
/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new()
///         .dfa_size_limit(Some(4_000_000))
///         .start_kind(StartKind::Unanchored)
///     )
///     .build(r"\w{20}")?;
/// let haystack = "A".repeat(20).into_bytes();
/// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn dfa_size_limit(mut self, bytes: Option<usize>) -> Config {
    // The outer `Some` records that the limit was explicitly configured;
    // `bytes` itself may still be `None`, meaning "no limit".
    self.dfa_size_limit = Some(bytes);
    self
}

/// Set a size limit on the total heap used by determinization.
///
/// This size limit is expressed in bytes and is applied during
/// determinization of an NFA into a DFA. If the heap used for auxiliary
/// storage during determinization (memory that is not in the DFA but
/// necessary for building the DFA) exceeds this configured limit, then
/// determinization is stopped and an error is returned.
///
/// This limit does not apply to heap used by the DFA itself.
///
/// The total limit on heap used during determinization is the sum of the
/// DFA and determinization size limits.
///
/// The default is no limit.
///
/// # Example
///
/// This example shows a DFA that fails to build because of a
/// configured size limit on the amount of heap space used by
/// determinization. This particular example complements the example for
/// [`Config::dfa_size_limit`] by demonstrating that not only does Unicode
/// potentially make DFAs themselves big, but it also results in more
/// auxiliary storage during determinization. (Although, auxiliary storage
/// is still not as much as the DFA itself.)
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
/// use regex_automata::{dfa::{dense, Automaton}, Input};
///
/// // 700KB isn't enough!
/// dense::Builder::new()
///     .configure(dense::Config::new()
///         .determinize_size_limit(Some(700_000))
///     )
///     .build(r"\w{20}")
///     .unwrap_err();
///
/// // ... but 800KB probably is!
/// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() /// .determinize_size_limit(Some(800_000)) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); /// assert!(dfa.try_search_fwd(&Input::new(&haystack))?.is_some()); /// /// # Ok::<(), Box>(()) /// ``` /// /// Note that some parts of the configuration on a DFA can have a /// big impact on how big the DFA is, and thus, how much memory is /// used. For example, the default setting for [`Config::start_kind`] is /// [`StartKind::Both`]. But if you only need an anchored search, for /// example, then it can be much cheaper to build a DFA that only supports /// anchored searches. (Running an unanchored search with it would panic.) /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039 /// use regex_automata::{ /// dfa::{dense, Automaton, StartKind}, /// Anchored, Input, /// }; /// /// // 200KB isn't enough! /// dense::Builder::new() /// .configure(dense::Config::new() /// .determinize_size_limit(Some(200_000)) /// .start_kind(StartKind::Anchored) /// ) /// .build(r"\w{20}") /// .unwrap_err(); /// /// // ... but 300KB probably is! /// // (Note that auxiliary storage sizes aren't necessarily stable between /// // releases.) /// let dfa = dense::Builder::new() /// .configure(dense::Config::new() /// .determinize_size_limit(Some(300_000)) /// .start_kind(StartKind::Anchored) /// ) /// .build(r"\w{20}")?; /// let haystack = "A".repeat(20).into_bytes(); /// let input = Input::new(&haystack).anchored(Anchored::Yes); /// assert!(dfa.try_search_fwd(&input)?.is_some()); /// /// # Ok::<(), Box>(()) /// ``` pub fn determinize_size_limit(mut self, bytes: Option) -> Config { self.determinize_size_limit = Some(bytes); self } /// Returns whether this configuration has enabled simple state /// acceleration. 
pub fn get_accelerate(&self) -> bool { self.accelerate.unwrap_or(true) } /// Returns the prefilter attached to this configuration, if any. pub fn get_prefilter(&self) -> Option<&Prefilter> { self.pre.as_ref().unwrap_or(&None).as_ref() } /// Returns whether this configuration has enabled the expensive process /// of minimizing a DFA. pub fn get_minimize(&self) -> bool { self.minimize.unwrap_or(false) } /// Returns the match semantics set in this configuration. pub fn get_match_kind(&self) -> MatchKind { self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } /// Returns the starting state configuration for a DFA. pub fn get_starts(&self) -> StartKind { self.start_kind.unwrap_or(StartKind::Both) } /// Returns whether this configuration has enabled anchored starting states /// for every pattern in the DFA. pub fn get_starts_for_each_pattern(&self) -> bool { self.starts_for_each_pattern.unwrap_or(false) } /// Returns whether this configuration has enabled byte classes or not. /// This is typically a debugging oriented option, as disabling it confers /// no speed benefit. pub fn get_byte_classes(&self) -> bool { self.byte_classes.unwrap_or(true) } /// Returns whether this configuration has enabled heuristic Unicode word /// boundary support. When enabled, it is possible for a search to return /// an error. pub fn get_unicode_word_boundary(&self) -> bool { self.unicode_word_boundary.unwrap_or(false) } /// Returns whether this configuration will instruct the DFA to enter a /// quit state whenever the given byte is seen during a search. When at /// least one byte has this enabled, it is possible for a search to return /// an error. pub fn get_quit(&self, byte: u8) -> bool { self.quitset.map_or(false, |q| q.contains(byte)) } /// Returns whether this configuration will instruct the DFA to /// "specialize" start states. 
When enabled, the DFA will mark start states /// as "special" so that search routines using the DFA can detect when /// it's in a start state and do some kind of optimization (like run a /// prefilter). pub fn get_specialize_start_states(&self) -> bool { self.specialize_start_states.unwrap_or(false) } /// Returns the DFA size limit of this configuration if one was set. /// The size limit is total number of bytes on the heap that a DFA is /// permitted to use. If the DFA exceeds this limit during construction, /// then construction is stopped and an error is returned. pub fn get_dfa_size_limit(&self) -> Option { self.dfa_size_limit.unwrap_or(None) } /// Returns the determinization size limit of this configuration if one /// was set. The size limit is total number of bytes on the heap that /// determinization is permitted to use. If determinization exceeds this /// limit during construction, then construction is stopped and an error is /// returned. /// /// This is different from the DFA size limit in that this only applies to /// the auxiliary storage used during determinization. Once determinization /// is complete, this memory is freed. /// /// The limit on the total heap memory used is the sum of the DFA and /// determinization size limits. pub fn get_determinize_size_limit(&self) -> Option { self.determinize_size_limit.unwrap_or(None) } /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. 
pub(crate) fn overwrite(&self, o: Config) -> Config {
        // Take `o` apart up front so every field below reads the same way:
        // "the override if present, otherwise whatever `self` already has".
        let Config {
            accelerate,
            pre,
            minimize,
            match_kind,
            start_kind,
            starts_for_each_pattern,
            byte_classes,
            unicode_word_boundary,
            quitset,
            specialize_start_states,
            dfa_size_limit,
            determinize_size_limit,
        } = o;
        Config {
            accelerate: accelerate.or(self.accelerate),
            // The prefilter is the only field that is cloned, and only when
            // `o` does not supply one.
            pre: pre.or_else(|| self.pre.clone()),
            minimize: minimize.or(self.minimize),
            match_kind: match_kind.or(self.match_kind),
            start_kind: start_kind.or(self.start_kind),
            starts_for_each_pattern: starts_for_each_pattern
                .or(self.starts_for_each_pattern),
            byte_classes: byte_classes.or(self.byte_classes),
            unicode_word_boundary: unicode_word_boundary
                .or(self.unicode_word_boundary),
            quitset: quitset.or(self.quitset),
            specialize_start_states: specialize_start_states
                .or(self.specialize_start_states),
            dfa_size_limit: dfa_size_limit.or(self.dfa_size_limit),
            determinize_size_limit: determinize_size_limit
                .or(self.determinize_size_limit),
        }
    }
}

/// A builder for constructing a deterministic finite automaton from regular
/// expressions.
///
/// This builder provides two main things:
///
/// 1. It provides a few different `build` routines for actually constructing
/// a DFA from different kinds of inputs. The most convenient is
/// [`Builder::build`], which builds a DFA directly from a pattern string. The
/// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight
/// from an NFA.
/// 2. The builder permits configuring a number of things.
/// [`Builder::configure`] is used with [`Config`] to configure aspects of
/// the DFA and the construction process itself. [`Builder::syntax`] and
/// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA
/// construction, respectively. The syntax and thompson configurations only
/// apply when building from a pattern string.
///
/// This builder always constructs a *single* DFA. As such, this builder
/// can only be used to construct regexes that either detect the presence
/// of a match or find the end location of a match. A single DFA cannot
/// produce both the start and end of a match.
/// For that information, use a
/// [`Regex`](crate::dfa::regex::Regex), which can be similarly configured
/// using [`regex::Builder`](crate::dfa::regex::Builder). The main reason to
/// use a DFA directly is if the end location of a match is enough for your use
/// case. Namely, a `Regex` will construct two DFAs instead of one, since a
/// second reverse DFA is needed to find the start of a match.
///
/// Note that if one wants to build a sparse DFA, you must first build a dense
/// DFA and convert that to a sparse DFA. There is no way to build a sparse
/// DFA without first building a dense DFA.
///
/// # Example
///
/// This example shows how to build a minimized DFA that completely disables
/// Unicode. That is:
///
/// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w`
///   and `\b` are ASCII-only while `.` matches any byte except for `\n`
///   (instead of any UTF-8 encoding of a Unicode scalar value except for
///   `\n`). Things that are Unicode only, such as `\pL`, are not allowed.
/// * The pattern itself is permitted to match invalid UTF-8. For example,
///   things like `[^a]` that match any byte except for `a` are permitted.
///
/// ```
/// use regex_automata::{
///     dfa::{Automaton, dense},
///     util::syntax,
///     HalfMatch, Input,
/// };
///
/// let dfa = dense::Builder::new()
///     .configure(dense::Config::new().minimize(false))
///     .syntax(syntax::Config::new().unicode(false).utf8(false))
///     .build(r"foo[^b]ar.*")?;
///
/// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n";
/// let expected = Some(HalfMatch::must(0, 10));
/// let got = dfa.try_search_fwd(&Input::new(haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug)]
pub struct Builder {
    // Dense DFA options; `Builder::configure` layers new values onto this.
    config: Config,
    // Thompson NFA compiler used when building from pattern strings; only
    // present when the `syntax` feature is enabled.
    #[cfg(feature = "syntax")]
    thompson: thompson::Compiler,
}

#[cfg(feature = "dfa-build")]
impl Builder {
    /// Create a new dense DFA builder with the default configuration.
pub fn new() -> Builder {
        Builder {
            config: Config::default(),
            #[cfg(feature = "syntax")]
            thompson: thompson::Compiler::new(),
        }
    }

    /// Build a DFA from the given pattern.
    ///
    /// If there was a problem parsing or compiling the pattern, then an error
    /// is returned.
    #[cfg(feature = "syntax")]
    pub fn build(&self, pattern: &str) -> Result<OwnedDFA, BuildError> {
        // A single pattern is just the one-element case of `build_many`.
        self.build_many(&[pattern])
    }

    /// Build a DFA from the given patterns.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// If there was a problem parsing or compiling any of the patterns, then
    /// an error is returned.
    #[cfg(feature = "syntax")]
    pub fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<OwnedDFA, BuildError> {
        let nfa = self
            .thompson
            .clone()
            // We can always forcefully disable captures because DFAs do not
            // support them.
            .configure(
                thompson::Config::new()
                    .which_captures(thompson::WhichCaptures::None),
            )
            .build_many(patterns)
            .map_err(BuildError::nfa)?;
        self.build_from_nfa(&nfa)
    }

    /// Build a DFA from the given NFA.
    ///
    /// # Example
    ///
    /// This example shows how to build a DFA if you already have an NFA in
    /// hand.
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{Automaton, dense},
    ///     nfa::thompson::NFA,
    ///     HalfMatch, Input,
    /// };
    ///
    /// let haystack = "foo123bar".as_bytes();
    ///
    /// // This shows how to set non-default options for building an NFA.
/// let nfa = NFA::compiler()
    ///     .configure(NFA::config().shrink(true))
    ///     .build(r"[0-9]+")?;
    /// let dfa = dense::Builder::new().build_from_nfa(&nfa)?;
    /// let expected = Some(HalfMatch::must(0, 6));
    /// let got = dfa.try_search_fwd(&Input::new(haystack))?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_from_nfa(
        &self,
        nfa: &thompson::NFA,
    ) -> Result<OwnedDFA, BuildError> {
        let mut quitset = self.config.quitset.unwrap_or(ByteSet::empty());
        // With heuristic Unicode word boundary support enabled, and an NFA
        // that actually contains a Unicode word boundary, every non-ASCII
        // byte becomes a quit byte.
        if self.config.get_unicode_word_boundary()
            && nfa.look_set_any().contains_word_unicode()
        {
            for b in 0x80..=0xFF {
                quitset.add(b);
            }
        }
        let classes = if !self.config.get_byte_classes() {
            // DFAs will always use the equivalence class map, but enabling
            // this option is useful for debugging. Namely, this will cause all
            // transitions to be defined over their actual bytes instead of an
            // opaque equivalence class identifier. The former is much easier
            // to grok as a human.
            ByteClasses::singletons()
        } else {
            let mut set = nfa.byte_class_set().clone();
            // It is important to distinguish any "quit" bytes from all other
            // bytes. Otherwise, a non-quit byte may end up in the same
            // class as a quit byte, and thus cause the DFA to stop when it
            // shouldn't.
            //
            // Test case:
            //
            //   regex-cli find match dense --unicode-word-boundary \
            //     -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log
            if !quitset.is_empty() {
                set.add_set(&quitset);
            }
            set.byte_classes()
        };

        let mut dfa = DFA::initial(
            classes,
            nfa.pattern_len(),
            self.config.get_starts(),
            nfa.look_matcher(),
            self.config.get_starts_for_each_pattern(),
            self.config.get_prefilter().cloned(),
            quitset,
            Flags::from_nfa(nfa),
        )?;
        determinize::Config::new()
            .match_kind(self.config.get_match_kind())
            .quit(quitset)
            .dfa_size_limit(self.config.get_dfa_size_limit())
            .determinize_size_limit(self.config.get_determinize_size_limit())
            .run(nfa, &mut dfa)?;
        if self.config.get_minimize() {
            dfa.minimize();
        }
        if self.config.get_accelerate() {
            dfa.accelerate();
        }
        // The state shuffling done before this point always assumes that start
        // states should be marked as "special," even though it isn't the
        // default configuration. State shuffling is complex enough as it is,
        // so it's simpler to just "fix" our special state ID ranges to not
        // include starting states after-the-fact.
        if !self.config.get_specialize_start_states() {
            dfa.special.set_no_special_start_states();
        }
        // Look for and set the universal starting states.
        dfa.set_universal_starts();
        Ok(dfa)
    }

    /// Apply the given dense DFA configuration options to this builder.
    pub fn configure(&mut self, config: Config) -> &mut Builder {
        // Options set in `config` win; anything it leaves unset keeps the
        // value this builder already had.
        self.config = self.config.overwrite(config);
        self
    }

    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This permits setting things like case insensitivity, Unicode and multi
    /// line mode.
    ///
    /// These settings only apply when constructing a DFA directly from a
    /// pattern.
#[cfg(feature = "syntax")] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). /// /// This permits setting things like whether the DFA should match the regex /// in reverse or if additional time should be spent shrinking the size of /// the NFA. /// /// These settings only apply when constructing a DFA directly from a /// pattern. #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } #[cfg(feature = "dfa-build")] impl Default for Builder { fn default() -> Builder { Builder::new() } } /// A convenience alias for an owned DFA. We use this particular instantiation /// a lot in this crate, so it's worth giving it a name. This instantiation /// is commonly used for mutable APIs on the DFA while building it. The main /// reason for making DFAs generic is no_std support, and more generally, /// making it possible to load a DFA from an arbitrary slice of bytes. #[cfg(feature = "alloc")] pub(crate) type OwnedDFA = DFA>; /// A dense table-based deterministic finite automaton (DFA). /// /// All dense DFAs have one or more start states, zero or more match states /// and a transition table that maps the current state and the current byte /// of input to the next state. A DFA can use this information to implement /// fast searching. In particular, the use of a dense DFA generally makes the /// trade off that match speed is the most valuable characteristic, even if /// building the DFA may take significant time *and* space. (More concretely, /// building a DFA takes time and space that is exponential in the size of the /// pattern in the worst case.) 
As such, the processing of every byte of input /// is done with a small constant number of operations that does not vary with /// the pattern, its size or the size of the alphabet. If your needs don't line /// up with this trade off, then a dense DFA may not be an adequate solution to /// your problem. /// /// In contrast, a [`sparse::DFA`] makes the opposite /// trade off: it uses less space but will execute a variable number of /// instructions per byte at match time, which makes it slower for matching. /// (Note that space usage is still exponential in the size of the pattern in /// the worst case.) /// /// A DFA can be built using the default configuration via the /// [`DFA::new`] constructor. Otherwise, one can /// configure various aspects via [`dense::Builder`](Builder). /// /// A single DFA fundamentally supports the following operations: /// /// 1. Detection of a match. /// 2. Location of the end of a match. /// 3. In the case of a DFA with multiple patterns, which pattern matched is /// reported as well. /// /// A notable absence from the above list of capabilities is the location of /// the *start* of a match. In order to provide both the start and end of /// a match, *two* DFAs are required. This functionality is provided by a /// [`Regex`](crate::dfa::regex::Regex). /// /// # Type parameters /// /// A `DFA` has one type parameter, `T`, which is used to represent state IDs, /// pattern IDs and accelerators. `T` is typically a `Vec` or a `&[u32]`. /// /// # The `Automaton` trait /// /// This type implements the [`Automaton`] trait, which means it can be used /// for searching. For example: /// /// ``` /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; /// let expected = HalfMatch::must(0, 8); /// assert_eq!(Some(expected), dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` #[derive(Clone)] pub struct DFA { /// The transition table for this DFA. 
This includes the transitions /// themselves, along with the stride, number of states and the equivalence /// class mapping. tt: TransitionTable, /// The set of starting state identifiers for this DFA. The starting state /// IDs act as pointers into the transition table. The specific starting /// state chosen for each search is dependent on the context at which the /// search begins. st: StartTable, /// The set of match states and the patterns that match for each /// corresponding match state. /// /// This structure is technically only needed because of support for /// multi-regexes. Namely, multi-regexes require answering not just whether /// a match exists, but _which_ patterns match. So we need to store the /// matching pattern IDs for each match state. We do this even when there /// is only one pattern for the sake of simplicity. In practice, this uses /// up very little space for the case of one pattern. ms: MatchStates, /// Information about which states are "special." Special states are states /// that are dead, quit, matching, starting or accelerated. For more info, /// see the docs for `Special`. special: Special, /// The accelerators for this DFA. /// /// If a state is accelerated, then there exist only a small number of /// bytes that can cause the DFA to leave the state. This permits searching /// to use optimized routines to find those specific bytes instead of using /// the transition table. /// /// All accelerated states exist in a contiguous range in the DFA's /// transition table. See dfa/special.rs for more details on how states are /// arranged. accels: Accels, /// Any prefilter attached to this DFA. /// /// Note that currently prefilters are not serialized. When deserializing /// a DFA from bytes, this is always set to `None`. pre: Option, /// The set of "quit" bytes for this DFA. /// /// This is only used when computing the start state for a particular /// position in a haystack. 
Namely, in the case where there is a quit /// byte immediately before the start of the search, this set needs to be /// explicitly consulted. In all other cases, quit bytes are detected by /// the DFA itself, by transitioning all quit bytes to a special "quit /// state." quitset: ByteSet, /// Various flags describing the behavior of this DFA. flags: Flags, } #[cfg(feature = "dfa-build")] impl OwnedDFA { /// Parse the given regular expression using a default configuration and /// return the corresponding DFA. /// /// If you want a non-default configuration, then use the /// [`dense::Builder`](Builder) to set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::new("foo[0-9]+bar")?; /// let expected = Some(HalfMatch::must(0, 11)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result { Builder::new().build(pattern) } /// Parse the given regular expressions using a default configuration and /// return the corresponding multi-DFA. /// /// If you want a non-default configuration, then use the /// [`dense::Builder`](Builder) to set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::new_many(&["[0-9]+", "[a-z]+"])?; /// let expected = Some(HalfMatch::must(1, 3)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new_many>( patterns: &[P], ) -> Result { Builder::new().build_many(patterns) } } #[cfg(feature = "dfa-build")] impl OwnedDFA { /// Create a new DFA that matches every input. 
/// /// # Example /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::always_match()?; /// /// let expected = Some(HalfMatch::must(0, 0)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box>(()) /// ``` pub fn always_match() -> Result { let nfa = thompson::NFA::always_match(); Builder::new().build_from_nfa(&nfa) } /// Create a new DFA that never matches any input. /// /// # Example /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, Input}; /// /// let dfa = dense::DFA::never_match()?; /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?); /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box>(()) /// ``` pub fn never_match() -> Result { let nfa = thompson::NFA::never_match(); Builder::new().build_from_nfa(&nfa) } /// Create an initial DFA with the given equivalence classes, pattern /// length and whether anchored starting states are enabled for each /// pattern. An initial DFA can be further mutated via determinization. fn initial( classes: ByteClasses, pattern_len: usize, starts: StartKind, lookm: &LookMatcher, starts_for_each_pattern: bool, pre: Option, quitset: ByteSet, flags: Flags, ) -> Result { let start_pattern_len = if starts_for_each_pattern { Some(pattern_len) } else { None }; Ok(DFA { tt: TransitionTable::minimal(classes), st: StartTable::dead(starts, lookm, start_pattern_len)?, ms: MatchStates::empty(pattern_len), special: Special::new(), accels: Accels::empty(), pre, quitset, flags, }) } } #[cfg(feature = "dfa-build")] impl DFA<&[u32]> { /// Return a new default dense DFA compiler configuration. /// /// This is a convenience routine to avoid needing to import the [`Config`] /// type when customizing the construction of a dense DFA. pub fn config() -> Config { Config::new() } /// Create a new dense DFA builder with the default configuration. 
/// /// This is a convenience routine to avoid needing to import the /// [`Builder`] type in common cases. pub fn builder() -> Builder { Builder::new() } } impl> DFA { /// Cheaply return a borrowed version of this dense DFA. Specifically, /// the DFA returned always uses `&[u32]` for its transition table. pub fn as_ref(&self) -> DFA<&'_ [u32]> { DFA { tt: self.tt.as_ref(), st: self.st.as_ref(), ms: self.ms.as_ref(), special: self.special, accels: self.accels(), pre: self.pre.clone(), quitset: self.quitset, flags: self.flags, } } /// Return an owned version of this sparse DFA. Specifically, the DFA /// returned always uses `Vec` for its transition table. /// /// Effectively, this returns a dense DFA whose transition table lives on /// the heap. #[cfg(feature = "alloc")] pub fn to_owned(&self) -> OwnedDFA { DFA { tt: self.tt.to_owned(), st: self.st.to_owned(), ms: self.ms.to_owned(), special: self.special, accels: self.accels().to_owned(), pre: self.pre.clone(), quitset: self.quitset, flags: self.flags, } } /// Returns the starting state configuration for this DFA. /// /// The default is [`StartKind::Both`], which means the DFA supports both /// unanchored and anchored searches. However, this can generally lead to /// bigger DFAs. Therefore, a DFA might be compiled with support for just /// unanchored or anchored searches. In that case, running a search with /// an unsupported configuration will panic. pub fn start_kind(&self) -> StartKind { self.st.kind } /// Returns the start byte map used for computing the `Start` configuration /// at the beginning of a search. pub(crate) fn start_map(&self) -> &StartByteMap { &self.st.start_map } /// Returns true only if this DFA has starting states for each pattern. /// /// When a DFA has starting states for each pattern, then a search with the /// DFA can be configured to only look for anchored matches of a specific /// pattern. 
/// Specifically, APIs like [`Automaton::try_search_fwd`] can
    /// accept a non-None `pattern_id` if and only if this method returns true.
    /// Otherwise, calling `try_search_fwd` will panic.
    ///
    /// Note that if the DFA has no patterns, this always returns false.
    pub fn starts_for_each_pattern(&self) -> bool {
        self.st.pattern_len.is_some()
    }

    /// Returns the equivalence classes that make up the alphabet for this DFA.
    ///
    /// Unless [`Config::byte_classes`] was disabled, it is possible that
    /// multiple distinct bytes are grouped into the same equivalence class
    /// if it is impossible for them to discriminate between a match and a
    /// non-match. This has the effect of reducing the overall alphabet size
    /// and in turn potentially substantially reducing the size of the DFA's
    /// transition table.
    ///
    /// The downside of using equivalence classes like this is that every state
    /// transition will automatically use this map to convert an arbitrary
    /// byte to its corresponding equivalence class. In practice this has a
    /// negligible impact on performance.
    pub fn byte_classes(&self) -> &ByteClasses {
        &self.tt.classes
    }

    /// Returns the total number of elements in the alphabet for this DFA.
    ///
    /// That is, this returns the total number of transitions that each state
    /// in this DFA must have. Typically, a normal byte oriented DFA would
    /// always have an alphabet size of 256, corresponding to the number of
    /// unique values in a single byte. However, this implementation has two
    /// peculiarities that impact the alphabet length:
    ///
    /// * Every state has a special "EOI" transition that is only followed
    ///   after the end of some haystack is reached. This EOI transition is
    ///   necessary to account for one byte of look-ahead when implementing
    ///   things like `\b` and `$`.
    /// * Bytes are grouped into equivalence classes such that no two bytes in
    ///   the same class can distinguish a match from a non-match. For example,
    ///   in the regex `^[a-z]+$`, the ASCII bytes `a-z` could all be in the
    ///   same equivalence class. This leads to a massive space savings.
    ///
    /// Note though that the alphabet length does _not_ necessarily equal the
    /// total stride space taken up by a single DFA state in the transition
    /// table. Namely, for performance reasons, the stride is always the
    /// smallest power of two that is greater than or equal to the alphabet
    /// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
    /// often more useful. The alphabet length is typically useful only for
    /// informational purposes.
    pub fn alphabet_len(&self) -> usize {
        self.tt.alphabet_len()
    }

    /// Returns the total stride for every state in this DFA, expressed as the
    /// exponent of a power of 2. The stride is the amount of space each state
    /// takes up in the transition table, expressed as a number of transitions.
    /// (Unused transitions map to dead states.)
    ///
    /// The stride of a DFA is always equivalent to the smallest power of 2
    /// that is greater than or equal to the DFA's alphabet length. This
    /// definition uses extra space, but permits faster translation between
    /// premultiplied state identifiers and contiguous indices (by using shifts
    /// instead of relying on integer division).
    ///
    /// For example, if the DFA's stride is 16 transitions, then its `stride2`
    /// is `4` since `2^4 = 16`.
    ///
    /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
    /// while the maximum `stride2` value is `9` (corresponding to a stride of
    /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
    /// when accounting for the special EOI transition. However, an alphabet
    /// length of that size is exceptionally rare since the alphabet is shrunk
    /// into equivalence classes.
    pub fn stride2(&self) -> usize {
        self.tt.stride2
    }

    /// Returns the total stride for every state in this DFA.
This corresponds /// to the total number of transitions used by each state in this DFA's /// transition table. /// /// Please see [`DFA::stride2`] for more information. In particular, this /// returns the stride as the number of transitions, where as `stride2` /// returns it as the exponent of a power of 2. pub fn stride(&self) -> usize { self.tt.stride() } /// Returns the memory usage, in bytes, of this DFA. /// /// The memory usage is computed based on the number of bytes used to /// represent this DFA. /// /// This does **not** include the stack size used up by this DFA. To /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { self.tt.memory_usage() + self.st.memory_usage() + self.ms.memory_usage() + self.accels.memory_usage() } } /// Routines for converting a dense DFA to other representations, such as /// sparse DFAs or raw bytes suitable for persistent storage. impl> DFA { /// Convert this dense DFA to a sparse DFA. /// /// If a `StateID` is too small to represent all states in the sparse /// DFA, then this returns an error. In most cases, if a dense DFA is /// constructable with `StateID` then a sparse DFA will be as well. /// However, it is not guaranteed. /// /// # Example /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dense = dense::DFA::new("foo[0-9]+")?; /// let sparse = dense.to_sparse()?; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, sparse.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "dfa-build")] pub fn to_sparse(&self) -> Result>, BuildError> { sparse::DFA::from_dense(self) } /// Serialize this DFA as raw bytes to a `Vec` in little endian /// format. Upon success, the `Vec` and the initial padding length are /// returned. 
/// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// The padding returned is non-zero if the returned `Vec` starts at /// an address that does not have the same alignment as `u32`. The padding /// corresponds to the number of leading bytes written to the returned /// `Vec`. /// /// # Example /// /// This example shows how to serialize and deserialize a DFA: /// /// ``` /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// // N.B. We use native endianness here to make the example work, but /// // using to_bytes_little_endian would work on a little endian target. /// let (buf, _) = original_dfa.to_bytes_native_endian(); /// // Even if buf has initial padding, DFA::from_bytes will automatically /// // ignore it. /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "dfa-build")] pub fn to_bytes_little_endian(&self) -> (Vec, usize) { self.to_bytes::() } /// Serialize this DFA as raw bytes to a `Vec` in big endian /// format. Upon success, the `Vec` and the initial padding length are /// returned. 
///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// The padding returned is non-zero if the returned `Vec<u8>` starts at
    /// an address that does not have the same alignment as `u32`. The padding
    /// corresponds to the number of leading bytes written to the returned
    /// `Vec<u8>`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // N.B. We use native endianness here to make the example work, but
    /// // using to_bytes_big_endian would work on a big endian target.
    /// let (buf, _) = original_dfa.to_bytes_native_endian();
    /// // Even if buf has initial padding, DFA::from_bytes will automatically
    /// // ignore it.
    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_big_endian(&self) -> (Vec<u8>, usize) {
        self.to_bytes::<wire::BE>()
    }

    /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
    /// format. Upon success, the `Vec<u8>` and the initial padding length are
    /// returned.
///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// The padding returned is non-zero if the returned `Vec<u8>` starts at
    /// an address that does not have the same alignment as `u32`. The padding
    /// corresponds to the number of leading bytes written to the returned
    /// `Vec<u8>`.
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling DFA. For example,
    /// if serialization and deserialization happen in the same process or on
    /// the same machine. Otherwise, when serializing a DFA for use in a
    /// portable environment, you'll almost certainly want to serialize _both_
    /// a little endian and a big endian version and then load the correct one
    /// based on the target's configuration.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// let (buf, _) = original_dfa.to_bytes_native_endian();
    /// // Even if buf has initial padding, DFA::from_bytes will automatically
    /// // ignore it.
    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "dfa-build")]
    pub fn to_bytes_native_endian(&self) -> (Vec<u8>, usize) {
        self.to_bytes::<wire::NE>()
    }

    /// The implementation of the public `to_bytes` serialization methods,
    /// which is generic over endianness.
#[cfg(feature = "dfa-build")] fn to_bytes(&self) -> (Vec, usize) { let len = self.write_to_len(); let (mut buf, padding) = wire::alloc_aligned_buffer::(len); // This should always succeed since the only possible serialization // error is providing a buffer that's too small, but we've ensured that // `buf` is big enough here. self.as_ref().write_to::(&mut buf[padding..]).unwrap(); (buf, padding) } /// Serialize this DFA as raw bytes to the given slice, in little endian /// format. Upon success, the total number of bytes written to `dst` is /// returned. /// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// Note that unlike the various `to_byte_*` routines, this does not write /// any padding. Callers are responsible for handling alignment correctly. /// /// # Errors /// /// This returns an error if the given destination slice is not big enough /// to contain the full serialized DFA. If an error occurs, then nothing /// is written to `dst`. /// /// # Example /// /// This example shows how to serialize and deserialize a DFA without /// dynamic memory allocation. /// /// ``` /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// // Create a 4KB buffer on the stack to store our serialized DFA. We /// // need to use a special type to force the alignment of our [u8; N] /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing /// // the DFA may fail because of an alignment mismatch. /// #[repr(C)] /// struct Aligned { /// _align: [u32; 0], /// bytes: B, /// } /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] }; /// // N.B. 
We use native endianness here to make the example work, but /// // using write_to_little_endian would work on a little endian target. /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?; /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_little_endian( &self, dst: &mut [u8], ) -> Result { self.as_ref().write_to::(dst) } /// Serialize this DFA as raw bytes to the given slice, in big endian /// format. Upon success, the total number of bytes written to `dst` is /// returned. /// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// Note that unlike the various `to_byte_*` routines, this does not write /// any padding. Callers are responsible for handling alignment correctly. /// /// # Errors /// /// This returns an error if the given destination slice is not big enough /// to contain the full serialized DFA. If an error occurs, then nothing /// is written to `dst`. /// /// # Example /// /// This example shows how to serialize and deserialize a DFA without /// dynamic memory allocation. /// /// ``` /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// // Create a 4KB buffer on the stack to store our serialized DFA. We /// // need to use a special type to force the alignment of our [u8; N] /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing /// // the DFA may fail because of an alignment mismatch. 
/// #[repr(C)]
    /// struct Aligned<B: ?Sized> {
    ///     _align: [u32; 0],
    ///     bytes: B,
    /// }
    /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] };
    /// // N.B. We use native endianness here to make the example work, but
    /// // using write_to_big_endian would work on a big endian target.
    /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_big_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        self.as_ref().write_to::<wire::BE>(dst)
    }

    /// Serialize this DFA as raw bytes to the given slice, in native endian
    /// format. Upon success, the total number of bytes written to `dst` is
    /// returned.
    ///
    /// The written bytes are guaranteed to be deserialized correctly and
    /// without errors in a semver compatible release of this crate by a
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
    /// deserialization APIs have been satisfied):
    ///
    /// * [`DFA::from_bytes`]
    /// * [`DFA::from_bytes_unchecked`]
    ///
    /// Generally speaking, native endian format should only be used when
    /// you know that the target you're compiling the DFA for matches the
    /// endianness of the target on which you're compiling DFA. For example,
    /// if serialization and deserialization happen in the same process or on
    /// the same machine. Otherwise, when serializing a DFA for use in a
    /// portable environment, you'll almost certainly want to serialize _both_
    /// a little endian and a big endian version and then load the correct one
    /// based on the target's configuration.
    ///
    /// Note that unlike the various `to_bytes_*` routines, this does not write
    /// any padding. Callers are responsible for handling alignment correctly.
///
    /// # Errors
    ///
    /// This returns an error if the given destination slice is not big enough
    /// to contain the full serialized DFA. If an error occurs, then nothing
    /// is written to `dst`.
    ///
    /// # Example
    ///
    /// This example shows how to serialize and deserialize a DFA without
    /// dynamic memory allocation.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
    ///
    /// // Compile our original DFA.
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Create a 4KB buffer on the stack to store our serialized DFA. We
    /// // need to use a special type to force the alignment of our [u8; N]
    /// // array to be aligned to a 4 byte boundary. Otherwise, deserializing
    /// // the DFA may fail because of an alignment mismatch.
    /// #[repr(C)]
    /// struct Aligned<B: ?Sized> {
    ///     _align: [u32; 0],
    ///     bytes: B,
    /// }
    /// let mut buf = Aligned { _align: [], bytes: [0u8; 4 * (1<<10)] };
    /// let written = original_dfa.write_to_native_endian(&mut buf.bytes)?;
    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&buf.bytes[..written])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn write_to_native_endian(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        self.as_ref().write_to::<wire::NE>(dst)
    }

    /// Return the total number of bytes required to serialize this DFA.
    ///
    /// This is useful for determining the size of the buffer required to pass
    /// to one of the serialization routines:
    ///
    /// * [`DFA::write_to_little_endian`]
    /// * [`DFA::write_to_big_endian`]
    /// * [`DFA::write_to_native_endian`]
    ///
    /// Passing a buffer smaller than the size returned by this method will
    /// result in a serialization error. Serialization routines are guaranteed
    /// to succeed when the buffer is big enough.
    ///
    /// # Example
    ///
    /// This example shows how to dynamically allocate enough room to serialize
    /// a DFA.
///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
    ///
    /// let original_dfa = DFA::new("foo[0-9]+")?;
    ///
    /// let mut buf = vec![0; original_dfa.write_to_len()];
    /// // This is guaranteed to succeed, because the only serialization error
    /// // that can occur is when the provided buffer is too small. But
    /// // write_to_len guarantees a correct size.
    /// let written = original_dfa.write_to_native_endian(&mut buf).unwrap();
    /// // But this is not guaranteed to succeed! In particular,
    /// // deserialization requires proper alignment for &[u32], but our buffer
    /// // was allocated as a &[u8] whose required alignment is smaller than
    /// // &[u32]. However, it's likely to work in practice because of how most
    /// // allocators work. So if you write code like this, make sure to either
    /// // handle the error correctly and/or run it under Miri since Miri will
    /// // likely provoke the error by returning Vec<u8> buffers with alignment
    /// // less than &[u32].
    /// let dfa: DFA<&[u32]> = match DFA::from_bytes(&buf[..written]) {
    ///     // As mentioned above, it is legal for an error to be returned
    ///     // here. It is quite difficult to get a Vec<u8> with a guaranteed
    ///     // alignment equivalent to Vec<u32>.
    ///     Err(_) => return Ok(()),
    ///     Ok((dfa, _)) => dfa,
    /// };
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Note that this example isn't actually guaranteed to work! In
    /// particular, if `buf` is not aligned to a 4-byte boundary, then the
    /// `DFA::from_bytes` call will fail. If you need this to work, then you
    /// either need to deal with adding some initial padding yourself, or use
    /// one of the `to_bytes` methods, which will do it for you.
pub fn write_to_len(&self) -> usize { wire::write_label_len(LABEL) + wire::write_endianness_check_len() + wire::write_version_len() + size_of::() // unused, intended for future flexibility + self.flags.write_to_len() + self.tt.write_to_len() + self.st.write_to_len() + self.ms.write_to_len() + self.special.write_to_len() + self.accels.write_to_len() + self.quitset.write_to_len() } } impl<'a> DFA<&'a [u32]> { /// Safely deserialize a DFA with a specific state identifier /// representation. Upon success, this returns both the deserialized DFA /// and the number of bytes read from the given slice. Namely, the contents /// of the slice beyond the DFA are not read. /// /// Deserializing a DFA using this routine will never allocate heap memory. /// For safety purposes, the DFA's transition table will be verified such /// that every transition points to a valid state. If this verification is /// too costly, then a [`DFA::from_bytes_unchecked`] API is provided, which /// will always execute in constant time. /// /// The bytes given must be generated by one of the serialization APIs /// of a `DFA` using a semver compatible release of this crate. Those /// include: /// /// * [`DFA::to_bytes_little_endian`] /// * [`DFA::to_bytes_big_endian`] /// * [`DFA::to_bytes_native_endian`] /// * [`DFA::write_to_little_endian`] /// * [`DFA::write_to_big_endian`] /// * [`DFA::write_to_native_endian`] /// /// The `to_bytes` methods allocate and return a `Vec` for you, along /// with handling alignment correctly. The `write_to` methods do not /// allocate and write to an existing slice (which may be on the stack). /// Since deserialization always uses the native endianness of the target /// platform, the serialization API you use should match the endianness of /// the target platform. (It's often a good idea to generate serialized /// DFAs for both forms of endianness and then load the correct one based /// on endianness.) 
///
    /// # Errors
    ///
    /// Generally speaking, it's easier to state the conditions in which an
    /// error is _not_ returned. All of the following must be true:
    ///
    /// * The bytes given must be produced by one of the serialization APIs
    ///   on this DFA, as mentioned above.
    /// * The endianness of the target platform matches the endianness used to
    ///   serialize the provided DFA.
    /// * The slice given must have the same alignment as `u32`.
    ///
    /// If any of the above are not true, then an error will be returned.
    ///
    /// # Panics
    ///
    /// This routine will never panic for any input.
    ///
    /// # Example
    ///
    /// This example shows how to serialize a DFA to raw bytes, deserialize it
    /// and then use it for searching.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// let (bytes, _) = initial.to_bytes_native_endian();
    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: dealing with alignment and padding
    ///
    /// In the above example, we used the `to_bytes_native_endian` method to
    /// serialize a DFA, but we ignored part of its return value corresponding
    /// to padding added to the beginning of the serialized DFA. This is OK
    /// because deserialization will skip this initial padding. What matters
    /// is that the address immediately following the padding has an alignment
    /// that matches `u32`. That is, the following is an equivalent but
    /// alternative way to write the above example:
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// // Serialization returns the number of leading padding bytes added to
    /// // the returned Vec<u8>.
/// let (bytes, pad) = initial.to_bytes_native_endian();
    /// let dfa: DFA<&[u32]> = DFA::from_bytes(&bytes[pad..])?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// This padding is necessary because Rust's standard library does
    /// not expose any safe and robust way of creating a `Vec<u8>` with a
    /// guaranteed alignment other than 1. Now, in practice, the underlying
    /// allocator is likely to provide a `Vec<u8>` that meets our alignment
    /// requirements, which means `pad` is zero in practice most of the time.
    ///
    /// The purpose of exposing the padding like this is flexibility for the
    /// caller. For example, if one wants to embed a serialized DFA into a
    /// compiled program, then it's important to guarantee that it starts at a
    /// `u32`-aligned address. The simplest way to do this is to discard the
    /// padding bytes and set it up so that the serialized DFA itself begins at
    /// a properly aligned address. We can show this in two parts. The first
    /// part is serializing the DFA to a file:
    ///
    /// ```no_run
    /// use regex_automata::dfa::dense::DFA;
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    ///
    /// let (bytes, pad) = dfa.to_bytes_big_endian();
    /// // Write the contents of the DFA *without* the initial padding.
    /// std::fs::write("foo.bigendian.dfa", &bytes[pad..])?;
    ///
    /// // Do it again, but this time for little endian.
    /// let (bytes, pad) = dfa.to_bytes_little_endian();
    /// std::fs::write("foo.littleendian.dfa", &bytes[pad..])?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And now the second part is embedding the DFA into the compiled program
    /// and deserializing it at runtime on first use. We use conditional
    /// compilation to choose the correct endianness.
///
    /// ```no_run
    /// use regex_automata::{
    ///     dfa::{Automaton, dense::DFA},
    ///     util::{lazy::Lazy, wire::AlignAs},
    ///     HalfMatch, Input,
    /// };
    ///
    /// // This crate provides its own "lazy" type, kind of like
    /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
    /// // no-std environments and lets us write this using completely
    /// // safe code.
    /// static RE: Lazy<DFA<&'static [u32]>> = Lazy::new(|| {
    /// # const _: &str = stringify! {
    ///     // This assignment is made possible (implicitly) via the
    ///     // CoerceUnsized trait. This is what guarantees that our
    ///     // bytes are stored in memory on a 4 byte boundary. You
    ///     // *must* do this or something equivalent for correct
    ///     // deserialization.
    ///     static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
    ///         _align: [],
    ///         #[cfg(target_endian = "big")]
    ///         bytes: *include_bytes!("foo.bigendian.dfa"),
    ///         #[cfg(target_endian = "little")]
    ///         bytes: *include_bytes!("foo.littleendian.dfa"),
    ///     };
    /// # };
    /// # static ALIGNED: &AlignAs<[u8], u32> = &AlignAs {
    /// #    _align: [],
    /// #    bytes: [],
    /// # };
    ///
    ///     let (dfa, _) = DFA::from_bytes(&ALIGNED.bytes)
    ///         .expect("serialized DFA should be valid");
    ///     dfa
    /// });
    ///
    /// let expected = Ok(Some(HalfMatch::must(0, 8)));
    /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
    /// ```
    ///
    /// An alternative to [`util::lazy::Lazy`](crate::util::lazy::Lazy)
    /// is [`lazy_static`](https://crates.io/crates/lazy_static) or
    /// [`once_cell`](https://crates.io/crates/once_cell), which provide
    /// stronger guarantees (like the initialization function only being
    /// executed once). And `once_cell` in particular provides a more
    /// expressive API. But a `Lazy` value from this crate is likely just fine
    /// in most circumstances.
///
    /// Note that regardless of which initialization method you use, you
    /// will still need to use the [`AlignAs`](crate::util::wire::AlignAs)
    /// trick above to force correct alignment, but this is safe to do and
    /// `from_bytes` will return an error if you get it wrong.
    pub fn from_bytes(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
        // SAFETY: Every table that the unchecked constructor leaves
        // unverified (transitions, start states, match states and
        // accelerators) is validated right after, and any failure is reported
        // as an error instead of handing out the DFA.
        let (dfa, consumed) = unsafe { DFA::from_bytes_unchecked(slice)? };
        dfa.tt.validate(&dfa)?;
        dfa.st.validate(&dfa)?;
        dfa.ms.validate(&dfa)?;
        dfa.accels.validate()?;
        // N.B. dfa.special is not part of the checks above because it has no
        // unchecked deserialization path; it was validated while being read.
        for sid in dfa.states().map(|state| state.id()) {
            if !dfa.is_accel_state(sid) {
                continue;
            }
            // Each accelerated state must point at an existing accelerator.
            let accel_index = dfa.accelerator_index(sid);
            if accel_index >= dfa.accels.len() {
                return Err(DeserializeError::generic(
                    "found DFA state with invalid accelerator index",
                ));
            }
            // And that accelerator must be non-empty, with at most 3 needles.
            let needle_len = dfa.accels.needles(accel_index).len();
            if !(1..=3).contains(&needle_len) {
                return Err(DeserializeError::generic(
                    "accelerator needles has invalid length",
                ));
            }
        }
        Ok((dfa, consumed))
    }

    /// Deserialize a DFA with a specific state identifier representation in
    /// constant time by omitting the verification of the validity of the
    /// transition table and other data inside the DFA.
    ///
    /// This is just like [`DFA::from_bytes`], except it can potentially return
    /// a DFA that exhibits undefined behavior if its transition table contains
    /// invalid state identifiers.
    ///
    /// This routine is useful if you need to deserialize a DFA cheaply
    /// and cannot afford the transition table validation performed by
    /// `from_bytes`.
///
    /// # Safety
    ///
    /// Callers must guarantee that `slice` contains bytes for which
    /// [`DFA::from_bytes`] would succeed, e.g., the unmodified output of one
    /// of the serialization routines from a semver compatible release of this
    /// crate. This routine does not run the validation of the transition
    /// table, start table, match states and accelerators that `from_bytes`
    /// performs.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, dense::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// let (bytes, _) = initial.to_bytes_native_endian();
    /// // SAFETY: This is guaranteed to be safe since the bytes given come
    /// // directly from a compatible serialization routine.
    /// let dfa: DFA<&[u32]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub unsafe fn from_bytes_unchecked(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u32]>, usize), DeserializeError> {
        // Running count of bytes consumed from `slice` so far.
        let mut nr = 0;

        nr += wire::skip_initial_padding(slice);
        wire::check_alignment::<StateID>(&slice[nr..])?;
        nr += wire::read_label(&slice[nr..], LABEL)?;
        nr += wire::read_endianness_check(&slice[nr..])?;
        nr += wire::read_version(&slice[nr..], VERSION)?;

        // Reserved word: read only so that truncated input is reported.
        let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
        nr += size_of::<u32>();

        let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
        nr += nread;

        let (tt, nread) = TransitionTable::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (ms, nread) = MatchStates::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (special, nread) = Special::from_bytes(&slice[nr..])?;
        nr += nread;
        special.validate_state_len(tt.len(), tt.stride2)?;

        let (accels, nread) = Accels::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
        nr += nread;

        // Prefilters don't support serialization, so they're always absent.
        let pre = None;
        Ok((DFA { tt, st, ms, special, accels, pre, quitset, flags }, nr))
    }

    /// The implementation of the public `write_to` serialization methods,
    /// which is generic over endianness.
    ///
    /// This is defined only for &[u32] to reduce binary size/compilation time.
fn write_to( &self, mut dst: &mut [u8], ) -> Result { let nwrite = self.write_to_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small("dense DFA")); } dst = &mut dst[..nwrite]; let mut nw = 0; nw += wire::write_label(LABEL, &mut dst[nw..])?; nw += wire::write_endianness_check::(&mut dst[nw..])?; nw += wire::write_version::(VERSION, &mut dst[nw..])?; nw += { // Currently unused, intended for future flexibility E::write_u32(0, &mut dst[nw..]); size_of::() }; nw += self.flags.write_to::(&mut dst[nw..])?; nw += self.tt.write_to::(&mut dst[nw..])?; nw += self.st.write_to::(&mut dst[nw..])?; nw += self.ms.write_to::(&mut dst[nw..])?; nw += self.special.write_to::(&mut dst[nw..])?; nw += self.accels.write_to::(&mut dst[nw..])?; nw += self.quitset.write_to::(&mut dst[nw..])?; Ok(nw) } } // The following methods implement mutable routines on the internal // representation of a DFA. As such, we must fix the first type parameter to a // `Vec` since a generic `T: AsRef<[u32]>` does not permit mutation. We // can get away with this because these methods are internal to the crate and // are exclusively used during construction of the DFA. #[cfg(feature = "dfa-build")] impl OwnedDFA { /// Add a start state of this DFA. pub(crate) fn set_start_state( &mut self, anchored: Anchored, start: Start, id: StateID, ) { assert!(self.tt.is_valid(id), "invalid start state"); self.st.set_start(anchored, start, id); } /// Set the given transition to this DFA. Both the `from` and `to` states /// must already exist. pub(crate) fn set_transition( &mut self, from: StateID, byte: alphabet::Unit, to: StateID, ) { self.tt.set(from, byte, to); } /// An empty state (a state where all transitions lead to a dead state) /// and return its identifier. The identifier returned is guaranteed to /// not point to any other existing state. /// /// If adding a state would exceed `StateID::LIMIT`, then this returns an /// error. 
pub(crate) fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
        self.tt.add_empty_state()
    }

    /// Swap the two states given in the transition table.
    ///
    /// This routine does not do anything to check the correctness of this
    /// swap. Callers must ensure that other states pointing to id1 and id2 are
    /// updated appropriately.
    pub(crate) fn swap_states(&mut self, id1: StateID, id2: StateID) {
        self.tt.swap(id1, id2);
    }

    /// Remap all of the state identifiers in this DFA according to the map
    /// function given. This includes all transitions and all starting state
    /// identifiers.
    pub(crate) fn remap(&mut self, map: impl Fn(StateID) -> StateID) {
        // We could loop over each state ID and call 'remap_state' here, but
        // this is more direct: just map every transition directly. This
        // technically might do a little extra work since the alphabet length
        // is likely less than the stride, but if that is indeed an issue we
        // should benchmark it and fix it.
        for sid in self.tt.table_mut().iter_mut() {
            *sid = map(*sid);
        }
        for sid in self.st.table_mut().iter_mut() {
            *sid = map(*sid);
        }
    }

    /// Remap the transitions for the state given according to the function
    /// given. This applies the given map function to every transition in the
    /// given state and changes the transition in place to the result of the
    /// map function for that transition.
    pub(crate) fn remap_state(
        &mut self,
        id: StateID,
        map: impl Fn(StateID) -> StateID,
    ) {
        self.tt.remap(id, map);
    }

    /// Truncate the states in this DFA to the given length.
    ///
    /// This routine does not do anything to check the correctness of this
    /// truncation. Callers must ensure that other states pointing to truncated
    /// states are updated appropriately.
    pub(crate) fn truncate_states(&mut self, len: usize) {
        self.tt.truncate(len);
    }

    /// Minimize this DFA in place using Hopcroft's algorithm.
    pub(crate) fn minimize(&mut self) {
        Minimizer::new(self).run();
    }

    /// Updates the match state pattern ID map to use the one provided.
///
    /// This is useful when it's convenient to manipulate matching states
    /// (and their corresponding pattern IDs) as a map. In particular, the
    /// representation used by a DFA for this map is not amenable to mutation,
    /// so if things need to be changed (like when shuffling states), it's
    /// often easier to work with the map form.
    pub(crate) fn set_pattern_map(
        &mut self,
        map: &BTreeMap<StateID, Vec<PatternID>>,
    ) -> Result<(), BuildError> {
        self.ms = self.ms.new_with_map(map)?;
        Ok(())
    }

    /// Find states that have a small number of non-loop transitions and mark
    /// them as candidates for acceleration during search.
    ///
    /// # Panics
    ///
    /// This panics if the special state ranges fail to validate after
    /// shuffling, or if the number of states in the accelerated range does
    /// not equal the number of accelerators initially detected.
    pub(crate) fn accelerate(&mut self) {
        // dead and quit states can never be accelerated.
        if self.state_len() <= 2 {
            return;
        }

        // Go through every state and record their accelerator, if possible.
        let mut accels = BTreeMap::new();
        // Count the number of accelerated match, start and non-match/start
        // states.
        let (mut cmatch, mut cstart, mut cnormal) = (0, 0, 0);
        for state in self.states() {
            if let Some(accel) = state.accelerate(self.byte_classes()) {
                debug!(
                    "accelerating full DFA state {}: {:?}",
                    state.id().as_usize(),
                    accel,
                );
                accels.insert(state.id(), accel);
                if self.is_match_state(state.id()) {
                    cmatch += 1;
                } else if self.is_start_state(state.id()) {
                    cstart += 1;
                } else {
                    assert!(!self.is_dead_state(state.id()));
                    assert!(!self.is_quit_state(state.id()));
                    cnormal += 1;
                }
            }
        }
        // If no states were able to be accelerated, then we're done.
        if accels.is_empty() {
            return;
        }
        let original_accels_len = accels.len();

        // A remapper keeps track of state ID changes. Once we're done
        // shuffling, the remapper is used to rewrite all transitions in the
        // DFA based on the new positions of states.
        let mut remapper = Remapper::new(self);

        // As we swap states, if they are match states, we need to swap their
        // pattern ID lists too (for multi-regexes). We do this by converting
        // the lists to an easily swappable map, and then convert back to
        // MatchStates once we're done.
        let mut new_matches = self.ms.to_map(self);

        // There is at least one state that gets accelerated, so these are
        // guaranteed to get set to sensible values below.
        self.special.min_accel = StateID::MAX;
        self.special.max_accel = StateID::ZERO;
        let update_special_accel =
            |special: &mut Special, accel_id: StateID| {
                special.min_accel = cmp::min(special.min_accel, accel_id);
                special.max_accel = cmp::max(special.max_accel, accel_id);
            };

        // Start by shuffling match states. Any match states that are
        // accelerated get moved to the end of the match state range.
        if cmatch > 0 && self.special.matches() {
            // N.B. special.{min,max}_match do not need updating, since the
            // range/number of match states does not change. Only the ordering
            // of match states may change.
            let mut next_id = self.special.max_match;
            let mut cur_id = next_id;
            while cur_id >= self.special.min_match {
                if let Some(accel) = accels.remove(&cur_id) {
                    accels.insert(next_id, accel);
                    update_special_accel(&mut self.special, next_id);

                    // No need to do any actual swapping for equivalent IDs.
                    if cur_id != next_id {
                        remapper.swap(self, cur_id, next_id);

                        // Swap pattern IDs for match states.
                        let cur_pids = new_matches.remove(&cur_id).unwrap();
                        let next_pids = new_matches.remove(&next_id).unwrap();
                        new_matches.insert(cur_id, next_pids);
                        new_matches.insert(next_id, cur_pids);
                    }
                    next_id = self.tt.prev_state_id(next_id);
                }
                cur_id = self.tt.prev_state_id(cur_id);
            }
        }

        // This is where it gets tricky. Without acceleration, start states
        // normally come right after match states. But we want accelerated
        // states to be a single contiguous range (to make it very fast
        // to determine whether a state *is* accelerated), while also keeping
        // match and starting states as contiguous ranges for the same reason.
        // So what we do here is shuffle states such that it looks like this:
        //
        //     DQMMMMAAAAASSSSSSNNNNNNN
        //         |         |
        //         |---------|
        //      accelerated states
        //
        // Where:
        //   D - dead state
        //   Q - quit state
        //   M - match state (may be accelerated)
        //   A - normal state that is accelerated
        //   S - start state (may be accelerated)
        //   N - normal state that is NOT accelerated
        //
        // We implement this by shuffling states, which is done by a sequence
        // of pairwise swaps. We start by looking at all normal states to be
        // accelerated. When we find one, we swap it with the earliest starting
        // state, and then swap that with the earliest normal state. This
        // preserves the contiguous property.
        //
        // Once we're done looking for accelerated normal states, now we look
        // for accelerated starting states by moving them to the beginning
        // of the starting state range (just like we moved accelerated match
        // states to the end of the matching state range).
        //
        // For a more detailed/different perspective on this, see the docs
        // in dfa/special.rs.
        if cnormal > 0 {
            // our next available starting and normal states for swapping.
            let mut next_start_id = self.special.min_start;
            let mut cur_id = self.to_state_id(self.state_len() - 1);
            // This is guaranteed to exist since cnormal > 0.
            let mut next_norm_id =
                self.tt.next_state_id(self.special.max_start);
            while cur_id >= next_norm_id {
                if let Some(accel) = accels.remove(&cur_id) {
                    remapper.swap(self, next_start_id, cur_id);
                    remapper.swap(self, next_norm_id, cur_id);
                    // Keep our accelerator map updated with new IDs if the
                    // states we swapped were also accelerated.
                    if let Some(accel2) = accels.remove(&next_norm_id) {
                        accels.insert(cur_id, accel2);
                    }
                    if let Some(accel2) = accels.remove(&next_start_id) {
                        accels.insert(next_norm_id, accel2);
                    }
                    accels.insert(next_start_id, accel);
                    update_special_accel(&mut self.special, next_start_id);
                    // Our start range shifts one to the right now.
                    self.special.min_start =
                        self.tt.next_state_id(self.special.min_start);
                    self.special.max_start =
                        self.tt.next_state_id(self.special.max_start);
                    next_start_id = self.tt.next_state_id(next_start_id);
                    next_norm_id = self.tt.next_state_id(next_norm_id);
                }
                // This is pretty tricky, but if our 'next_norm_id' state also
                // happened to be accelerated, then the result is that it is
                // now in the position of cur_id, so we need to consider it
                // again. This loop is still guaranteed to terminate though,
                // because when accels contains cur_id, we're guaranteed to
                // increment next_norm_id even if cur_id remains unchanged.
                if !accels.contains_key(&cur_id) {
                    cur_id = self.tt.prev_state_id(cur_id);
                }
            }
        }

        // Just like we did for match states, but we want to move accelerated
        // start states to the beginning of the range instead of the end.
        if cstart > 0 {
            // N.B. special.{min,max}_start do not need updating, since the
            // range/number of start states does not change at this point. Only
            // the ordering of start states may change.
            let mut next_id = self.special.min_start;
            let mut cur_id = next_id;
            while cur_id <= self.special.max_start {
                if let Some(accel) = accels.remove(&cur_id) {
                    remapper.swap(self, cur_id, next_id);
                    accels.insert(next_id, accel);
                    update_special_accel(&mut self.special, next_id);
                    next_id = self.tt.next_state_id(next_id);
                }
                cur_id = self.tt.next_state_id(cur_id);
            }
        }

        // Remap all transitions in our DFA and assert some things.
        remapper.remap(self);
        // This unwrap is OK because acceleration never changes the number of
        // match states or patterns in those match states. Since acceleration
        // runs after the pattern map has been set at least once, we know that
        // our match states cannot error.
        self.set_pattern_map(&new_matches).unwrap();
        self.special.set_max();
        self.special.validate().expect("special state ranges should validate");
        self.special
            .validate_state_len(self.state_len(), self.stride2())
            .expect(
                "special state ranges should be consistent with state length",
            );
        assert_eq!(
            self.special.accel_len(self.stride()),
            // We record the number of accelerated states initially detected
            // since the accels map is itself mutated in the process above.
            // If mutated incorrectly, its size may change, and thus can't be
            // trusted as a source of truth of how many accelerated states we
            // expected there to be.
            original_accels_len,
            "mismatch with expected number of accelerated states",
        );

        // And finally record our accelerators. We kept our accels map updated
        // as we shuffled states above, so the accelerators should now
        // correspond to a contiguous range in the state ID space. (Which we
        // assert.)
        let mut prev: Option<StateID> = None;
        for (id, accel) in accels {
            assert!(prev.map_or(true, |p| self.tt.next_state_id(p) == id));
            prev = Some(id);
            self.accels.add(accel);
        }
    }

    /// Shuffle the states in this DFA so that starting states, match
    /// states and accelerated states are all contiguous.
    ///
    /// See dfa/special.rs for more details.
    ///
    /// # Errors
    ///
    /// This returns an error if installing the shuffled pattern map via
    /// `set_pattern_map` fails.
    ///
    /// # Panics
    ///
    /// This panics if a non-dead start state is also a key in `matches`, or
    /// if the resulting special state ranges fail to validate.
    pub(crate) fn shuffle(
        &mut self,
        mut matches: BTreeMap<StateID, Vec<PatternID>>,
    ) -> Result<(), BuildError> {
        // The determinizer always adds a quit state and it is always second.
        self.special.quit_id = self.to_state_id(1);
        // If all we have are the dead and quit states, then we're done and
        // the DFA will never produce a match.
        if self.state_len() <= 2 {
            self.special.set_max();
            return Ok(());
        }

        // Collect all our non-DEAD start states into a convenient set and
        // confirm there is no overlap with match states. In the classical DFA
        // construction, start states can be match states. But because of
        // look-around, we delay all matches by a byte, which prevents start
        // states from being match states.
        let mut is_start: BTreeSet<StateID> = BTreeSet::new();
        for (start_id, _, _) in self.starts() {
            // If a starting configuration points to a DEAD state, then we
            // don't want to shuffle it. The DEAD state is always the first
            // state with ID=0. So we can just leave it be.
            if start_id == DEAD {
                continue;
            }
            assert!(
                !matches.contains_key(&start_id),
                "{:?} is both a start and a match state, which is not allowed",
                start_id,
            );
            is_start.insert(start_id);
        }

        // We implement shuffling by a sequence of pairwise swaps of states.
        // Since we have a number of things referencing states via their
        // IDs and swapping them changes their IDs, we need to record every
        // swap we make so that we can remap IDs. The remapper handles this
        // book-keeping for us.
        let mut remapper = Remapper::new(self);

        // Shuffle matching states.
        if matches.is_empty() {
            self.special.min_match = DEAD;
            self.special.max_match = DEAD;
        } else {
            // The determinizer guarantees that the first two states are the
            // dead and quit states, respectively. We want our match states to
            // come right after quit.
            let mut next_id = self.to_state_id(2);
            let mut new_matches = BTreeMap::new();
            self.special.min_match = next_id;
            for (id, pids) in matches {
                remapper.swap(self, next_id, id);
                new_matches.insert(next_id, pids);
                // If we swapped a start state, then update our set.
                if is_start.contains(&next_id) {
                    is_start.remove(&next_id);
                    is_start.insert(id);
                }
                next_id = self.tt.next_state_id(next_id);
            }
            matches = new_matches;
            self.special.max_match = cmp::max(
                self.special.min_match,
                self.tt.prev_state_id(next_id),
            );
        }

        // Shuffle starting states.
        {
            let mut next_id = self.to_state_id(2);
            if self.special.matches() {
                next_id = self.tt.next_state_id(self.special.max_match);
            }
            self.special.min_start = next_id;
            for id in is_start {
                remapper.swap(self, next_id, id);
                next_id = self.tt.next_state_id(next_id);
            }
            self.special.max_start = cmp::max(
                self.special.min_start,
                self.tt.prev_state_id(next_id),
            );
        }

        // Finally remap all transitions in our DFA.
        remapper.remap(self);
        self.set_pattern_map(&matches)?;
        self.special.set_max();
        self.special.validate().expect("special state ranges should validate");
        self.special
            .validate_state_len(self.state_len(), self.stride2())
            .expect(
                "special state ranges should be consistent with state length",
            );
        Ok(())
    }

    /// Checks whether there are universal start states (both anchored and
    /// unanchored), and if so, sets the relevant fields to the start state
    /// IDs.
    ///
    /// Universal start states occur precisely when all patterns in the
    /// DFA have no look-around assertions in their prefix.
    ///
    /// # Panics
    ///
    /// This panics if the number of start configurations is not 6, since the
    /// comparisons below enumerate each configuration explicitly.
    fn set_universal_starts(&mut self) {
        assert_eq!(6, Start::len(), "expected 6 start configurations");

        let start_id = |dfa: &mut OwnedDFA,
                        anchored: Anchored,
                        start: Start| {
            // This is OK because we only call 'start' under conditions
            // in which we know it will succeed.
            dfa.st.start(anchored, start).expect("valid Input configuration")
        };
        if self.start_kind().has_unanchored() {
            let anchor = Anchored::No;
            let sid = start_id(self, anchor, Start::NonWordByte);
            if sid == start_id(self, anchor, Start::WordByte)
                && sid == start_id(self, anchor, Start::Text)
                && sid == start_id(self, anchor, Start::LineLF)
                && sid == start_id(self, anchor, Start::LineCR)
                && sid == start_id(self, anchor, Start::CustomLineTerminator)
            {
                self.st.universal_start_unanchored = Some(sid);
            }
        }
        if self.start_kind().has_anchored() {
            let anchor = Anchored::Yes;
            let sid = start_id(self, anchor, Start::NonWordByte);
            if sid == start_id(self, anchor, Start::WordByte)
                && sid == start_id(self, anchor, Start::Text)
                && sid == start_id(self, anchor, Start::LineLF)
                && sid == start_id(self, anchor, Start::LineCR)
                && sid == start_id(self, anchor, Start::CustomLineTerminator)
            {
                self.st.universal_start_anchored = Some(sid);
            }
        }
    }
}

// A variety of generic internal methods for accessing DFA internals.
impl<T: AsRef<[u32]>> DFA<T> {
    /// Return the info about special states.
pub(crate) fn special(&self) -> &Special {
        &self.special
    }

    /// Return the info about special states as a mutable borrow.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn special_mut(&mut self) -> &mut Special {
        &mut self.special
    }

    /// Returns the quit set (may be empty) used by this DFA.
    pub(crate) fn quitset(&self) -> &ByteSet {
        &self.quitset
    }

    /// Returns the flags for this DFA.
    pub(crate) fn flags(&self) -> &Flags {
        &self.flags
    }

    /// Returns an iterator over all states in this DFA.
    ///
    /// Each item yielded is a single state, which exposes both its
    /// identifier (via `id()`) and its transitions.
    pub(crate) fn states(&self) -> StateIter<'_, T> {
        self.tt.states()
    }

    /// Return the total number of states in this DFA. Every DFA has at least
    /// 1 state, even the empty DFA.
    pub(crate) fn state_len(&self) -> usize {
        self.tt.len()
    }

    /// Return a slice of all pattern IDs for the given match state.
    ///
    /// If the given state is not a match state, then this panics.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn pattern_id_slice(&self, id: StateID) -> &[PatternID] {
        assert!(self.is_match_state(id));
        self.ms.pattern_id_slice(self.match_state_index(id))
    }

    /// Return the total number of pattern IDs for the given match state.
    ///
    /// If the given state is not a match state, then this panics.
    pub(crate) fn match_pattern_len(&self, id: StateID) -> usize {
        assert!(self.is_match_state(id));
        self.ms.pattern_len(self.match_state_index(id))
    }

    /// Returns the total number of patterns matched by this DFA.
    pub(crate) fn pattern_len(&self) -> usize {
        self.ms.pattern_len
    }

    /// Returns a map from match state ID to a list of pattern IDs that match
    /// in that state.
    #[cfg(feature = "dfa-build")]
    pub(crate) fn pattern_map(&self) -> BTreeMap<StateID, Vec<PatternID>> {
        self.ms.to_map(self)
    }

    /// Returns the ID of the quit state for this DFA.
#[cfg(feature = "dfa-build")]
    pub(crate) fn quit_id(&self) -> StateID {
        self.to_state_id(1)
    }

    /// Convert the given state identifier to the state's index. The state's
    /// index corresponds to the position in which it appears in the transition
    /// table.
    ///
    /// This delegates to the transition table, which derives the index from
    /// the identifier using the table's stride.
    pub(crate) fn to_index(&self, id: StateID) -> usize {
        self.tt.to_index(id)
    }

    /// Convert an index to a state (in the range 0..self.state_len()) to an
    /// actual state identifier.
    ///
    /// This is useful when using a `Vec<T>` as an efficient map keyed by state
    /// to some other information (such as a remapped state ID).
    #[cfg(feature = "dfa-build")]
    pub(crate) fn to_state_id(&self, index: usize) -> StateID {
        self.tt.to_state_id(index)
    }

    /// Return the table of state IDs for this DFA's start states.
    pub(crate) fn starts(&self) -> StartStateIter<'_> {
        self.st.iter()
    }

    /// Returns the index of the match state for the given ID. If the
    /// given ID does not correspond to a match state, then this may
    /// panic or produce an incorrect result.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn match_state_index(&self, id: StateID) -> usize {
        debug_assert!(self.is_match_state(id));
        // This is one of the places where we rely on the fact that match
        // states are contiguous in the transition table. Namely, that the
        // first match state ID always corresponds to dfa.special.min_match.
        // From there, since we know the stride, we can compute the overall
        // index of any match state given the match state's ID.
        let min = self.special().min_match.as_usize();
        // CORRECTNESS: We're allowed to produce an incorrect result or panic,
        // so both the subtraction and the unchecked StateID construction is
        // OK.
        self.to_index(StateID::new_unchecked(id.as_usize() - min))
    }

    /// Returns the index of the accelerator state for the given ID.
/// If the given ID does not correspond to an accelerator state, then this may
    /// panic or produce an incorrect result.
    fn accelerator_index(&self, id: StateID) -> usize {
        let min = self.special().min_accel.as_usize();
        // CORRECTNESS: We're allowed to produce an incorrect result or panic,
        // so both the subtraction and the unchecked StateID construction is
        // OK.
        self.to_index(StateID::new_unchecked(id.as_usize() - min))
    }

    /// Return the accelerators for this DFA.
    fn accels(&self) -> Accels<&[u32]> {
        self.accels.as_ref()
    }

    /// Return this DFA's transition table as a slice.
    fn trans(&self) -> &[StateID] {
        self.tt.table()
    }
}

impl<T: AsRef<[u32]>> fmt::Debug for DFA<T> {
    /// Writes one line per state, then the start state table, then (for
    /// multi-pattern DFAs only) the pattern IDs of each match state, and
    /// finally some summary information.
    ///
    /// With the alternate flag (`{:#?}`), raw state identifiers are printed.
    /// Otherwise, state indices are printed.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "dense::DFA(")?;
        for state in self.states() {
            fmt_state_indicator(f, self, state.id())?;
            let id = if f.alternate() {
                state.id().as_usize()
            } else {
                self.to_index(state.id())
            };
            write!(f, "{:06?}: ", id)?;
            state.fmt(f)?;
            writeln!(f)?;
        }
        writeln!(f)?;
        for (i, (start_id, anchored, sty)) in self.starts().enumerate() {
            let id = if f.alternate() {
                start_id.as_usize()
            } else {
                self.to_index(start_id)
            };
            // Emit a header each time a new group of start states begins.
            if i % self.st.stride == 0 {
                match anchored {
                    Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
                    Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
                    Anchored::Pattern(pid) => {
                        writeln!(f, "START_GROUP(pattern: {:?})", pid)?
                    }
                }
            }
            writeln!(f, "  {:?} => {:06?}", sty, id)?;
        }
        if self.pattern_len() > 1 {
            writeln!(f)?;
            for i in 0..self.ms.len() {
                let id = self.ms.match_state_id(self, i);
                let id = if f.alternate() {
                    id.as_usize()
                } else {
                    self.to_index(id)
                };
                write!(f, "MATCH({:06?}): ", id)?;
                for (i, &pid) in self.ms.pattern_id_slice(i).iter().enumerate()
                {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    write!(f, "{:?}", pid)?;
                }
                writeln!(f)?;
            }
        }
        writeln!(f, "state length: {:?}", self.state_len())?;
        writeln!(f, "pattern length: {:?}", self.pattern_len())?;
        writeln!(f, "flags: {:?}", self.flags)?;
        writeln!(f, ")")?;
        Ok(())
    }
}

// SAFETY: We assert that our implementation of each method is correct.
unsafe impl<T: AsRef<[u32]>> Automaton for DFA<T> {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_special_state(&self, id: StateID) -> bool {
        self.special.is_special_state(id)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_dead_state(&self, id: StateID) -> bool {
        self.special.is_dead_state(id)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_quit_state(&self, id: StateID) -> bool {
        self.special.is_quit_state(id)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match_state(&self, id: StateID) -> bool {
        self.special.is_match_state(id)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_start_state(&self, id: StateID) -> bool {
        self.special.is_start_state(id)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_accel_state(&self, id: StateID) -> bool {
        self.special.is_accel_state(id)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn next_state(&self, current: StateID, input: u8) -> StateID {
        let input = self.byte_classes().get(input);
        // State IDs are premultiplied, so the ID is already the offset of
        // the state's first transition.
        let o = current.as_usize() + usize::from(input);
        self.trans()[o]
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    unsafe fn next_state_unchecked(
        &self,
        current: StateID,
        byte: u8,
    ) -> StateID {
        // We don't (or shouldn't) need an unchecked variant for the byte
        // class mapping, since bound checks should be omitted automatically
        // by virtue of its representation. If this ends up not being true as
        // confirmed by codegen, please file an issue. ---AG
        let class = self.byte_classes().get(byte);
        let o = current.as_usize() + usize::from(class);
        *self.trans().get_unchecked(o)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn next_eoi_state(&self, current: StateID) -> StateID {
        let eoi = self.byte_classes().eoi().as_usize();
        let o = current.as_usize() + eoi;
        self.trans()[o]
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn pattern_len(&self) -> usize {
        self.ms.pattern_len
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn match_len(&self, id: StateID) -> usize {
        self.match_pattern_len(id)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
        // This is an optimization for the very common case of a DFA with a
        // single pattern. This conditional avoids a somewhat more costly path
        // that finds the pattern ID from the state machine, which requires
        // a bit of slicing/pointer-chasing. This optimization tends to only
        // matter when matches are frequent.
        if self.ms.pattern_len == 1 {
            return PatternID::ZERO;
        }
        let state_index = self.match_state_index(id);
        self.ms.pattern_id(state_index, match_index)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn has_empty(&self) -> bool {
        self.flags.has_empty
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_utf8(&self) -> bool {
        self.flags.is_utf8
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_always_start_anchored(&self) -> bool {
        self.flags.is_always_start_anchored
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn start_state(
        &self,
        config: &start::Config,
    ) -> Result<StateID, StartError> {
        let anchored = config.get_anchored();
        let start = match config.get_look_behind() {
            None => Start::Text,
            Some(byte) => {
                // A look-behind byte in the quit set means the search cannot
                // begin here.
                if !self.quitset.is_empty() && self.quitset.contains(byte) {
                    return Err(StartError::quit(byte));
                }
                self.st.start_map.get(byte)
            }
        };
        self.st.start(anchored, start)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn universal_start_state(&self, mode: Anchored) -> Option<StateID> {
        match mode {
            Anchored::No => self.st.universal_start_unanchored,
            Anchored::Yes => self.st.universal_start_anchored,
            Anchored::Pattern(_) => None,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn accelerator(&self, id: StateID) -> &[u8] {
        if !self.is_accel_state(id) {
            return &[];
        }
        self.accels.needles(self.accelerator_index(id))
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn get_prefilter(&self) -> Option<&Prefilter> {
        self.pre.as_ref()
    }
}

/// The transition table portion of a dense DFA.
///
/// The transition table is the core part of the DFA in that it describes how
/// to move from one state to another based on the input sequence observed.
#[derive(Clone)]
pub(crate) struct TransitionTable<T> {
    /// A contiguous region of memory representing the transition table in
    /// row-major order. The representation is dense. That is, every state
    /// has precisely the same number of transitions. The maximum number of
    /// transitions per state is 257 (256 for each possible byte value, plus 1
    /// for the special EOI transition). If a DFA has been instructed to use
    /// byte classes (the default), then the number of transitions is usually
    /// substantially fewer.
    ///
    /// In practice, T is either `Vec<u32>` or `&[u32]`.
    table: T,
    /// A set of equivalence classes, where a single equivalence class
    /// represents a set of bytes that never discriminate between a match
    /// and a non-match in the DFA. Each equivalence class corresponds to a
    /// single character in this DFA's alphabet, where the maximum number of
    /// characters is 257 (each possible value of a byte plus the special
    /// EOI transition). Consequently, the number of equivalence classes
    /// corresponds to the number of transitions for each DFA state. Note
    /// though that the *space* used by each DFA state in the transition table
    /// may be larger. The total space used by each DFA state is known as the
    /// stride.
    ///
    /// The only time the number of equivalence classes is fewer than 257 is if
    /// the DFA's kind uses byte classes (which is the default). Equivalence
    /// classes should generally only be disabled when debugging, so that
    /// the transitions themselves aren't obscured. Disabling them has no
    /// other benefit, since the equivalence class map is always used while
    /// searching. In the vast majority of cases, the number of equivalence
    /// classes is substantially smaller than 257, particularly when large
    /// Unicode classes aren't used.
    classes: ByteClasses,
    /// The stride of each DFA state, expressed as a power-of-two exponent.
    ///
    /// The stride of a DFA corresponds to the total amount of space used by
    /// each DFA state in the transition table. This may be bigger than the
    /// size of a DFA's alphabet, since the stride is always the smallest
    /// power of two greater than or equal to the alphabet size.
    ///
    /// While this wastes space, this avoids the need for integer division
    /// to convert between premultiplied state IDs and their corresponding
    /// indices. Instead, we can use simple bit-shifts.
    ///
    /// See the docs for the `stride2` method for more details.
    ///
    /// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
    /// while the maximum `stride2` value is `9` (corresponding to a stride of
    /// `512`). The maximum is not `8` since the maximum alphabet size is `257`
    /// when accounting for the special EOI transition. However, an alphabet
    /// length of that size is exceptionally rare since the alphabet is shrunk
    /// into equivalence classes.
    stride2: usize,
}

impl<'a> TransitionTable<&'a [u32]> {
    /// Deserialize a transition table starting at the beginning of `slice`.
    /// Upon success, return the total number of bytes read along with the
    /// transition table.
    ///
    /// If there was a problem deserializing any part of the transition table,
    /// then this returns an error. Notably, if the given slice does not have
    /// the same alignment as `StateID`, then this will return an error (among
    /// other possible errors).
    ///
    /// This is guaranteed to execute in constant time.
    ///
    /// # Safety
    ///
    /// This routine is not safe because it does not check the validity of the
    /// transition table itself. In particular, the transition table can be
    /// quite large, so checking its validity can be somewhat expensive. An
    /// invalid transition table is not safe because other code may rely on the
    /// transition table being correct (such as explicit bounds check elision).
    /// Therefore, an invalid transition table can lead to undefined behavior.
    ///
    /// Callers that use this function must either pass on the safety invariant
    /// or guarantee that the bytes given contain a valid transition table.
    /// This guarantee is upheld by the bytes written by `write_to`.
unsafe fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(TransitionTable<&'a [u32]>, usize), DeserializeError> {
        let slice_start = slice.as_ptr().as_usize();

        let (state_len, nr) =
            wire::try_read_u32_as_usize(slice, "state length")?;
        slice = &slice[nr..];

        let (stride2, nr) = wire::try_read_u32_as_usize(slice, "stride2")?;
        slice = &slice[nr..];

        let (classes, nr) = ByteClasses::from_bytes(slice)?;
        slice = &slice[nr..];

        // The alphabet length (determined by the byte class map) cannot be
        // bigger than the stride (total space used by each DFA state).
        if stride2 > 9 {
            return Err(DeserializeError::generic(
                "dense DFA has invalid stride2 (too big)",
            ));
        }
        // It also cannot be zero, since even a DFA that never matches anything
        // has a non-zero number of states with at least two equivalence
        // classes: one for all 256 byte values and another for the EOI
        // sentinel.
        if stride2 < 1 {
            return Err(DeserializeError::generic(
                "dense DFA has invalid stride2 (too small)",
            ));
        }
        // This is OK since 1 <= stride2 <= 9.
        let stride =
            1usize.checked_shl(u32::try_from(stride2).unwrap()).unwrap();
        if classes.alphabet_len() > stride {
            return Err(DeserializeError::generic(
                "alphabet size cannot be bigger than transition table stride",
            ));
        }

        let trans_len =
            wire::shl(state_len, stride2, "dense table transition length")?;
        let table_bytes_len = wire::mul(
            trans_len,
            StateID::SIZE,
            "dense table state byte length",
        )?;
        wire::check_slice_len(slice, table_bytes_len, "transition table")?;
        wire::check_alignment::<StateID>(slice)?;
        let table_bytes = &slice[..table_bytes_len];
        slice = &slice[table_bytes_len..];
        // SAFETY: Since StateID is always representable as a u32, all we need
        // to do is ensure that we have the proper length and alignment. We've
        // checked both above, so the cast below is safe.
        //
        // N.B. This is the only not-safe code in this function.
        let table = core::slice::from_raw_parts(
            table_bytes.as_ptr().cast::<u32>(),
            trans_len,
        );
        let tt = TransitionTable { table, classes, stride2 };
        Ok((tt, slice.as_ptr().as_usize() - slice_start))
    }
}

#[cfg(feature = "dfa-build")]
impl TransitionTable<Vec<u32>> {
    /// Create a minimal transition table with just two states: a dead state
    /// and a quit state. The alphabet length and stride of the transition
    /// table is determined by the given set of equivalence classes.
    fn minimal(classes: ByteClasses) -> TransitionTable<Vec<u32>> {
        let mut tt = TransitionTable {
            table: vec![],
            classes,
            stride2: classes.stride2(),
        };
        // Two states, regardless of alphabet size, can always fit into u32.
        tt.add_empty_state().unwrap(); // dead state
        tt.add_empty_state().unwrap(); // quit state
        tt
    }

    /// Set a transition in this table. Both the `from` and `to` states must
    /// already exist, otherwise this panics. `unit` should correspond to the
    /// transition out of `from` to set to `to`.
    fn set(&mut self, from: StateID, unit: alphabet::Unit, to: StateID) {
        assert!(self.is_valid(from), "invalid 'from' state");
        assert!(self.is_valid(to), "invalid 'to' state");
        self.table[from.as_usize() + self.classes.get_by_unit(unit)] =
            to.as_u32();
    }

    /// Add an empty state (a state where all transitions lead to a dead state)
    /// and return its identifier. The identifier returned is guaranteed to
    /// not point to any other existing state.
    ///
    /// If adding a state would exhaust the state identifier space, then this
    /// returns an error.
    fn add_empty_state(&mut self) -> Result<StateID, BuildError> {
        // Normally, to get a fresh state identifier, we would just
        // take the index of the next state added to the transition
        // table. However, we actually perform an optimization here
        // that premultiplies state IDs by the stride, such that they
        // point immediately at the beginning of their transitions in
        // the transition table. This avoids an extra multiplication
        // instruction for state lookup at search time.
        //
        // Premultiplied identifiers means that instead of your matching
        // loop looking something like this:
        //
        //   state = dfa.start
        //   for byte in haystack:
        //       next = dfa.transitions[state * stride + byte]
        //       if dfa.is_match(next):
        //           return true
        //   return false
        //
        // it can instead look like this:
        //
        //   state = dfa.start
        //   for byte in haystack:
        //       next = dfa.transitions[state + byte]
        //       if dfa.is_match(next):
        //           return true
        //   return false
        //
        // In other words, we save a multiplication instruction in the
        // critical path. This turns out to be a decent performance win.
        // The cost of using premultiplied state ids is that they can
        // require a bigger state id representation. (And they also make
        // the code a bit more complex, especially during minimization and
        // when reshuffling states, as one needs to convert back and forth
        // between state IDs and state indices.)
        //
        // To do this, we simply take the index of the state into the
        // entire transition table, rather than the index of the state
        // itself. e.g., If the stride is 64, then the ID of the 3rd state
        // is 192, not 2.
        let next = self.table.len();
        let id =
            StateID::new(next).map_err(|_| BuildError::too_many_states())?;
        self.table.extend(iter::repeat(0).take(self.stride()));
        Ok(id)
    }

    /// Swap the two states given in this transition table.
    ///
    /// This routine does not do anything to check the correctness of this
    /// swap. Callers must ensure that other states pointing to id1 and id2 are
    /// updated appropriately.
    ///
    /// Both id1 and id2 must point to valid states, otherwise this panics.
    fn swap(&mut self, id1: StateID, id2: StateID) {
        assert!(self.is_valid(id1), "invalid 'id1' state: {:?}", id1);
        assert!(self.is_valid(id2), "invalid 'id2' state: {:?}", id2);
        // We only need to swap the parts of the state that are used. So if the
        // stride is 64, but the alphabet length is only 33, then we save a lot
        // of work.
        for b in 0..self.classes.alphabet_len() {
            self.table.swap(id1.as_usize() + b, id2.as_usize() + b);
        }
    }

    /// Remap the transitions for the state given according to the function
    /// given. This applies the given map function to every transition in the
    /// given state and changes the transition in place to the result of the
    /// map function for that transition.
    fn remap(&mut self, id: StateID, map: impl Fn(StateID) -> StateID) {
        for byte in 0..self.alphabet_len() {
            let i = id.as_usize() + byte;
            let next = self.table()[i];
            self.table_mut()[i] = map(next);
        }
    }

    /// Truncate the states in this transition table to the given length.
    ///
    /// This routine does not do anything to check the correctness of this
    /// truncation. Callers must ensure that other states pointing to truncated
    /// states are updated appropriately.
    fn truncate(&mut self, len: usize) {
        // `len` counts states, so scale it by the stride to get the number
        // of table entries to keep.
        self.table.truncate(len << self.stride2);
    }
}

impl<T: AsRef<[u32]>> TransitionTable<T> {
    /// Writes a serialized form of this transition table to the buffer given.
    /// If the buffer is too small, then an error is returned. To determine
    /// how big the buffer must be, use `write_to_len`.
    fn write_to<E: Endian>(
        &self,
        mut dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small("transition table"));
        }
        dst = &mut dst[..nwrite];

        // write state length
        // Unwrap is OK since number of states is guaranteed to fit in a u32.
        E::write_u32(u32::try_from(self.len()).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write state stride (as power of 2)
        // Unwrap is OK since stride2 is guaranteed to be <= 9.
        E::write_u32(u32::try_from(self.stride2).unwrap(), dst);
        dst = &mut dst[size_of::<u32>()..];

        // write byte class map
        let n = self.classes.write_to(dst)?;
        dst = &mut dst[n..];

        // write actual transitions
        for &sid in self.table() {
            let n = wire::write_state_id::<E>(sid, &mut dst);
            dst = &mut dst[n..];
        }
        Ok(nwrite)
    }

    /// Returns the number of bytes the serialized form of this transition
    /// table will use.
    fn write_to_len(&self) -> usize {
        size_of::<u32>()   // state length
        + size_of::<u32>() // stride2
        + self.classes.write_to_len()
        + (self.table().len() * StateID::SIZE)
    }

    /// Validates that every state ID in this transition table is valid.
    ///
    /// That is, every state ID can be used to correctly index a state in this
    /// table.
    fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
        let sp = &dfa.special;
        for state in self.states() {
            // We check that the ID itself is well formed. That is, if it's
            // a special state then it must actually be a quit, dead, accel,
            // match or start state.
            if sp.is_special_state(state.id()) {
                let is_actually_special = sp.is_dead_state(state.id())
                    || sp.is_quit_state(state.id())
                    || sp.is_match_state(state.id())
                    || sp.is_start_state(state.id())
                    || sp.is_accel_state(state.id());
                if !is_actually_special {
                    // This is kind of a cryptic error message...
                    return Err(DeserializeError::generic(
                        "found dense state tagged as special but \
                         wasn't actually special",
                    ));
                }
                if sp.is_match_state(state.id())
                    && dfa.match_len(state.id()) == 0
                {
                    return Err(DeserializeError::generic(
                        "found match state with zero pattern IDs",
                    ));
                }
            }
            for (_, to) in state.transitions() {
                if !self.is_valid(to) {
                    return Err(DeserializeError::generic(
                        "found invalid state ID in transition table",
                    ));
                }
            }
        }
        Ok(())
    }

    /// Converts this transition table to a borrowed value.
    fn as_ref(&self) -> TransitionTable<&'_ [u32]> {
        TransitionTable {
            table: self.table.as_ref(),
            classes: self.classes.clone(),
            stride2: self.stride2,
        }
    }

    /// Converts this transition table to an owned value.
#[cfg(feature = "alloc")] fn to_owned(&self) -> TransitionTable> { TransitionTable { table: self.table.as_ref().to_vec(), classes: self.classes.clone(), stride2: self.stride2, } } /// Return the state for the given ID. If the given ID is not valid, then /// this panics. fn state(&self, id: StateID) -> State<'_> { assert!(self.is_valid(id)); let i = id.as_usize(); State { id, stride2: self.stride2, transitions: &self.table()[i..i + self.alphabet_len()], } } /// Returns an iterator over all states in this transition table. /// /// This iterator yields a tuple for each state. The first element of the /// tuple corresponds to a state's identifier, and the second element /// corresponds to the state itself (comprised of its transitions). fn states(&self) -> StateIter<'_, T> { StateIter { tt: self, it: self.table().chunks(self.stride()).enumerate(), } } /// Convert a state identifier to an index to a state (in the range /// 0..self.len()). /// /// This is useful when using a `Vec` as an efficient map keyed by state /// to some other information (such as a remapped state ID). /// /// If the given ID is not valid, then this may panic or produce an /// incorrect index. fn to_index(&self, id: StateID) -> usize { id.as_usize() >> self.stride2 } /// Convert an index to a state (in the range 0..self.len()) to an actual /// state identifier. /// /// This is useful when using a `Vec` as an efficient map keyed by state /// to some other information (such as a remapped state ID). /// /// If the given index is not in the specified range, then this may panic /// or produce an incorrect state ID. fn to_state_id(&self, index: usize) -> StateID { // CORRECTNESS: If the given index is not valid, then it is not // required for this to panic or return a valid state ID. StateID::new_unchecked(index << self.stride2) } /// Returns the state ID for the state immediately following the one given. /// /// This does not check whether the state ID returned is invalid. 
In fact, /// if the state ID given is the last state in this DFA, then the state ID /// returned is guaranteed to be invalid. #[cfg(feature = "dfa-build")] fn next_state_id(&self, id: StateID) -> StateID { self.to_state_id(self.to_index(id).checked_add(1).unwrap()) } /// Returns the state ID for the state immediately preceding the one given. /// /// If the dead ID given (which is zero), then this panics. #[cfg(feature = "dfa-build")] fn prev_state_id(&self, id: StateID) -> StateID { self.to_state_id(self.to_index(id).checked_sub(1).unwrap()) } /// Returns the table as a slice of state IDs. fn table(&self) -> &[StateID] { wire::u32s_to_state_ids(self.table.as_ref()) } /// Returns the total number of states in this transition table. /// /// Note that a DFA always has at least two states: the dead and quit /// states. In particular, the dead state always has ID 0 and is /// correspondingly always the first state. The dead state is never a match /// state. fn len(&self) -> usize { self.table().len() >> self.stride2 } /// Returns the total stride for every state in this DFA. This corresponds /// to the total number of transitions used by each state in this DFA's /// transition table. fn stride(&self) -> usize { 1 << self.stride2 } /// Returns the total number of elements in the alphabet for this /// transition table. This is always less than or equal to `self.stride()`. /// It is only equal when the alphabet length is a power of 2. Otherwise, /// it is always strictly less. fn alphabet_len(&self) -> usize { self.classes.alphabet_len() } /// Returns true if and only if the given state ID is valid for this /// transition table. Validity in this context means that the given ID can /// be used as a valid offset with `self.stride()` to index this transition /// table. fn is_valid(&self, id: StateID) -> bool { let id = id.as_usize(); id < self.table().len() && id % self.stride() == 0 } /// Return the memory usage, in bytes, of this transition table. 
///
/// This does not include the size of a `TransitionTable` value itself.
fn memory_usage(&self) -> usize {
    self.table().len() * StateID::SIZE
}
}

#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u32]>> TransitionTable<T> {
    /// Returns the table as a mutable slice of state IDs.
    fn table_mut(&mut self) -> &mut [StateID] {
        wire::u32s_to_state_ids_mut(self.table.as_mut())
    }
}

/// The set of all possible starting states in a DFA.
///
/// The set of starting states corresponds to the possible choices one can make
/// in terms of starting a DFA. That is, before following the first transition,
/// you first need to select the state that you start in.
///
/// Normally, a DFA converted from an NFA that has a single starting state
/// would itself just have one starting state. However, our support for look
/// around generally requires more starting states. The correct starting state
/// is chosen based on certain properties of the position at which we begin
/// our search.
///
/// Before listing those properties, we first must define two terms:
///
/// * `haystack` - The bytes to execute the search. The search always starts
///   at the beginning of `haystack` and ends before or at the end of
///   `haystack`.
/// * `context` - The (possibly empty) bytes surrounding `haystack`. `haystack`
///   must be contained within `context` such that `context` is at least as big
///   as `haystack`.
///
/// This split is crucial for dealing with look-around. For example, consider
/// the context `foobarbaz`, the haystack `bar` and the regex `^bar$`. This
/// regex should _not_ match the haystack since `bar` does not appear at the
/// beginning of the input. Similarly, the regex `\Bbar\B` should match the
/// haystack because `bar` is not surrounded by word boundaries. But a search
/// that does not take context into account would not permit `\B` to match
/// since the beginning of any string matches a word boundary. Similarly, a
/// search that does not take context into account when searching `^bar$` in
/// the haystack `bar` would produce a match when it shouldn't.
///
/// Thus, it follows that the starting state is chosen based on the following
/// criteria, derived from the position at which the search starts in the
/// `context` (corresponding to the start of `haystack`):
///
/// 1. If the search starts at the beginning of `context`, then the `Text`
///    start state is used. (Since `^` corresponds to
///    `hir::Anchor::Start`.)
/// 2. If the search starts at a position immediately following a line
///    terminator, then the `Line` start state is used. (Since `(?m:^)`
///    corresponds to `hir::Anchor::StartLF`.)
/// 3. If the search starts at a position immediately following a byte
///    classified as a "word" character (`[_0-9a-zA-Z]`), then the `WordByte`
///    start state is used. (Since `(?-u:\b)` corresponds to a word boundary.)
/// 4. Otherwise, if the search starts at a position immediately following
///    a byte that is not classified as a "word" character (`[^_0-9a-zA-Z]`),
///    then the `NonWordByte` start state is used. (Since `(?-u:\B)`
///    corresponds to a not-word-boundary.)
///
/// (N.B. Unicode word boundaries are not supported by the DFA because they
/// require multi-byte look-around and this is difficult to support in a DFA.)
///
/// To further complicate things, we also support constructing individual
/// anchored start states for each pattern in the DFA. (Which is required to
/// implement overlapping regexes correctly, but is also generally useful.)
/// Thus, when individual start states for each pattern are enabled, then the
/// total number of start state IDs represented is
/// `(2 * stride) + (stride * #patterns)`, where `stride` is the number of
/// distinct start configurations (`Start::len()`). The first `2 * stride` IDs
/// are the starting states for the entire DFA, which support searching for
/// multiple patterns simultaneously: one stride for unanchored searches
/// followed by one stride for anchored searches.
///
/// If individual start states are disabled, then this will only store
/// `2 * stride` start state IDs. Typically, individual start states are only
/// enabled when constructing the reverse DFA for regex matching. But they are
/// also useful for building DFAs that can search for a specific pattern or
/// even to support both anchored and unanchored searches with the same DFA.
///
/// Note though that while the start table always has either `2 * stride` or
/// `(2 * stride) + (stride * #patterns)` starting state *ids*, the total
/// number of states might be considerably smaller. That is, many of the IDs
/// may be duplicative. (For example, if a regex doesn't have a `\b`
/// sub-pattern, then there's no reason to generate a unique starting state for
/// handling word boundaries. Similarly for start/end anchors.)
#[derive(Clone)]
pub(crate) struct StartTable<T> {
    /// The initial start state IDs.
    ///
    /// In practice, T is either `Vec<u32>` or `&[u32]`.
    ///
    /// The first `2 * stride` (currently always 8) entries always correspond
    /// to the start states for the entire DFA, with the first `stride`
    /// entries being for unanchored searches and the second `stride` entries
    /// being for anchored searches. To keep things simple, we always use
    /// `2 * stride` entries even if the `StartKind` is not both.
    ///
    /// After that, there are `stride * patterns` state IDs, where `patterns`
    /// may be zero in the case of a DFA with no patterns or in the case where
    /// the DFA was built without enabling starting states for each pattern.
    table: T,
    /// The starting state configuration supported. When 'both', both
    /// unanchored and anchored searches work. When 'unanchored', anchored
    /// searches panic. When 'anchored', unanchored searches panic.
    kind: StartKind,
    /// The start state configuration for every possible byte.
    start_map: StartByteMap,
    /// The number of starting state IDs per pattern.
    stride: usize,
    /// The total number of patterns for which starting states are encoded.
    /// This is `None` for DFAs that were built without start states for each
    /// pattern. Thus, one cannot use this field to say how many patterns
    /// are in the DFA in all cases. It is specific to how many patterns are
    /// represented in this start table.
    pattern_len: Option<usize>,
    /// The universal starting state for unanchored searches. This is only
    /// present when the DFA supports unanchored searches and when all starting
    /// state IDs for an unanchored search are equivalent.
    universal_start_unanchored: Option<StateID>,
    /// The universal starting state for anchored searches. This is only
    /// present when the DFA supports anchored searches and when all starting
    /// state IDs for an anchored search are equivalent.
    universal_start_anchored: Option<StateID>,
}

#[cfg(feature = "dfa-build")]
impl StartTable<Vec<u32>> {
    /// Create a valid set of start states all pointing to the dead state.
    ///
    /// When the corresponding DFA is constructed with start states for each
    /// pattern, then `pattern_len` should be `Some` with the number of
    /// patterns. Otherwise, it should be `None`.
    ///
    /// If the total table size could exceed the allocatable limit, then this
    /// returns an error. In practice, this is unlikely to be able to occur,
    /// since it's likely that allocation would have failed long before it got
    /// to this point.
    ///
    /// # Panics
    ///
    /// This panics if `pattern_len` is `Some(len)` with
    /// `len > PatternID::LIMIT`.
    fn dead(
        kind: StartKind,
        lookm: &LookMatcher,
        pattern_len: Option<usize>,
    ) -> Result<StartTable<Vec<u32>>, BuildError> {
        if let Some(len) = pattern_len {
            assert!(len <= PatternID::LIMIT);
        }
        let stride = Start::len();
        // OK because 2*4 is never going to overflow anything.
        let starts_len = stride.checked_mul(2).unwrap();
        let pattern_starts_len = stride
            .checked_mul(pattern_len.unwrap_or(0))
            .ok_or_else(|| BuildError::too_many_start_states())?;
        let table_len = starts_len
            .checked_add(pattern_starts_len)
            .ok_or_else(|| BuildError::too_many_start_states())?;
        // The table length must also be representable as an `isize`.
        if isize::try_from(table_len).is_err() {
            return Err(BuildError::too_many_start_states());
        }
        let table = vec![DEAD.as_u32(); table_len];
        let start_map = StartByteMap::new(lookm);
        Ok(StartTable {
            table,
            kind,
            start_map,
            stride,
            pattern_len,
            universal_start_unanchored: None,
            universal_start_anchored: None,
        })
    }
}

impl<'a> StartTable<&'a [u32]> {
    /// Deserialize a table of start state IDs starting at the beginning of
    /// `slice`. Upon success, return the total number of bytes read along with
    /// the table of starting state IDs.
    ///
    /// If there was a problem deserializing any part of the starting IDs,
    /// then this returns an error. Notably, if the given slice does not have
    /// the same alignment as `StateID`, then this will return an error (among
    /// other possible errors).
    ///
    /// This is guaranteed to execute in constant time.
    ///
    /// # Safety
    ///
    /// This routine is not safe because it does not check the validity of the
    /// starting state IDs themselves. In particular, the number of starting
    /// IDs can be of variable length, so it's possible that checking their
    /// validity cannot be done in constant time. An invalid starting state
    /// ID is not safe because other code may rely on the starting IDs being
    /// correct (such as explicit bounds check elision). Therefore, an invalid
    /// start ID can lead to undefined behavior.
    ///
    /// Callers that use this function must either pass on the safety invariant
    /// or guarantee that the bytes given contain valid starting state IDs.
    /// This guarantee is upheld by the bytes written by `write_to`.
unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(StartTable<&'a [u32]>, usize), DeserializeError> { let slice_start = slice.as_ptr().as_usize(); let (kind, nr) = StartKind::from_bytes(slice)?; slice = &slice[nr..]; let (start_map, nr) = StartByteMap::from_bytes(slice)?; slice = &slice[nr..]; let (stride, nr) = wire::try_read_u32_as_usize(slice, "start table stride")?; slice = &slice[nr..]; if stride != Start::len() { return Err(DeserializeError::generic( "invalid starting table stride", )); } let (maybe_pattern_len, nr) = wire::try_read_u32_as_usize(slice, "start table patterns")?; slice = &slice[nr..]; let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { None } else { Some(maybe_pattern_len) }; if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { return Err(DeserializeError::generic( "invalid number of patterns", )); } let (universal_unanchored, nr) = wire::try_read_u32(slice, "universal unanchored start")?; slice = &slice[nr..]; let universal_start_unanchored = if universal_unanchored == u32::MAX { None } else { Some(StateID::try_from(universal_unanchored).map_err(|e| { DeserializeError::state_id_error( e, "universal unanchored start", ) })?) }; let (universal_anchored, nr) = wire::try_read_u32(slice, "universal anchored start")?; slice = &slice[nr..]; let universal_start_anchored = if universal_anchored == u32::MAX { None } else { Some(StateID::try_from(universal_anchored).map_err(|e| { DeserializeError::state_id_error(e, "universal anchored start") })?) }; let pattern_table_size = wire::mul( stride, pattern_len.unwrap_or(0), "invalid pattern length", )?; // Our start states always start with a two stride of start states for // the entire automaton. The first stride is for unanchored starting // states and the second stride is for anchored starting states. What // follows it are an optional set of start states for each pattern. 
let start_state_len = wire::add( wire::mul(2, stride, "start state stride too big")?, pattern_table_size, "invalid 'any' pattern starts size", )?; let table_bytes_len = wire::mul( start_state_len, StateID::SIZE, "pattern table bytes length", )?; wire::check_slice_len(slice, table_bytes_len, "start ID table")?; wire::check_alignment::(slice)?; let table_bytes = &slice[..table_bytes_len]; slice = &slice[table_bytes_len..]; // SAFETY: Since StateID is always representable as a u32, all we need // to do is ensure that we have the proper length and alignment. We've // checked both above, so the cast below is safe. // // N.B. This is the only not-safe code in this function. let table = core::slice::from_raw_parts( table_bytes.as_ptr().cast::(), start_state_len, ); let st = StartTable { table, kind, start_map, stride, pattern_len, universal_start_unanchored, universal_start_anchored, }; Ok((st, slice.as_ptr().as_usize() - slice_start)) } } impl> StartTable { /// Writes a serialized form of this start table to the buffer given. If /// the buffer is too small, then an error is returned. To determine how /// big the buffer must be, use `write_to_len`. fn write_to( &self, mut dst: &mut [u8], ) -> Result { let nwrite = self.write_to_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small( "starting table ids", )); } dst = &mut dst[..nwrite]; // write start kind let nw = self.kind.write_to::(dst)?; dst = &mut dst[nw..]; // write start byte map let nw = self.start_map.write_to(dst)?; dst = &mut dst[nw..]; // write stride // Unwrap is OK since the stride is always 4 (currently). E::write_u32(u32::try_from(self.stride).unwrap(), dst); dst = &mut dst[size_of::()..]; // write pattern length // Unwrap is OK since number of patterns is guaranteed to fit in a u32. 
E::write_u32( u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), dst, ); dst = &mut dst[size_of::()..]; // write universal start unanchored state id, u32::MAX if absent E::write_u32( self.universal_start_unanchored .map_or(u32::MAX, |sid| sid.as_u32()), dst, ); dst = &mut dst[size_of::()..]; // write universal start anchored state id, u32::MAX if absent E::write_u32( self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), dst, ); dst = &mut dst[size_of::()..]; // write start IDs for &sid in self.table() { let n = wire::write_state_id::(sid, &mut dst); dst = &mut dst[n..]; } Ok(nwrite) } /// Returns the number of bytes the serialized form of this start ID table /// will use. fn write_to_len(&self) -> usize { self.kind.write_to_len() + self.start_map.write_to_len() + size_of::() // stride + size_of::() // # patterns + size_of::() // universal unanchored start + size_of::() // universal anchored start + (self.table().len() * StateID::SIZE) } /// Validates that every state ID in this start table is valid by checking /// it against the given transition table (which must be for the same DFA). /// /// That is, every state ID can be used to correctly index a state. fn validate(&self, dfa: &DFA) -> Result<(), DeserializeError> { let tt = &dfa.tt; if !self.universal_start_unanchored.map_or(true, |s| tt.is_valid(s)) { return Err(DeserializeError::generic( "found invalid universal unanchored starting state ID", )); } if !self.universal_start_anchored.map_or(true, |s| tt.is_valid(s)) { return Err(DeserializeError::generic( "found invalid universal anchored starting state ID", )); } for &id in self.table() { if !tt.is_valid(id) { return Err(DeserializeError::generic( "found invalid starting state ID", )); } } Ok(()) } /// Converts this start list to a borrowed value. 
fn as_ref(&self) -> StartTable<&'_ [u32]> {
    StartTable {
        table: self.table.as_ref(),
        kind: self.kind,
        start_map: self.start_map.clone(),
        stride: self.stride,
        pattern_len: self.pattern_len,
        universal_start_unanchored: self.universal_start_unanchored,
        universal_start_anchored: self.universal_start_anchored,
    }
}

/// Converts this start list to an owned value.
#[cfg(feature = "alloc")]
fn to_owned(&self) -> StartTable<alloc::vec::Vec<u32>> {
    StartTable {
        table: self.table.as_ref().to_vec(),
        kind: self.kind,
        start_map: self.start_map.clone(),
        stride: self.stride,
        pattern_len: self.pattern_len,
        universal_start_unanchored: self.universal_start_unanchored,
        universal_start_anchored: self.universal_start_anchored,
    }
}

/// Return the start state for the given input and starting configuration.
/// This returns an error if the input configuration is not supported by
/// this DFA. For example, requesting an unanchored search when the DFA was
/// not built with unanchored starting states. Or asking for an anchored
/// pattern search on a DFA that was not built with start states for each
/// pattern.
///
/// An anchored pattern search with a pattern ID that is out of range for
/// this table is not an error: the dead state is returned instead.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn start(
    &self,
    anchored: Anchored,
    start: Start,
) -> Result<StateID, StartError> {
    let start_index = start.as_usize();
    // Table layout: one stride of unanchored starts, one stride of
    // anchored starts, then one stride per pattern.
    let index = match anchored {
        Anchored::No => {
            if !self.kind.has_unanchored() {
                return Err(StartError::unsupported_anchored(anchored));
            }
            start_index
        }
        Anchored::Yes => {
            if !self.kind.has_anchored() {
                return Err(StartError::unsupported_anchored(anchored));
            }
            self.stride + start_index
        }
        Anchored::Pattern(pid) => {
            let len = match self.pattern_len {
                None => {
                    return Err(StartError::unsupported_anchored(anchored))
                }
                Some(len) => len,
            };
            if pid.as_usize() >= len {
                return Ok(DEAD);
            }
            (2 * self.stride)
                + (self.stride * pid.as_usize())
                + start_index
        }
    };
    Ok(self.table()[index])
}

/// Returns an iterator over all start state IDs in this table.
/// /// Each item is a triple of: start state ID, the start state type and the /// pattern ID (if any). fn iter(&self) -> StartStateIter<'_> { StartStateIter { st: self.as_ref(), i: 0 } } /// Returns the table as a slice of state IDs. fn table(&self) -> &[StateID] { wire::u32s_to_state_ids(self.table.as_ref()) } /// Return the memory usage, in bytes, of this start list. /// /// This does not include the size of a `StartList` value itself. fn memory_usage(&self) -> usize { self.table().len() * StateID::SIZE } } #[cfg(feature = "dfa-build")] impl> StartTable { /// Set the start state for the given index and pattern. /// /// If the pattern ID or state ID are not valid, then this will panic. fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) { let start_index = start.as_usize(); let index = match anchored { Anchored::No => start_index, Anchored::Yes => self.stride + start_index, Anchored::Pattern(pid) => { let pid = pid.as_usize(); let len = self .pattern_len .expect("start states for each pattern enabled"); assert!(pid < len, "invalid pattern ID {:?}", pid); self.stride .checked_mul(pid) .unwrap() .checked_add(self.stride.checked_mul(2).unwrap()) .unwrap() .checked_add(start_index) .unwrap() } }; self.table_mut()[index] = id; } /// Returns the table as a mutable slice of state IDs. fn table_mut(&mut self) -> &mut [StateID] { wire::u32s_to_state_ids_mut(self.table.as_mut()) } } /// An iterator over start state IDs. /// /// This iterator yields a triple of start state ID, the anchored mode and the /// start state type. If a pattern ID is relevant, then the anchored mode will /// contain it. Start states with an anchored mode containing a pattern ID will /// only occur when the DFA was compiled with start states for each pattern /// (which is disabled by default). 
pub(crate) struct StartStateIter<'a> {
    st: StartTable<&'a [u32]>,
    i: usize,
}

impl<'a> Iterator for StartStateIter<'a> {
    type Item = (StateID, Anchored, Start);

    fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
        let i = self.i;
        let table = self.st.table();
        if i >= table.len() {
            return None;
        }
        self.i += 1;

        // This unwrap is okay since the stride of the starting state table
        // must always match the number of start state types.
        let start_type = Start::from_usize(i % self.st.stride).unwrap();
        // The first stride is unanchored, the second is anchored and every
        // stride after that belongs to a specific pattern.
        let anchored = if i < self.st.stride {
            Anchored::No
        } else if i < (2 * self.st.stride) {
            Anchored::Yes
        } else {
            let pid = (i - (2 * self.st.stride)) / self.st.stride;
            Anchored::Pattern(PatternID::new(pid).unwrap())
        };
        Some((table[i], anchored, start_type))
    }
}

/// This type represents the patterns that should be reported whenever a DFA
/// enters a match state. This structure exists to support DFAs that search for
/// matches for multiple regexes.
///
/// This structure relies on the fact that all match states in a DFA occur
/// contiguously in the DFA's transition table. (See dfa/special.rs for a more
/// detailed breakdown of the representation.) Namely, when a match occurs, we
/// know its state ID. Since we know the start and end of the contiguous region
/// of match states, we can use that to compute the position at which the match
/// state occurs. That in turn is used as an offset into this structure.
#[derive(Clone, Debug)]
struct MatchStates<T> {
    /// Slices is a flattened sequence of pairs, where each pair points to a
    /// sub-slice of pattern_ids. The first element of the pair is an offset
    /// into pattern_ids and the second element of the pair is the number
    /// of 32-bit pattern IDs starting at that position. That is, each pair
    /// corresponds to a single DFA match state and its corresponding match
    /// IDs. The number of pairs always corresponds to the number of distinct
    /// DFA match states.
    ///
    /// In practice, T is either Vec<u32> or &[u32].
    slices: T,
    /// A flattened sequence of pattern IDs for each DFA match state. The only
    /// way to correctly read this sequence is indirectly via `slices`.
    ///
    /// In practice, T is either Vec<u32> or &[u32].
    pattern_ids: T,
    /// The total number of unique patterns represented by these match states.
    pattern_len: usize,
}

impl<'a> MatchStates<&'a [u32]> {
    /// Deserializes match states from the beginning of `slice`. On success,
    /// returns the match states along with the number of bytes read.
    ///
    /// # Safety
    ///
    /// Only the length and alignment of the data are checked here. The
    /// offsets, lengths and pattern IDs themselves are not validated, so
    /// callers must either pass on this invariant or check the result with
    /// `validate`.
    unsafe fn from_bytes_unchecked(
        mut slice: &'a [u8],
    ) -> Result<(MatchStates<&'a [u32]>, usize), DeserializeError> {
        let slice_start = slice.as_ptr().as_usize();

        // Read the total number of match states.
        let (state_len, nr) =
            wire::try_read_u32_as_usize(slice, "match state length")?;
        slice = &slice[nr..];

        // Read the slice start/length pairs.
        let pair_len = wire::mul(2, state_len, "match state offset pairs")?;
        let slices_bytes_len = wire::mul(
            pair_len,
            PatternID::SIZE,
            "match state slice offset byte length",
        )?;
        wire::check_slice_len(slice, slices_bytes_len, "match state slices")?;
        wire::check_alignment::<PatternID>(slice)?;
        let slices_bytes = &slice[..slices_bytes_len];
        slice = &slice[slices_bytes_len..];
        // SAFETY: Since PatternID is always representable as a u32, all we
        // need to do is ensure that we have the proper length and alignment.
        // We've checked both above, so the cast below is safe.
        //
        // N.B. This is one of the few not-safe snippets in this function,
        // so we mark it explicitly to call it out.
        let slices = core::slice::from_raw_parts(
            slices_bytes.as_ptr().cast::<u32>(),
            pair_len,
        );

        // Read the total number of unique pattern IDs (which is always 1 more
        // than the maximum pattern ID in this automaton, since pattern IDs are
        // handed out contiguously starting at 0).
        let (pattern_len, nr) =
            wire::try_read_u32_as_usize(slice, "pattern length")?;
        slice = &slice[nr..];

        // Now read the pattern ID length. We don't need to store this
        // explicitly, but we need it to know how many pattern IDs to read.
        let (idlen, nr) =
            wire::try_read_u32_as_usize(slice, "pattern ID length")?;
        slice = &slice[nr..];

        // Read the actual pattern IDs.
        let pattern_ids_len =
            wire::mul(idlen, PatternID::SIZE, "pattern ID byte length")?;
        wire::check_slice_len(slice, pattern_ids_len, "match pattern IDs")?;
        wire::check_alignment::<PatternID>(slice)?;
        let pattern_ids_bytes = &slice[..pattern_ids_len];
        slice = &slice[pattern_ids_len..];
        // SAFETY: Since PatternID is always representable as a u32, all we
        // need to do is ensure that we have the proper length and alignment.
        // We've checked both above, so the cast below is safe.
        //
        // N.B. This is one of the few not-safe snippets in this function,
        // so we mark it explicitly to call it out.
        let pattern_ids = core::slice::from_raw_parts(
            pattern_ids_bytes.as_ptr().cast::<u32>(),
            idlen,
        );

        let ms = MatchStates { slices, pattern_ids, pattern_len };
        Ok((ms, slice.as_ptr().as_usize() - slice_start))
    }
}

#[cfg(feature = "dfa-build")]
impl MatchStates<Vec<u32>> {
    /// Creates match states with no match states recorded, for a DFA with
    /// the given number of patterns.
    ///
    /// This panics if `pattern_len` exceeds `PatternID::LIMIT`.
    fn empty(pattern_len: usize) -> MatchStates<Vec<u32>> {
        assert!(pattern_len <= PatternID::LIMIT);
        MatchStates { slices: vec![], pattern_ids: vec![], pattern_len }
    }

    /// Builds match states from a map of match state ID to its pattern IDs.
    /// Entries are laid out in the map's (sorted) key order.
    ///
    /// An error is returned if the total number of pattern IDs is too big to
    /// be used as an offset.
    fn new(
        matches: &BTreeMap<StateID, Vec<PatternID>>,
        pattern_len: usize,
    ) -> Result<MatchStates<Vec<u32>>, BuildError> {
        let mut m = MatchStates::empty(pattern_len);
        for pids in matches.values() {
            let start = PatternID::new(m.pattern_ids.len())
                .map_err(|_| BuildError::too_many_match_pattern_ids())?;
            m.slices.push(start.as_u32());
            // This is always correct since the number of patterns in a single
            // match state can never exceed maximum number of allowable
            // patterns. Why? Because a pattern can only appear once in a
            // particular match state, by construction. (And since our pattern
            // ID limit is one less than u32::MAX, we're guaranteed that the
            // length fits in a u32.)
            m.slices.push(u32::try_from(pids.len()).unwrap());
            m.pattern_ids.extend(pids.iter().map(|pid| pid.as_u32()));
        }
        Ok(m)
    }

    /// Builds new match states from the given map, reusing the number of
    /// patterns recorded in `self`.
    fn new_with_map(
        &self,
        matches: &BTreeMap<StateID, Vec<PatternID>>,
    ) -> Result<MatchStates<Vec<u32>>, BuildError> {
        MatchStates::new(matches, self.pattern_len)
    }
}

impl<T: AsRef<[u32]>> MatchStates<T> {
    /// Writes a serialized form of these match states to the buffer given.
If /// the buffer is too small, then an error is returned. To determine how /// big the buffer must be, use `write_to_len`. fn write_to( &self, mut dst: &mut [u8], ) -> Result { let nwrite = self.write_to_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small("match states")); } dst = &mut dst[..nwrite]; // write state ID length // Unwrap is OK since number of states is guaranteed to fit in a u32. E::write_u32(u32::try_from(self.len()).unwrap(), dst); dst = &mut dst[size_of::()..]; // write slice offset pairs for &pid in self.slices() { let n = wire::write_pattern_id::(pid, &mut dst); dst = &mut dst[n..]; } // write unique pattern ID length // Unwrap is OK since number of patterns is guaranteed to fit in a u32. E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst); dst = &mut dst[size_of::()..]; // write pattern ID length // Unwrap is OK since we check at construction (and deserialization) // that the number of patterns is representable as a u32. E::write_u32(u32::try_from(self.pattern_ids().len()).unwrap(), dst); dst = &mut dst[size_of::()..]; // write pattern IDs for &pid in self.pattern_ids() { let n = wire::write_pattern_id::(pid, &mut dst); dst = &mut dst[n..]; } Ok(nwrite) } /// Returns the number of bytes the serialized form of these match states /// will use. fn write_to_len(&self) -> usize { size_of::() // match state length + (self.slices().len() * PatternID::SIZE) + size_of::() // unique pattern ID length + size_of::() // pattern ID length + (self.pattern_ids().len() * PatternID::SIZE) } /// Valides that the match state info is itself internally consistent and /// consistent with the recorded match state region in the given DFA. 
fn validate(&self, dfa: &DFA<T>) -> Result<(), DeserializeError> {
    if self.len() != dfa.special.match_len(dfa.stride()) {
        return Err(DeserializeError::generic(
            "match state length mismatch",
        ));
    }
    for si in 0..self.len() {
        let start = self.slices()[si * 2].as_usize();
        let len = self.slices()[si * 2 + 1].as_usize();
        if start >= self.pattern_ids().len() {
            return Err(DeserializeError::generic(
                "invalid pattern ID start offset",
            ));
        }
        // Both values come from data that has not been validated yet, so
        // compute the end with a checked addition: a plain `start + len`
        // could wrap around where `usize` is 32 bits and slip past the
        // bounds check below.
        let in_bounds = start
            .checked_add(len)
            .map_or(false, |end| end <= self.pattern_ids().len());
        if !in_bounds {
            return Err(DeserializeError::generic(
                "invalid pattern ID length",
            ));
        }
        // The range was just checked, so slicing here cannot panic.
        for pid in self.pattern_id_slice(si) {
            if pid.as_usize() >= self.pattern_len {
                return Err(DeserializeError::generic(
                    "invalid pattern ID",
                ));
            }
        }
    }
    Ok(())
}

/// Converts these match states back into their map form. This is useful
/// when shuffling states, as the normal MatchStates representation is not
/// amenable to easy state swapping. But with this map, to swap id1 and
/// id2, all you need to do is:
///
///     if let Some(pids) = map.remove(&id1) {
///         map.insert(id2, pids);
///     }
///
/// Once shuffling is done, use MatchStates::new to convert back.
#[cfg(feature = "dfa-build")]
fn to_map(&self, dfa: &DFA<T>) -> BTreeMap<StateID, Vec<PatternID>> {
    let mut map = BTreeMap::new();
    for i in 0..self.len() {
        let pids = self.pattern_id_slice(i).to_vec();
        map.insert(self.match_state_id(dfa, i), pids);
    }
    map
}

/// Converts these match states to a borrowed value.
fn as_ref(&self) -> MatchStates<&'_ [u32]> {
    MatchStates {
        slices: self.slices.as_ref(),
        pattern_ids: self.pattern_ids.as_ref(),
        pattern_len: self.pattern_len,
    }
}

/// Converts these match states to an owned value.
#[cfg(feature = "alloc")]
fn to_owned(&self) -> MatchStates<alloc::vec::Vec<u32>> {
    MatchStates {
        slices: self.slices.as_ref().to_vec(),
        pattern_ids: self.pattern_ids.as_ref().to_vec(),
        pattern_len: self.pattern_len,
    }
}

/// Returns the match state ID given the match state index. (Where the
/// first match state corresponds to index 0.)
///
/// This panics if there is no match state at the given index.
fn match_state_id(&self, dfa: &DFA<T>, index: usize) -> StateID {
    assert!(dfa.special.matches(), "no match states to index");
    // This is one of the places where we rely on the fact that match
    // states are contiguous in the transition table. Namely, that the
    // first match state ID always corresponds to dfa.special.min_match.
    // From there, since we know the stride, we can compute the ID of any
    // match state given its index.
    let stride2 = u32::try_from(dfa.stride2()).unwrap();
    let offset = index.checked_shl(stride2).unwrap();
    let id = dfa.special.min_match.as_usize().checked_add(offset).unwrap();
    let sid = StateID::new(id).unwrap();
    assert!(dfa.is_match_state(sid));
    sid
}

/// Returns the pattern ID at the given match index for the given match
/// state.
///
/// The match state index is the state index minus the state index of the
/// first match state in the DFA.
///
/// The match index is the index of the pattern ID for the given state.
/// The index must be less than `self.pattern_len(state_index)`.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn pattern_id(&self, state_index: usize, match_index: usize) -> PatternID {
    self.pattern_id_slice(state_index)[match_index]
}

/// Returns the number of patterns in the given match state.
///
/// The match state index is the state index minus the state index of the
/// first match state in the DFA.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn pattern_len(&self, state_index: usize) -> usize {
    // The length is the second element of the (start, len) pair.
    self.slices()[state_index * 2 + 1].as_usize()
}

/// Returns all of the pattern IDs for the given match state index.
///
/// The match state index is the state index minus the state index of the
/// first match state in the DFA.
#[cfg_attr(feature = "perf-inline", inline(always))] fn pattern_id_slice(&self, state_index: usize) -> &[PatternID] { let start = self.slices()[state_index * 2].as_usize(); let len = self.pattern_len(state_index); &self.pattern_ids()[start..start + len] } /// Returns the pattern ID offset slice of u32 as a slice of PatternID. #[cfg_attr(feature = "perf-inline", inline(always))] fn slices(&self) -> &[PatternID] { wire::u32s_to_pattern_ids(self.slices.as_ref()) } /// Returns the total number of match states. #[cfg_attr(feature = "perf-inline", inline(always))] fn len(&self) -> usize { assert_eq!(0, self.slices().len() % 2); self.slices().len() / 2 } /// Returns the pattern ID slice of u32 as a slice of PatternID. #[cfg_attr(feature = "perf-inline", inline(always))] fn pattern_ids(&self) -> &[PatternID] { wire::u32s_to_pattern_ids(self.pattern_ids.as_ref()) } /// Return the memory usage, in bytes, of these match pairs. fn memory_usage(&self) -> usize { (self.slices().len() + self.pattern_ids().len()) * PatternID::SIZE } } /// A common set of flags for both dense and sparse DFAs. This primarily /// centralizes the serialization format of these flags at a bitset. #[derive(Clone, Copy, Debug)] pub(crate) struct Flags { /// Whether the DFA can match the empty string. When this is false, all /// matches returned by this DFA are guaranteed to have non-zero length. pub(crate) has_empty: bool, /// Whether the DFA should only produce matches with spans that correspond /// to valid UTF-8. This also includes omitting any zero-width matches that /// split the UTF-8 encoding of a codepoint. pub(crate) is_utf8: bool, /// Whether the DFA is always anchored or not, regardless of `Input` /// configuration. This is useful for avoiding a reverse scan even when /// executing unanchored searches. pub(crate) is_always_start_anchored: bool, } impl Flags { /// Creates a set of flags for a DFA from an NFA. /// /// N.B. 
This constructor was defined at the time of writing because all /// of the flags are derived directly from the NFA. If this changes in the /// future, we might be more thoughtful about how the `Flags` value is /// itself built. #[cfg(feature = "dfa-build")] fn from_nfa(nfa: &thompson::NFA) -> Flags { Flags { has_empty: nfa.has_empty(), is_utf8: nfa.is_utf8(), is_always_start_anchored: nfa.is_always_start_anchored(), } } /// Deserializes the flags from the given slice. On success, this also /// returns the number of bytes read from the slice. pub(crate) fn from_bytes( slice: &[u8], ) -> Result<(Flags, usize), DeserializeError> { let (bits, nread) = wire::try_read_u32(slice, "flag bitset")?; let flags = Flags { has_empty: bits & (1 << 0) != 0, is_utf8: bits & (1 << 1) != 0, is_always_start_anchored: bits & (1 << 2) != 0, }; Ok((flags, nread)) } /// Writes these flags to the given byte slice. If the buffer is too small, /// then an error is returned. To determine how big the buffer must be, /// use `write_to_len`. pub(crate) fn write_to( &self, dst: &mut [u8], ) -> Result { fn bool_to_int(b: bool) -> u32 { if b { 1 } else { 0 } } let nwrite = self.write_to_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small("flag bitset")); } let bits = (bool_to_int(self.has_empty) << 0) | (bool_to_int(self.is_utf8) << 1) | (bool_to_int(self.is_always_start_anchored) << 2); E::write_u32(bits, dst); Ok(nwrite) } /// Returns the number of bytes the serialized form of these flags /// will use. pub(crate) fn write_to_len(&self) -> usize { size_of::() } } /// An iterator over all states in a DFA. /// /// This iterator yields a tuple for each state. The first element of the /// tuple corresponds to a state's identifier, and the second element /// corresponds to the state itself (comprised of its transitions). /// /// `'a` corresponding to the lifetime of original DFA, `T` corresponds to /// the type of the transition table itself. 
pub(crate) struct StateIter<'a, T> { tt: &'a TransitionTable, it: iter::Enumerate>, } impl<'a, T: AsRef<[u32]>> Iterator for StateIter<'a, T> { type Item = State<'a>; fn next(&mut self) -> Option> { self.it.next().map(|(index, _)| { let id = self.tt.to_state_id(index); self.tt.state(id) }) } } /// An immutable representation of a single DFA state. /// /// `'a` correspondings to the lifetime of a DFA's transition table. pub(crate) struct State<'a> { id: StateID, stride2: usize, transitions: &'a [StateID], } impl<'a> State<'a> { /// Return an iterator over all transitions in this state. This yields /// a number of transitions equivalent to the alphabet length of the /// corresponding DFA. /// /// Each transition is represented by a tuple. The first element is /// the input byte for that transition and the second element is the /// transitions itself. pub(crate) fn transitions(&self) -> StateTransitionIter<'_> { StateTransitionIter { len: self.transitions.len(), it: self.transitions.iter().enumerate(), } } /// Return an iterator over a sparse representation of the transitions in /// this state. Only non-dead transitions are returned. /// /// The "sparse" representation in this case corresponds to a sequence of /// triples. The first two elements of the triple comprise an inclusive /// byte range while the last element corresponds to the transition taken /// for all bytes in the range. /// /// This is somewhat more condensed than the classical sparse /// representation (where you have an element for every non-dead /// transition), but in practice, checking if a byte is in a range is very /// cheap and using ranges tends to conserve quite a bit more space. pub(crate) fn sparse_transitions(&self) -> StateSparseTransitionIter<'_> { StateSparseTransitionIter { dense: self.transitions(), cur: None } } /// Returns the identifier for this state. pub(crate) fn id(&self) -> StateID { self.id } /// Analyzes this state to determine whether it can be accelerated. 
If so, /// it returns an accelerator that contains at least one byte. #[cfg(feature = "dfa-build")] fn accelerate(&self, classes: &ByteClasses) -> Option { // We just try to add bytes to our accelerator. Once adding fails // (because we've added too many bytes), then give up. let mut accel = Accel::new(); for (class, id) in self.transitions() { if id == self.id() { continue; } for unit in classes.elements(class) { if let Some(byte) = unit.as_u8() { if !accel.add(byte) { return None; } } } } if accel.is_empty() { None } else { Some(accel) } } } impl<'a> fmt::Debug for State<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { for (i, (start, end, sid)) in self.sparse_transitions().enumerate() { let id = if f.alternate() { sid.as_usize() } else { sid.as_usize() >> self.stride2 }; if i > 0 { write!(f, ", ")?; } if start == end { write!(f, "{:?} => {:?}", start, id)?; } else { write!(f, "{:?}-{:?} => {:?}", start, end, id)?; } } Ok(()) } } /// An iterator over all transitions in a single DFA state. This yields /// a number of transitions equivalent to the alphabet length of the /// corresponding DFA. /// /// Each transition is represented by a tuple. The first element is the input /// byte for that transition and the second element is the transition itself. #[derive(Debug)] pub(crate) struct StateTransitionIter<'a> { len: usize, it: iter::Enumerate>, } impl<'a> Iterator for StateTransitionIter<'a> { type Item = (alphabet::Unit, StateID); fn next(&mut self) -> Option<(alphabet::Unit, StateID)> { self.it.next().map(|(i, &id)| { let unit = if i + 1 == self.len { alphabet::Unit::eoi(i) } else { let b = u8::try_from(i) .expect("raw byte alphabet is never exceeded"); alphabet::Unit::u8(b) }; (unit, id) }) } } /// An iterator over all non-DEAD transitions in a single DFA state using a /// sparse representation. /// /// Each transition is represented by a triple. 
The first two elements of the /// triple comprise an inclusive byte range while the last element corresponds /// to the transition taken for all bytes in the range. /// /// As a convenience, this always returns `alphabet::Unit` values of the same /// type. That is, you'll never get a (byte, EOI) or a (EOI, byte). Only (byte, /// byte) and (EOI, EOI) values are yielded. #[derive(Debug)] pub(crate) struct StateSparseTransitionIter<'a> { dense: StateTransitionIter<'a>, cur: Option<(alphabet::Unit, alphabet::Unit, StateID)>, } impl<'a> Iterator for StateSparseTransitionIter<'a> { type Item = (alphabet::Unit, alphabet::Unit, StateID); fn next(&mut self) -> Option<(alphabet::Unit, alphabet::Unit, StateID)> { while let Some((unit, next)) = self.dense.next() { let (prev_start, prev_end, prev_next) = match self.cur { Some(t) => t, None => { self.cur = Some((unit, unit, next)); continue; } }; if prev_next == next && !unit.is_eoi() { self.cur = Some((prev_start, unit, prev_next)); } else { self.cur = Some((unit, unit, next)); if prev_next != DEAD { return Some((prev_start, prev_end, prev_next)); } } } if let Some((start, end, next)) = self.cur.take() { if next != DEAD { return Some((start, end, next)); } } None } } /// An error that occurred during the construction of a DFA. /// /// This error does not provide many introspection capabilities. There are /// generally only two things you can do with it: /// /// * Obtain a human readable message via its `std::fmt::Display` impl. /// * Access an underlying [`nfa::thompson::BuildError`](thompson::BuildError) /// type from its `source` method via the `std::error::Error` trait. This error /// only occurs when using convenience routines for building a DFA directly /// from a pattern string. /// /// When the `std` feature is enabled, this implements the `std::error::Error` /// trait. 
#[cfg(feature = "dfa-build")]
#[derive(Clone, Debug)]
pub struct BuildError {
    kind: BuildErrorKind,
}

#[cfg(feature = "dfa-build")]
impl BuildError {
    /// Returns true if and only if this error corresponds to an error with DFA
    /// construction that occurred because of exceeding a size limit.
    ///
    /// While this can occur when size limits like [`Config::dfa_size_limit`]
    /// or [`Config::determinize_size_limit`] are exceeded, this can also occur
    /// when the number of states or patterns exceeds a hard-coded maximum.
    /// (Where these maximums are derived based on the values representable by
    /// [`StateID`] and [`PatternID`].)
    ///
    /// This predicate is useful in contexts where you want to distinguish
    /// between errors related to something provided by an end user (for
    /// example, an invalid regex pattern) and errors related to configured
    /// heuristics. For example, building a DFA might be an optimization that
    /// you want to skip if construction fails because of an exceeded size
    /// limit, but where you want to bubble up an error if it fails for some
    /// other reason.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// # if !cfg!(target_pointer_width = "64") { return Ok(()); } // see #1039
    /// use regex_automata::{dfa::{dense, Automaton}, Input};
    ///
    /// let err = dense::Builder::new()
    ///     .configure(dense::Config::new()
    ///         .determinize_size_limit(Some(100_000))
    ///     )
    ///     .build(r"\w{20}")
    ///     .unwrap_err();
    /// // This error occurs because a size limit was exceeded.
    /// // But things are otherwise valid.
    /// assert!(err.is_size_limit_exceeded());
    ///
    /// let err = dense::Builder::new()
    ///     .build(r"\bxyz\b")
    ///     .unwrap_err();
    /// // This error occurs because a Unicode word boundary
    /// // was used without enabling heuristic support for it.
    /// // So... not related to size limits.
/// assert!(!err.is_size_limit_exceeded()); /// /// let err = dense::Builder::new() /// .build(r"(xyz") /// .unwrap_err(); /// // This error occurs because the pattern is invalid. /// // So... not related to size limits. /// assert!(!err.is_size_limit_exceeded()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn is_size_limit_exceeded(&self) -> bool { use self::BuildErrorKind::*; match self.kind { NFA(_) | Unsupported(_) => false, TooManyStates | TooManyStartStates | TooManyMatchPatternIDs | DFAExceededSizeLimit { .. } | DeterminizeExceededSizeLimit { .. } => true, } } } /// The kind of error that occurred during the construction of a DFA. /// /// Note that this error is non-exhaustive. Adding new variants is not /// considered a breaking change. #[cfg(feature = "dfa-build")] #[derive(Clone, Debug)] enum BuildErrorKind { /// An error that occurred while constructing an NFA as a precursor step /// before a DFA is compiled. NFA(thompson::BuildError), /// An error that occurred because an unsupported regex feature was used. /// The message string describes which unsupported feature was used. /// /// The primary regex feature that is unsupported by DFAs is the Unicode /// word boundary look-around assertion (`\b`). This can be worked around /// by either using an ASCII word boundary (`(?-u:\b)`) or by enabling /// Unicode word boundaries when building a DFA. Unsupported(&'static str), /// An error that occurs if too many states are produced while building a /// DFA. TooManyStates, /// An error that occurs if too many start states are needed while building /// a DFA. /// /// This is a kind of oddball error that occurs when building a DFA with /// start states enabled for each pattern and enough patterns to cause /// the table of start states to overflow `usize`. TooManyStartStates, /// This is another oddball error that can occur if there are too many /// patterns spread out across too many match states. 
TooManyMatchPatternIDs, /// An error that occurs if the DFA got too big during determinization. DFAExceededSizeLimit { limit: usize }, /// An error that occurs if auxiliary storage (not the DFA) used during /// determinization got too big. DeterminizeExceededSizeLimit { limit: usize }, } #[cfg(feature = "dfa-build")] impl BuildError { /// Return the kind of this error. fn kind(&self) -> &BuildErrorKind { &self.kind } pub(crate) fn nfa(err: thompson::BuildError) -> BuildError { BuildError { kind: BuildErrorKind::NFA(err) } } pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError { let msg = "cannot build DFAs for regexes with Unicode word \ boundaries; switch to ASCII word boundaries, or \ heuristically enable Unicode word boundaries or use a \ different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } pub(crate) fn too_many_states() -> BuildError { BuildError { kind: BuildErrorKind::TooManyStates } } pub(crate) fn too_many_start_states() -> BuildError { BuildError { kind: BuildErrorKind::TooManyStartStates } } pub(crate) fn too_many_match_pattern_ids() -> BuildError { BuildError { kind: BuildErrorKind::TooManyMatchPatternIDs } } pub(crate) fn dfa_exceeded_size_limit(limit: usize) -> BuildError { BuildError { kind: BuildErrorKind::DFAExceededSizeLimit { limit } } } pub(crate) fn determinize_exceeded_size_limit(limit: usize) -> BuildError { BuildError { kind: BuildErrorKind::DeterminizeExceededSizeLimit { limit }, } } } #[cfg(all(feature = "std", feature = "dfa-build"))] impl std::error::Error for BuildError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self.kind() { BuildErrorKind::NFA(ref err) => Some(err), _ => None, } } } #[cfg(feature = "dfa-build")] impl core::fmt::Display for BuildError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self.kind() { BuildErrorKind::NFA(_) => write!(f, "error building NFA"), BuildErrorKind::Unsupported(ref msg) => { write!(f, "unsupported 
regex feature for DFAs: {}", msg) } BuildErrorKind::TooManyStates => write!( f, "number of DFA states exceeds limit of {}", StateID::LIMIT, ), BuildErrorKind::TooManyStartStates => { let stride = Start::len(); // The start table has `stride` entries for starting states for // the entire DFA, and then `stride` entries for each pattern // if start states for each pattern are enabled (which is the // only way this error can occur). Thus, the total number of // patterns that can fit in the table is `stride` less than // what we can allocate. let max = usize::try_from(core::isize::MAX).unwrap(); let limit = (max - stride) / stride; write!( f, "compiling DFA with start states exceeds pattern \ pattern limit of {}", limit, ) } BuildErrorKind::TooManyMatchPatternIDs => write!( f, "compiling DFA with total patterns in all match states \ exceeds limit of {}", PatternID::LIMIT, ), BuildErrorKind::DFAExceededSizeLimit { limit } => write!( f, "DFA exceeded size limit of {:?} during determinization", limit, ), BuildErrorKind::DeterminizeExceededSizeLimit { limit } => { write!(f, "determinization exceeded size limit of {:?}", limit) } } } } #[cfg(all(test, feature = "syntax", feature = "dfa-build"))] mod tests { use crate::{Input, MatchError}; use super::*; #[test] fn errors_with_unicode_word_boundary() { let pattern = r"\b"; assert!(Builder::new().build(pattern).is_err()); } #[test] fn roundtrip_never_match() { let dfa = DFA::never_match().unwrap(); let (buf, _) = dfa.to_bytes_native_endian(); let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; assert_eq!(None, dfa.try_search_fwd(&Input::new("foo12345")).unwrap()); } #[test] fn roundtrip_always_match() { use crate::HalfMatch; let dfa = DFA::always_match().unwrap(); let (buf, _) = dfa.to_bytes_native_endian(); let dfa: DFA<&[u32]> = DFA::from_bytes(&buf).unwrap().0; assert_eq!( Some(HalfMatch::must(0, 0)), dfa.try_search_fwd(&Input::new("foo12345")).unwrap() ); } // See the analogous test in src/hybrid/dfa.rs. 
#[test]
fn heuristic_unicode_reverse() {
    let dfa = DFA::builder()
        .configure(DFA::config().unicode_word_boundary(true))
        .thompson(thompson::Config::new().reverse(true))
        .build(r"\b[0-9]+\b")
        .unwrap();

    let input = Input::new("β123").range(2..);
    let expected = MatchError::quit(0xB2, 1);
    let got = dfa.try_search_rev(&input);
    assert_eq!(Err(expected), got);

    let input = Input::new("123β").range(..3);
    let expected = MatchError::quit(0xCE, 3);
    let got = dfa.try_search_rev(&input);
    assert_eq!(Err(expected), got);
}
}
regex-automata-0.4.9/src/dfa/determinize.rs000064400000000000000000000621401046102023000167720ustar 00000000000000
use alloc::{collections::BTreeMap, vec::Vec};

use crate::{
    dfa::{
        dense::{self, BuildError},
        DEAD,
    },
    nfa::thompson,
    util::{
        self,
        alphabet::{self, ByteSet},
        determinize::{State, StateBuilderEmpty, StateBuilderNFA},
        primitives::{PatternID, StateID},
        search::{Anchored, MatchKind},
        sparse_set::SparseSets,
        start::Start,
    },
};

/// A builder for configuring and running a DFA determinizer.
#[derive(Clone, Debug)]
pub(crate) struct Config {
    match_kind: MatchKind,
    quit: ByteSet,
    dfa_size_limit: Option<usize>,
    determinize_size_limit: Option<usize>,
}

impl Config {
    /// Create a new default config for a determinizer. The determinizer may be
    /// configured before calling `run`.
    pub fn new() -> Config {
        Config {
            match_kind: MatchKind::LeftmostFirst,
            quit: ByteSet::empty(),
            dfa_size_limit: None,
            determinize_size_limit: None,
        }
    }

    /// Run determinization on the given NFA and write the resulting DFA into
    /// the one given. The DFA given should be initialized but otherwise empty.
    /// "Initialized" means that it is setup to handle the NFA's byte classes,
    /// number of patterns and whether to build start states for each pattern.
pub fn run( &self, nfa: &thompson::NFA, dfa: &mut dense::OwnedDFA, ) -> Result<(), BuildError> { let dead = State::dead(); let quit = State::dead(); let mut cache = StateMap::default(); // We only insert the dead state here since its representation is // identical to the quit state. And we never want anything pointing // to the quit state other than specific transitions derived from the // determinizer's configured "quit" bytes. // // We do put the quit state into 'builder_states' below. This ensures // that a proper DFA state ID is allocated for it, and that no other // DFA state uses the "location after the DEAD state." That is, it // is assumed that the quit state is always the state immediately // following the DEAD state. cache.insert(dead.clone(), DEAD); let runner = Runner { config: self.clone(), nfa, dfa, builder_states: alloc::vec![dead, quit], cache, memory_usage_state: 0, sparses: SparseSets::new(nfa.states().len()), stack: alloc::vec![], scratch_state_builder: StateBuilderEmpty::new(), }; runner.run() } /// The match semantics to use for determinization. /// /// MatchKind::All corresponds to the standard textbook construction. /// All possible match states are represented in the DFA. /// MatchKind::LeftmostFirst permits greediness and otherwise tries to /// simulate the match semantics of backtracking regex engines. Namely, /// only a subset of match states are built, and dead states are used to /// stop searches with an unanchored prefix. /// /// The default is MatchKind::LeftmostFirst. pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config { self.match_kind = kind; self } /// The set of bytes to use that will cause the DFA to enter a quit state, /// stop searching and return an error. By default, this is empty. pub fn quit(&mut self, set: ByteSet) -> &mut Config { self.quit = set; self } /// The limit, in bytes of the heap, that the DFA is permitted to use. This /// does not include the auxiliary heap storage used by determinization. 
pub fn dfa_size_limit(&mut self, bytes: Option) -> &mut Config { self.dfa_size_limit = bytes; self } /// The limit, in bytes of the heap, that determinization itself is allowed /// to use. This does not include the size of the DFA being built. pub fn determinize_size_limit( &mut self, bytes: Option, ) -> &mut Config { self.determinize_size_limit = bytes; self } } /// The actual implementation of determinization that converts an NFA to a DFA /// through powerset construction. /// /// This determinizer roughly follows the typical powerset construction, where /// each DFA state is comprised of one or more NFA states. In the worst case, /// there is one DFA state for every possible combination of NFA states. In /// practice, this only happens in certain conditions, typically when there are /// bounded repetitions. /// /// The main differences between this implementation and typical deteminization /// are that this implementation delays matches by one state and hackily makes /// look-around work. Comments below attempt to explain this. /// /// The lifetime variable `'a` refers to the lifetime of the NFA or DFA, /// whichever is shorter. #[derive(Debug)] struct Runner<'a> { /// The configuration used to initialize determinization. config: Config, /// The NFA we're converting into a DFA. nfa: &'a thompson::NFA, /// The DFA we're building. dfa: &'a mut dense::OwnedDFA, /// Each DFA state being built is defined as an *ordered* set of NFA /// states, along with some meta facts about the ordered set of NFA states. /// /// This is never empty. The first state is always a dummy state such that /// a state id == 0 corresponds to a dead state. The second state is always /// the quit state. /// /// Why do we have states in both a `Vec` and in a cache map below? /// Well, they serve two different roles based on access patterns. /// `builder_states` is the canonical home of each state, and provides /// constant random access by a DFA state's ID. 
The cache map below, on /// the other hand, provides a quick way of searching for identical DFA /// states by using the DFA state as a key in the map. Of course, we use /// reference counting to avoid actually duplicating the state's data /// itself. (Although this has never been benchmarked.) Note that the cache /// map does not give us full minimization; it just lets us avoid some very /// obvious redundant states. /// /// Note that the index into this Vec isn't quite the DFA's state ID. /// Rather, it's just an index. To get the state ID, you have to multiply /// it by the DFA's stride. That's done by self.dfa.from_index. And the /// inverse is self.dfa.to_index. /// /// Moreover, DFA states don't usually retain the IDs assigned to them /// by their position in this Vec. After determinization completes, /// states are shuffled around to support other optimizations. See the /// sibling 'special' module for more details on that. (The reason for /// mentioning this is that if you print out the DFA for debugging during /// determinization, and then print out the final DFA after it is fully /// built, then the state IDs likely won't match up.) builder_states: Vec, /// A cache of DFA states that already exist and can be easily looked up /// via ordered sets of NFA states. /// /// See `builder_states` docs for why we store states in two different /// ways. cache: StateMap, /// The memory usage, in bytes, used by builder_states and cache. We track /// this as new states are added since states use a variable amount of /// heap. Tracking this as we add states makes it possible to compute the /// total amount of memory used by the determinizer in constant time. memory_usage_state: usize, /// A pair of sparse sets for tracking ordered sets of NFA state IDs. /// These are reused throughout determinization. A bounded sparse set /// gives us constant time insertion, membership testing and clearing. 
sparses: SparseSets, /// Scratch space for a stack of NFA states to visit, for depth first /// visiting without recursion. stack: Vec, /// Scratch space for storing an ordered sequence of NFA states, for /// amortizing allocation. This is principally useful for when we avoid /// adding a new DFA state since it already exists. In order to detect this /// case though, we still need an ordered set of NFA state IDs. So we use /// this space to stage that ordered set before we know whether we need to /// create a new DFA state or not. scratch_state_builder: StateBuilderEmpty, } /// A map from states to state identifiers. When using std, we use a standard /// hashmap, since it's a bit faster for this use case. (Other maps, like /// one's based on FNV, have not yet been benchmarked.) /// /// The main purpose of this map is to reuse states where possible. This won't /// fully minimize the DFA, but it works well in a lot of cases. #[cfg(feature = "std")] type StateMap = std::collections::HashMap; #[cfg(not(feature = "std"))] type StateMap = BTreeMap; impl<'a> Runner<'a> { /// Build the DFA. If there was a problem constructing the DFA (e.g., if /// the chosen state identifier representation is too small), then an error /// is returned. fn run(mut self) -> Result<(), BuildError> { if self.nfa.look_set_any().contains_word_unicode() && !self.config.quit.contains_range(0x80, 0xFF) { return Err(BuildError::unsupported_dfa_word_boundary_unicode()); } // A sequence of "representative" bytes drawn from each equivalence // class. These representative bytes are fed to the NFA to compute // state transitions. This allows us to avoid re-computing state // transitions for bytes that are guaranteed to produce identical // results. Since computing the representatives needs to do a little // work, we do it once here because we'll be iterating over them a lot. 
let representatives: Vec<alphabet::Unit> =
    self.dfa.byte_classes().representatives(..).collect();
// The set of all DFA state IDs that still need to have their
// transitions set. We start by seeding this with all starting states.
let mut uncompiled = alloc::vec![];
self.add_all_starts(&mut uncompiled)?;
while let Some(dfa_id) = uncompiled.pop() {
    for &unit in &representatives {
        // Quit bytes are skipped here; no transition is computed for them.
        if unit.as_u8().map_or(false, |b| self.config.quit.contains(b)) {
            continue;
        }
        // In many cases, the state we transition to has already been
        // computed. 'cached_state' will do the minimal amount of work
        // to check this, and if it exists, immediately return an
        // already existing state ID.
        let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
        self.dfa.set_transition(dfa_id, unit, next_dfa_id);
        // If the state ID we got back is newly created, then we need
        // to compile it, so add it to our uncompiled frontier.
        if is_new {
            uncompiled.push(next_dfa_id);
        }
    }
}
debug!(
    "determinization complete, memory usage: {}, \
     dense DFA size: {}, \
     is reverse? {}",
    self.memory_usage(),
    self.dfa.memory_usage(),
    self.nfa.is_reverse(),
);

// A map from DFA state ID to one or more NFA match IDs. Each NFA match
// ID corresponds to a distinct regex pattern that matches in the state
// corresponding to the key.
let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
self.cache.clear();
#[cfg(feature = "logging")]
let mut total_pat_len = 0;
for (i, state) in self.builder_states.into_iter().enumerate() {
    if let Some(pat_ids) = state.match_pattern_ids() {
        let id = self.dfa.to_state_id(i);
        log! {
            total_pat_len += pat_ids.len();
        }
        matches.insert(id, pat_ids);
    }
}
log! {
    use core::mem::size_of;
    let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
    let pats = total_pat_len * size_of::<PatternID>();
    let mem = (matches.len() * per_elem) + pats;
    log::debug!("matches map built, memory usage: {}", mem);
}
// At this point, we shuffle the "special" states in the final DFA.
// This permits a DFA's match loop to detect a match condition (among // other things) by merely inspecting the current state's identifier, // and avoids the need for any additional auxiliary storage. self.dfa.shuffle(matches)?; Ok(()) } /// Return the identifier for the next DFA state given an existing DFA /// state and an input byte. If the next DFA state already exists, then /// return its identifier from the cache. Otherwise, build the state, cache /// it and return its identifier. /// /// This routine returns a boolean indicating whether a new state was /// built. If a new state is built, then the caller needs to add it to its /// frontier of uncompiled DFA states to compute transitions for. fn cached_state( &mut self, dfa_id: StateID, unit: alphabet::Unit, ) -> Result<(StateID, bool), BuildError> { // Compute the set of all reachable NFA states, including epsilons. let empty_builder = self.get_state_builder(); let builder = util::determinize::next( self.nfa, self.config.match_kind, &mut self.sparses, &mut self.stack, &self.builder_states[self.dfa.to_index(dfa_id)], unit, empty_builder, ); self.maybe_add_state(builder) } /// Compute the set of DFA start states and add their identifiers in /// 'dfa_state_ids' (no duplicates are added). fn add_all_starts( &mut self, dfa_state_ids: &mut Vec, ) -> Result<(), BuildError> { // These should be the first states added. assert!(dfa_state_ids.is_empty()); // We only want to add (un)anchored starting states that is consistent // with our DFA's configuration. Unconditionally adding both (although // it is the default) can make DFAs quite a bit bigger. if self.dfa.start_kind().has_unanchored() { self.add_start_group(Anchored::No, dfa_state_ids)?; } if self.dfa.start_kind().has_anchored() { self.add_start_group(Anchored::Yes, dfa_state_ids)?; } // I previously has an 'assert' here checking that either // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it // turns out this isn't always true. 
For example, the NFA might have // one or more patterns but where all such patterns are just 'fail' // states. These will ultimately just compile down to DFA dead states, // and since the dead state was added earlier, no new DFA states are // added. And thus, it is valid and okay for 'dfa_state_ids' to be // empty even if there are a non-zero number of patterns in the NFA. // We only need to compute anchored start states for each pattern if it // was requested to do so. if self.dfa.starts_for_each_pattern() { for pid in self.nfa.patterns() { self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?; } } Ok(()) } /// Add a group of start states for the given match pattern ID. Any new /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are /// pushed.) /// /// When pattern_id is None, then this will compile a group of unanchored /// start states (if the DFA is unanchored). When the pattern_id is /// present, then this will compile a group of anchored start states that /// only match the given pattern. /// /// This panics if `anchored` corresponds to an invalid pattern ID. fn add_start_group( &mut self, anchored: Anchored, dfa_state_ids: &mut Vec, ) -> Result<(), BuildError> { let nfa_start = match anchored { Anchored::No => self.nfa.start_unanchored(), Anchored::Yes => self.nfa.start_anchored(), Anchored::Pattern(pid) => { self.nfa.start_pattern(pid).expect("valid pattern ID") } }; // When compiling start states, we're careful not to build additional // states that aren't necessary. For example, if the NFA has no word // boundary assertion, then there's no reason to have distinct start // states for 'NonWordByte' and 'WordByte' starting configurations. // Instead, the 'WordByte' starting configuration can just point // directly to the start state for the 'NonWordByte' config. // // Note though that we only need to care about assertions in the prefix // of an NFA since this only concerns the starting states. 
(Actually, // the most precisely thing we could do it is look at the prefix // assertions of each pattern when 'anchored == Anchored::Pattern', // and then only compile extra states if the prefix is non-empty.) But // we settle for simplicity here instead of absolute minimalism. It is // somewhat rare, after all, for multiple patterns in the same regex to // have different prefix look-arounds. let (id, is_new) = self.add_one_start(nfa_start, Start::NonWordByte)?; self.dfa.set_start_state(anchored, Start::NonWordByte, id); if is_new { dfa_state_ids.push(id); } if !self.nfa.look_set_prefix_any().contains_word() { self.dfa.set_start_state(anchored, Start::WordByte, id); } else { let (id, is_new) = self.add_one_start(nfa_start, Start::WordByte)?; self.dfa.set_start_state(anchored, Start::WordByte, id); if is_new { dfa_state_ids.push(id); } } if !self.nfa.look_set_prefix_any().contains_anchor() { self.dfa.set_start_state(anchored, Start::Text, id); self.dfa.set_start_state(anchored, Start::LineLF, id); self.dfa.set_start_state(anchored, Start::LineCR, id); self.dfa.set_start_state( anchored, Start::CustomLineTerminator, id, ); } else { let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?; self.dfa.set_start_state(anchored, Start::Text, id); if is_new { dfa_state_ids.push(id); } let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?; self.dfa.set_start_state(anchored, Start::LineLF, id); if is_new { dfa_state_ids.push(id); } let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?; self.dfa.set_start_state(anchored, Start::LineCR, id); if is_new { dfa_state_ids.push(id); } let (id, is_new) = self.add_one_start(nfa_start, Start::CustomLineTerminator)?; self.dfa.set_start_state( anchored, Start::CustomLineTerminator, id, ); if is_new { dfa_state_ids.push(id); } } Ok(()) } /// Add a new DFA start state corresponding to the given starting NFA /// state, and the starting search configuration. 
(The starting search /// configuration essentially tells us which look-behind assertions are /// true for this particular state.) /// /// The boolean returned indicates whether the state ID returned is a newly /// created state, or a previously cached state. fn add_one_start( &mut self, nfa_start: StateID, start: Start, ) -> Result<(StateID, bool), BuildError> { // Compute the look-behind assertions that are true in this starting // configuration, and the determine the epsilon closure. While // computing the epsilon closure, we only follow condiional epsilon // transitions that satisfy the look-behind assertions in 'look_have'. let mut builder_matches = self.get_state_builder().into_matches(); util::determinize::set_lookbehind_from_start( self.nfa, &start, &mut builder_matches, ); self.sparses.set1.clear(); util::determinize::epsilon_closure( self.nfa, nfa_start, builder_matches.look_have(), &mut self.stack, &mut self.sparses.set1, ); let mut builder = builder_matches.into_nfa(); util::determinize::add_nfa_states( &self.nfa, &self.sparses.set1, &mut builder, ); self.maybe_add_state(builder) } /// Adds the given state to the DFA being built depending on whether it /// already exists in this determinizer's cache. /// /// If it does exist, then the memory used by 'state' is put back into the /// determinizer and the previously created state's ID is returned. (Along /// with 'false', indicating that no new state was added.) /// /// If it does not exist, then the state is added to the DFA being built /// and a fresh ID is allocated (if ID allocation fails, then an error is /// returned) and returned. (Along with 'true', indicating that a new state /// was added.) fn maybe_add_state( &mut self, builder: StateBuilderNFA, ) -> Result<(StateID, bool), BuildError> { if let Some(&cached_id) = self.cache.get(builder.as_bytes()) { // Since we have a cached state, put the constructed state's // memory back into our scratch space, so that it can be reused. 
self.put_state_builder(builder); return Ok((cached_id, false)); } self.add_state(builder).map(|sid| (sid, true)) } /// Add the given state to the DFA and make it available in the cache. /// /// The state initially has no transitions. That is, it transitions to the /// dead state for all possible inputs, and transitions to the quit state /// for all quit bytes. /// /// If adding the state would exceed the maximum value for StateID, then an /// error is returned. fn add_state( &mut self, builder: StateBuilderNFA, ) -> Result { let id = self.dfa.add_empty_state()?; if !self.config.quit.is_empty() { for b in self.config.quit.iter() { self.dfa.set_transition( id, alphabet::Unit::u8(b), self.dfa.quit_id(), ); } } let state = builder.to_state(); // States use reference counting internally, so we only need to count // their memory usage once. self.memory_usage_state += state.memory_usage(); self.builder_states.push(state.clone()); self.cache.insert(state, id); self.put_state_builder(builder); if let Some(limit) = self.config.dfa_size_limit { if self.dfa.memory_usage() > limit { return Err(BuildError::dfa_exceeded_size_limit(limit)); } } if let Some(limit) = self.config.determinize_size_limit { if self.memory_usage() > limit { return Err(BuildError::determinize_exceeded_size_limit( limit, )); } } Ok(id) } /// Returns a state builder from this determinizer that might have existing /// capacity. This helps avoid allocs in cases where a state is built that /// turns out to already be cached. /// /// Callers must put the state builder back with 'put_state_builder', /// otherwise the allocation reuse won't work. fn get_state_builder(&mut self) -> StateBuilderEmpty { core::mem::replace( &mut self.scratch_state_builder, StateBuilderEmpty::new(), ) } /// Puts the given state builder back into this determinizer for reuse. /// /// Note that building a 'State' from a builder always creates a new /// alloc, so callers should always put the builder back. 
fn put_state_builder(&mut self, builder: StateBuilderNFA) { let _ = core::mem::replace( &mut self.scratch_state_builder, builder.clear(), ); } /// Return the memory usage, in bytes, of this determinizer at the current /// point in time. This does not include memory used by the NFA or the /// dense DFA itself. fn memory_usage(&self) -> usize { use core::mem::size_of; self.builder_states.len() * size_of::() // Maps likely use more memory than this, but it's probably close. + self.cache.len() * (size_of::() + size_of::()) + self.memory_usage_state + self.stack.capacity() * size_of::() + self.scratch_state_builder.capacity() } } regex-automata-0.4.9/src/dfa/minimize.rs000064400000000000000000000431521046102023000162760ustar 00000000000000use core::{cell::RefCell, fmt, mem}; use alloc::{collections::BTreeMap, rc::Rc, vec, vec::Vec}; use crate::{ dfa::{automaton::Automaton, dense, DEAD}, util::{ alphabet, primitives::{PatternID, StateID}, }, }; /// An implementation of Hopcroft's algorithm for minimizing DFAs. /// /// The algorithm implemented here is mostly taken from Wikipedia: /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm /// /// This code has had some light optimization attention paid to it, /// particularly in the form of reducing allocation as much as possible. /// However, it is still generally slow. Future optimization work should /// probably focus on the bigger picture rather than micro-optimizations. For /// example: /// /// 1. Figure out how to more intelligently create initial partitions. That is, /// Hopcroft's algorithm starts by creating two partitions of DFA states /// that are known to NOT be equivalent: match states and non-match states. /// The algorithm proceeds by progressively refining these partitions into /// smaller partitions. If we could start with more partitions, then we /// could reduce the amount of work that Hopcroft's algorithm needs to do. /// 2. 
For every partition that we visit, we find all incoming transitions to /// every state in the partition for *every* element in the alphabet. (This /// is why using byte classes can significantly decrease minimization times, /// since byte classes shrink the alphabet.) This is quite costly and there /// is perhaps some redundant work being performed depending on the specific /// states in the set. For example, we might be able to only visit some /// elements of the alphabet based on the transitions. /// 3. Move parts of minimization into determinization. If minimization has /// fewer states to deal with, then it should run faster. A prime example /// of this might be large Unicode classes, which are generated in way that /// can create a lot of redundant states. (Some work has been done on this /// point during NFA compilation via the algorithm described in the /// "Incremental Construction of MinimalAcyclic Finite-State Automata" /// paper.) pub(crate) struct Minimizer<'a> { dfa: &'a mut dense::OwnedDFA, in_transitions: Vec>>, partitions: Vec, waiting: Vec, } impl<'a> fmt::Debug for Minimizer<'a> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { f.debug_struct("Minimizer") .field("dfa", &self.dfa) .field("in_transitions", &self.in_transitions) .field("partitions", &self.partitions) .field("waiting", &self.waiting) .finish() } } /// A set of states. A state set makes up a single partition in Hopcroft's /// algorithm. /// /// It is represented by an ordered set of state identifiers. We use shared /// ownership so that a single state set can be in both the set of partitions /// and in the set of waiting sets simultaneously without an additional /// allocation. Generally, once a state set is built, it becomes immutable. /// /// We use this representation because it avoids the overhead of more /// traditional set data structures (HashSet/BTreeSet), and also because /// computing intersection/subtraction on this representation is especially /// fast. 
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] struct StateSet { ids: Rc>>, } impl<'a> Minimizer<'a> { pub fn new(dfa: &'a mut dense::OwnedDFA) -> Minimizer<'a> { let in_transitions = Minimizer::incoming_transitions(dfa); let partitions = Minimizer::initial_partitions(dfa); let waiting = partitions.clone(); Minimizer { dfa, in_transitions, partitions, waiting } } pub fn run(mut self) { let stride2 = self.dfa.stride2(); let as_state_id = |index: usize| -> StateID { StateID::new(index << stride2).unwrap() }; let as_index = |id: StateID| -> usize { id.as_usize() >> stride2 }; let mut incoming = StateSet::empty(); let mut scratch1 = StateSet::empty(); let mut scratch2 = StateSet::empty(); let mut newparts = vec![]; // This loop is basically Hopcroft's algorithm. Everything else is just // shuffling data around to fit our representation. while let Some(set) = self.waiting.pop() { for b in self.dfa.byte_classes().iter() { self.find_incoming_to(b, &set, &mut incoming); // If incoming is empty, then the intersection with any other // set must also be empty. So 'newparts' just ends up being // 'self.partitions'. So there's no need to go through the loop // below. // // This actually turns out to be rather large optimization. On // the order of making minimization 4-5x faster. It's likely // that the vast majority of all states have very few incoming // transitions. 
if incoming.is_empty() { continue; } for p in 0..self.partitions.len() { self.partitions[p].intersection(&incoming, &mut scratch1); if scratch1.is_empty() { newparts.push(self.partitions[p].clone()); continue; } self.partitions[p].subtract(&incoming, &mut scratch2); if scratch2.is_empty() { newparts.push(self.partitions[p].clone()); continue; } let (x, y) = (scratch1.deep_clone(), scratch2.deep_clone()); newparts.push(x.clone()); newparts.push(y.clone()); match self.find_waiting(&self.partitions[p]) { Some(i) => { self.waiting[i] = x; self.waiting.push(y); } None => { if x.len() <= y.len() { self.waiting.push(x); } else { self.waiting.push(y); } } } } newparts = mem::replace(&mut self.partitions, newparts); newparts.clear(); } } // At this point, we now have a minimal partitioning of states, where // each partition is an equivalence class of DFA states. Now we need to // use this partitioning to update the DFA to only contain one state for // each partition. // Create a map from DFA state ID to the representative ID of the // equivalence class to which it belongs. The representative ID of an // equivalence class of states is the minimum ID in that class. let mut state_to_part = vec![DEAD; self.dfa.state_len()]; for p in &self.partitions { p.iter(|id| state_to_part[as_index(id)] = p.min()); } // Generate a new contiguous sequence of IDs for minimal states, and // create a map from equivalence IDs to the new IDs. Thus, the new // minimal ID of *any* state in the unminimized DFA can be obtained // with minimals_ids[state_to_part[old_id]]. let mut minimal_ids = vec![DEAD; self.dfa.state_len()]; let mut new_index = 0; for state in self.dfa.states() { if state_to_part[as_index(state.id())] == state.id() { minimal_ids[as_index(state.id())] = as_state_id(new_index); new_index += 1; } } // The total number of states in the minimal DFA. let minimal_count = new_index; // Convenience function for remapping state IDs. 
This takes an old ID, // looks up its Hopcroft partition and then maps that to the new ID // range. let remap = |old| minimal_ids[as_index(state_to_part[as_index(old)])]; // Re-map this DFA in place such that the only states remaining // correspond to the representative states of every equivalence class. for id in (0..self.dfa.state_len()).map(as_state_id) { // If this state isn't a representative for an equivalence class, // then we skip it since it won't appear in the minimal DFA. if state_to_part[as_index(id)] != id { continue; } self.dfa.remap_state(id, remap); self.dfa.swap_states(id, minimal_ids[as_index(id)]); } // Trim off all unused states from the pre-minimized DFA. This // represents all states that were merged into a non-singleton // equivalence class of states, and appeared after the first state // in each such class. (Because the state with the smallest ID in each // equivalence class is its representative ID.) self.dfa.truncate_states(minimal_count); // Update the new start states, which is now just the minimal ID of // whatever state the old start state was collapsed into. Also, we // collect everything before-hand to work around the borrow checker. // We're already allocating so much that this is probably fine. If this // turns out to be costly, then I guess add a `starts_mut` iterator. let starts: Vec<_> = self.dfa.starts().collect(); for (old_start_id, anchored, start_type) in starts { self.dfa.set_start_state( anchored, start_type, remap(old_start_id), ); } // Update the match state pattern ID list for multi-regexes. All we // need to do is remap the match state IDs. The pattern ID lists are // always the same as they were since match states with distinct // pattern ID lists are always considered distinct states. 
let mut pmap = BTreeMap::new(); for (match_id, pattern_ids) in self.dfa.pattern_map() { let new_id = remap(match_id); pmap.insert(new_id, pattern_ids); } // This unwrap is OK because minimization never increases the number of // match states or patterns in those match states. Since minimization // runs after the pattern map has already been set at least once, we // know that our match states cannot error. self.dfa.set_pattern_map(&pmap).unwrap(); // In order to update the ID of the maximum match state, we need to // find the maximum ID among all of the match states in the minimized // DFA. This is not necessarily the new ID of the unminimized maximum // match state, since that could have been collapsed with a much // earlier match state. Therefore, to find the new max match state, // we iterate over all previous match states, find their corresponding // new minimal ID, and take the maximum of those. let old = self.dfa.special().clone(); let new = self.dfa.special_mut(); // ... but only remap if we had match states. if old.matches() { new.min_match = StateID::MAX; new.max_match = StateID::ZERO; for i in as_index(old.min_match)..=as_index(old.max_match) { let new_id = remap(as_state_id(i)); if new_id < new.min_match { new.min_match = new_id; } if new_id > new.max_match { new.max_match = new_id; } } } // ... same, but for start states. 
if old.starts() { new.min_start = StateID::MAX; new.max_start = StateID::ZERO; for i in as_index(old.min_start)..=as_index(old.max_start) { let new_id = remap(as_state_id(i)); if new_id == DEAD { continue; } if new_id < new.min_start { new.min_start = new_id; } if new_id > new.max_start { new.max_start = new_id; } } if new.max_start == DEAD { new.min_start = DEAD; } } new.quit_id = remap(new.quit_id); new.set_max(); } fn find_waiting(&self, set: &StateSet) -> Option { self.waiting.iter().position(|s| s == set) } fn find_incoming_to( &self, b: alphabet::Unit, set: &StateSet, incoming: &mut StateSet, ) { incoming.clear(); set.iter(|id| { for &inid in &self.in_transitions[self.dfa.to_index(id)][b.as_usize()] { incoming.add(inid); } }); incoming.canonicalize(); } fn initial_partitions(dfa: &dense::OwnedDFA) -> Vec { // For match states, we know that two match states with different // pattern ID lists will *always* be distinct, so we can partition them // initially based on that. let mut matching: BTreeMap, StateSet> = BTreeMap::new(); let mut is_quit = StateSet::empty(); let mut no_match = StateSet::empty(); for state in dfa.states() { if dfa.is_match_state(state.id()) { let mut pids = vec![]; for i in 0..dfa.match_len(state.id()) { pids.push(dfa.match_pattern(state.id(), i)); } matching .entry(pids) .or_insert(StateSet::empty()) .add(state.id()); } else if dfa.is_quit_state(state.id()) { is_quit.add(state.id()); } else { no_match.add(state.id()); } } let mut sets: Vec = matching.into_iter().map(|(_, set)| set).collect(); sets.push(no_match); sets.push(is_quit); sets } fn incoming_transitions(dfa: &dense::OwnedDFA) -> Vec>> { let mut incoming = vec![]; for _ in dfa.states() { incoming.push(vec![vec![]; dfa.alphabet_len()]); } for state in dfa.states() { for (b, next) in state.transitions() { incoming[dfa.to_index(next)][b.as_usize()].push(state.id()); } } incoming } } impl StateSet { fn empty() -> StateSet { StateSet { ids: Rc::new(RefCell::new(vec![])) } } fn add(&mut 
self, id: StateID) { self.ids.borrow_mut().push(id); } fn min(&self) -> StateID { self.ids.borrow()[0] } fn canonicalize(&mut self) { self.ids.borrow_mut().sort(); self.ids.borrow_mut().dedup(); } fn clear(&mut self) { self.ids.borrow_mut().clear(); } fn len(&self) -> usize { self.ids.borrow().len() } fn is_empty(&self) -> bool { self.len() == 0 } fn deep_clone(&self) -> StateSet { let ids = self.ids.borrow().iter().cloned().collect(); StateSet { ids: Rc::new(RefCell::new(ids)) } } fn iter(&self, mut f: F) { for &id in self.ids.borrow().iter() { f(id); } } fn intersection(&self, other: &StateSet, dest: &mut StateSet) { dest.clear(); if self.is_empty() || other.is_empty() { return; } let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); loop { if a == b { dest.add(a); a = match ita.next() { None => break, Some(a) => a, }; b = match itb.next() { None => break, Some(b) => b, }; } else if a < b { a = match ita.next() { None => break, Some(a) => a, }; } else { b = match itb.next() { None => break, Some(b) => b, }; } } } fn subtract(&self, other: &StateSet, dest: &mut StateSet) { dest.clear(); if self.is_empty() || other.is_empty() { self.iter(|s| dest.add(s)); return; } let (seta, setb) = (self.ids.borrow(), other.ids.borrow()); let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); loop { if a == b { a = match ita.next() { None => break, Some(a) => a, }; b = match itb.next() { None => { dest.add(a); break; } Some(b) => b, }; } else if a < b { dest.add(a); a = match ita.next() { None => break, Some(a) => a, }; } else { b = match itb.next() { None => { dest.add(a); break; } Some(b) => b, }; } } for a in ita { dest.add(a); } } } regex-automata-0.4.9/src/dfa/mod.rs000064400000000000000000000371631046102023000152410ustar 00000000000000/*! 
A module for building and searching with deterministic finite automata (DFAs).

Like other modules in this crate, DFAs support a rich regex syntax with Unicode
features. DFAs also have extensive options for configuring the best space vs
time trade off for your use case and provide support for cheap deserialization
of automata for use in `no_std` environments.

If you're looking for lazy DFAs that build themselves incrementally during
search, then please see the top-level [`hybrid` module](crate::hybrid).

# Overview

This section gives a brief overview of the primary types in this module:

* A [`regex::Regex`] provides a way to search for matches of a regular
expression using DFAs. This includes iterating over matches with both the start
and end positions of each match.
* A [`dense::DFA`] provides low level access to a DFA that uses a dense
representation (uses lots of space, but fast searching).
* A [`sparse::DFA`] provides the same API as a `dense::DFA`, but uses a sparse
representation (uses less space, but slower searching).
* An [`Automaton`] trait that defines an interface that both dense and sparse
DFAs implement. (A `regex::Regex` is generic over this trait.)
* Both dense DFAs and sparse DFAs support serialization to raw bytes (e.g.,
[`dense::DFA::to_bytes_little_endian`]) and cheap deserialization (e.g.,
[`dense::DFA::from_bytes`]).

There is also a [`onepass`] module that provides a [one-pass
DFA](onepass::DFA). The unique advantage of this DFA is that, for the class
of regexes it can be built with, it supports reporting the spans of matching
capturing groups. It is the only DFA in this crate capable of such a thing.

# Example: basic regex searching

This example shows how to compile a regex using the default configuration and
then use it to find matches in a byte string:

```
use regex_automata::{Match, dfa::regex::Regex};

let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
    Match::must(0, 0..10),
    Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```

# Example: searching with regex sets

The DFAs in this module all fully support searching with multiple regexes
simultaneously. You can use this support with standard leftmost-first style
searching to find non-overlapping matches:

```
# if cfg!(miri) { return Ok(()); } // miri takes too long
use regex_automata::{Match, dfa::regex::Regex};

let re = Regex::new_many(&[r"\w+", r"\S+"])?;
let text = b"@foo bar";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
    Match::must(1, 0..4),
    Match::must(0, 5..8),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```

# Example: use sparse DFAs

By default, compiling a regex will use dense DFAs internally. This uses more
memory, but executes searches more quickly. If you can abide slower searches
(somewhere around 3-5x), then sparse DFAs might make more sense since they can
use significantly less space.

Using sparse DFAs is as easy as using `Regex::new_sparse` instead of
`Regex::new`:

```
use regex_automata::{Match, dfa::regex::Regex};

let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re.find_iter(text).collect();
assert_eq!(matches, vec![
    Match::must(0, 0..10),
    Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```

If you already have dense DFAs for some reason, they can be converted to sparse
DFAs and used to build a new `Regex`.
For example:

```
use regex_automata::{Match, dfa::regex::Regex};

let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let sparse_re = Regex::builder().build_from_dfas(
    dense_re.forward().to_sparse()?,
    dense_re.reverse().to_sparse()?,
);
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = sparse_re.find_iter(text).collect();
assert_eq!(matches, vec![
    Match::must(0, 0..10),
    Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```

# Example: deserialize a DFA

This shows how to first serialize a DFA into raw bytes, and then deserialize
those raw bytes back into a DFA. While this particular example is a
bit contrived, this same technique can be used in your program to
deserialize a DFA at start up time or by memory mapping a file.

```
use regex_automata::{Match, dfa::{dense, regex::Regex}};

let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both the forward and reverse DFAs, see note below
let (fwd_bytes, fwd_pad) = re1.forward().to_bytes_native_endian();
let (rev_bytes, rev_pad) = re1.reverse().to_bytes_native_endian();
// now deserialize both---we need to specify the correct type!
let fwd: dense::DFA<&[u32]> = dense::DFA::from_bytes(&fwd_bytes[fwd_pad..])?.0;
let rev: dense::DFA<&[u32]> = dense::DFA::from_bytes(&rev_bytes[rev_pad..])?.0;
// finally, reconstruct our regex
let re2 = Regex::builder().build_from_dfas(fwd, rev);

// we can use it like normal
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re2.find_iter(text).collect();
assert_eq!(matches, vec![
    Match::must(0, 0..10),
    Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```

There are a few points worth noting here:

* We need to extract the raw DFAs used by the regex and serialize those. You
can build the DFAs manually yourself using [`dense::Builder`], but using
the DFAs from a `Regex` guarantees that the DFAs are built correctly. (In
particular, a `Regex` constructs a reverse DFA for finding the starting
location of matches.)
* To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method.
In practice, you'll want to use either [`dense::DFA::to_bytes_little_endian`]
or [`dense::DFA::to_bytes_big_endian`], depending on which platform you're
deserializing your DFA from. If you intend to deserialize on either platform,
then you'll need to serialize both and deserialize the right one depending on
your target's endianness.
* Safely deserializing a DFA requires verifying the raw bytes, particularly if
they are untrusted, since an invalid DFA could cause logical errors, panics
or even undefined behavior. This verification step requires visiting all of
the transitions in the DFA, which can be costly. If cheaper verification is
desired, then [`dense::DFA::from_bytes_unchecked`] is available that only does
verification that can be performed in constant time. However, one can only use
this routine if the caller can guarantee that the bytes provided encode a
valid DFA.

The same process can be achieved with sparse DFAs as well:

```
use regex_automata::{Match, dfa::{sparse, regex::Regex}};

let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
// serialize both
let fwd_bytes = re1.forward().to_sparse()?.to_bytes_native_endian();
let rev_bytes = re1.reverse().to_sparse()?.to_bytes_native_endian();
// now deserialize both---we need to specify the correct type!
let fwd: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&fwd_bytes)?.0;
let rev: sparse::DFA<&[u8]> = sparse::DFA::from_bytes(&rev_bytes)?.0;
// finally, reconstruct our regex
let re2 = Regex::builder().build_from_dfas(fwd, rev);

// we can use it like normal
let text = b"2018-12-24 2016-10-08";
let matches: Vec<Match> = re2.find_iter(text).collect();
assert_eq!(matches, vec![
    Match::must(0, 0..10),
    Match::must(0, 11..21),
]);
# Ok::<(), Box<dyn std::error::Error>>(())
```

Note that unlike dense DFAs, sparse DFAs have no alignment requirements.
Conversely, dense DFAs must be aligned to the same alignment as a
[`StateID`](crate::util::primitives::StateID).

# Support for `no_std` and `alloc`-only

This crate comes with `alloc` and `std` features that are enabled by default.
When the `alloc` or `std` features are enabled, the API of this module will
include the facilities necessary for compiling, serializing, deserializing
and searching with DFAs. When only the `alloc` feature is enabled, then
implementations of the `std::error::Error` trait are dropped, but everything
else generally remains the same. When both the `alloc` and `std` features are
disabled, the API of this module will shrink such that it only includes the
facilities necessary for deserializing and searching with DFAs.

The intended workflow for `no_std` environments is thus as follows:

* Write a program with the `alloc` or `std` features that compiles and
serializes a regular expression. You may need to serialize both little and big
endian versions of each DFA. (So that's 4 DFAs in total for each regex.)
* In your `no_std` environment, follow the examples above for deserializing
your previously serialized DFAs into regexes. You can then search with them as
you would any regex.

Deserialization can happen anywhere. For example, with bytes embedded into a
binary or with a file memory mapped at runtime.

The `regex-cli` command (found in the same repository as this crate) can be
used to serialize DFAs to files and generate Rust code to read them.

# Syntax

This module supports the same syntax as the `regex` crate, since they share the
same parser. You can find an exhaustive list of supported syntax in the
[documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax).

There are two things that are not supported by the DFAs in this module:

* Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top
of them) can only find the offsets of an entire match, but cannot resolve
the offsets of each capturing group. This is because DFAs do not have the
expressive power necessary.
* Unicode word boundaries.
These present particularly difficult challenges for DFA construction and would result in an explosion in the number of states. One can enable [`dense::Config::unicode_word_boundary`] though, which provides heuristic support for Unicode word boundaries that only works on ASCII text. Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work on any input. There are no plans to lift either of these limitations. Note that these restrictions are identical to the restrictions on lazy DFAs. # Differences with general purpose regexes The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a general purpose regular expression engine. It aims to automatically balance low compile times, fast search times and low memory usage, while also providing a convenient API for users. In contrast, this module provides a lower level regular expression interface based exclusively on DFAs that is a bit less convenient while providing more explicit control over memory usage and search times. Here are some specific negative differences: * **Compilation can take an exponential amount of time and space** in the size of the regex pattern. While most patterns do not exhibit worst case exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA with approximately `2^(N+2)` states. For this reason, untrusted patterns should not be compiled with this module. (In the future, the API may expose an option to return an error if the DFA gets too big.) * This module does not support sub-match extraction via capturing groups, which can be achieved with the regex crate's "captures" API. * While the regex crate doesn't necessarily sport fast compilation times, the regexes in this module are almost universally slow to compile, especially when they contain large Unicode character classes. For example, on my system, compiling `\w{50}` takes about 1 second and almost 15MB of memory! 
(Compiling a sparse regex takes about the same time but only uses about 1.2MB of memory.) Conversely, compiling the same regex without Unicode support, e.g., `(?-u)\w{50}`, takes under 1 millisecond and about 15KB of memory. For this reason, you should only use Unicode character classes if you absolutely need them! (They are enabled by default though.) * This module does not support Unicode word boundaries. ASCII word boundaries may be used though by disabling Unicode or selectively doing so in the syntax, e.g., `(?-u:\b)`. There is also an option to [heuristically enable Unicode word boundaries](crate::dfa::dense::Config::unicode_word_boundary), where the corresponding DFA will give up if any non-ASCII byte is seen. * As a lower level API, this module does not do literal optimizations automatically. Although it does provide hooks in its API to make use of the [`Prefilter`](crate::util::prefilter::Prefilter) trait. Missing literal optimizations means that searches may run much slower than what you're accustomed to, although, it does provide more predictable and consistent performance. * There is no `&str` API like in the regex crate. In this module, all APIs operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8 boundaries, unless either of [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) or [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) are disabled. With some of the downsides out of the way, here are some positive differences: * Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply deserialized. Deserialization can be done in constant time with the unchecked APIs, since searching can be performed directly on the raw serialized bytes of a DFA. * This module was specifically designed so that the searching phase of a DFA has minimal runtime requirements, and can therefore be used in `no_std` environments. 
While `no_std` environments cannot compile regexes, they can deserialize pre-compiled regexes. * Since this module builds DFAs ahead of time, it will generally out-perform the `regex` crate on equivalent tasks. The performance difference is likely not large. However, because of a complex set of optimizations in the regex crate (like literal optimizations), an accurate performance comparison may be difficult to do. * Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search performance a small amount, but uses much less storage space. Potentially even less than what the regex crate uses. * This module exposes DFAs directly, such as [`dense::DFA`] and [`sparse::DFA`], which enables one to do less work in some cases. For example, if you only need the end of a match and not the start of a match, then you can use a DFA directly without building a `Regex`, which always requires a second DFA to find the start of a match. * This module provides more control over memory usage. Aside from choosing between dense and sparse DFAs, one can also choose a smaller state identifier representation to use less space. Also, one can enable DFA minimization via [`dense::Config::minimize`], but it can increase compilation times dramatically. */ #[cfg(feature = "dfa-search")] pub use crate::dfa::{ automaton::{Automaton, OverlappingState, StartError}, start::StartKind, }; /// This is an alias for a state ID of zero. It has special significance /// because it always corresponds to the first state in a DFA, and the first /// state in a DFA is always "dead." That is, the dead state always has all /// of its transitions set to itself. Moreover, the dead state is used as a /// sentinel for various things. e.g., In search, reaching a dead state means /// that the search must stop. 
const DEAD: crate::util::primitives::StateID = crate::util::primitives::StateID::ZERO; #[cfg(feature = "dfa-search")] pub mod dense; #[cfg(feature = "dfa-onepass")] pub mod onepass; #[cfg(feature = "dfa-search")] pub mod regex; #[cfg(feature = "dfa-search")] pub mod sparse; #[cfg(feature = "dfa-search")] pub(crate) mod accel; #[cfg(feature = "dfa-search")] mod automaton; #[cfg(feature = "dfa-build")] mod determinize; #[cfg(feature = "dfa-build")] mod minimize; #[cfg(any(feature = "dfa-build", feature = "dfa-onepass"))] mod remapper; #[cfg(feature = "dfa-search")] mod search; #[cfg(feature = "dfa-search")] mod special; #[cfg(feature = "dfa-search")] mod start; regex-automata-0.4.9/src/dfa/onepass.rs000064400000000000000000004040021046102023000161200ustar 00000000000000/*! A DFA that can return spans for matching capturing groups. This module is the home of a [one-pass DFA](DFA). This module also contains a [`Builder`] and a [`Config`] for building and configuring a one-pass DFA. */ // A note on naming and credit: // // As far as I know, Russ Cox came up with the practical vision and // implementation of a "one-pass regex engine." He mentions and describes it // briefly in the third article of his regexp article series: // https://swtch.com/~rsc/regexp/regexp3.html // // Cox's implementation is in RE2, and the implementation below is most // heavily inspired by RE2's. The key thing they have in common is that // their transitions are defined over an alphabet of bytes. In contrast, // Go's regex engine also has a one-pass engine, but its transitions are // more firmly rooted on Unicode codepoints. The ideas are the same, but the // implementations are different. // // RE2 tends to call this a "one-pass NFA." Here, we call it a "one-pass DFA." // They're both true in their own ways: // // * The "one-pass" criterion is generally a property of the NFA itself. 
In // particular, it is said that an NFA is one-pass if, after each byte of input // during a search, there is at most one "VM thread" remaining to take for the // next byte of input. That is, there is never any ambiguity as to the path to // take through the NFA during a search. // // * On the other hand, once a one-pass NFA has its representation converted // to something where a constant number of instructions is used for each byte // of input, the implementation looks a lot more like a DFA. It's technically // more powerful than a DFA since it has side effects (storing offsets inside // of slots activated by a transition), but it is far closer to a DFA than an // NFA simulation. // // Thus, in this crate, we call it a one-pass DFA. use alloc::{vec, vec::Vec}; use crate::{ dfa::{remapper::Remapper, DEAD}, nfa::thompson::{self, NFA}, util::{ alphabet::ByteClasses, captures::Captures, escape::DebugByte, int::{Usize, U32, U64, U8}, look::{Look, LookSet, UnicodeWordBoundaryError}, primitives::{NonMaxUsize, PatternID, StateID}, search::{Anchored, Input, Match, MatchError, MatchKind, Span}, sparse_set::SparseSet, }, }; /// The configuration used for building a [one-pass DFA](DFA). /// /// A one-pass DFA configuration is a simple data object that is typically used /// with [`Builder::configure`]. It can be cheaply cloned. /// /// A default configuration can be created either with `Config::new`, or /// perhaps more conveniently, with [`DFA::config`]. #[derive(Clone, Debug, Default)] pub struct Config { match_kind: Option, starts_for_each_pattern: Option, byte_classes: Option, size_limit: Option>, } impl Config { /// Return a new default one-pass DFA configuration. pub fn new() -> Config { Config::default() } /// Set the desired match semantics. /// /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the /// match semantics of Perl-like regex engines. 
That is, when multiple /// patterns would match at the same leftmost position, the pattern that /// appears first in the concrete syntax is chosen. /// /// Currently, the only other kind of match semantics supported is /// [`MatchKind::All`]. This corresponds to "classical DFA" construction /// where all possible matches are visited. /// /// When it comes to the one-pass DFA, it is rarer for preference order and /// "longest match" to actually disagree. Since if they did disagree, then /// the regex typically isn't one-pass. For example, searching `Samwise` /// for `Sam|Samwise` will report `Sam` for leftmost-first matching and /// `Samwise` for "longest match" or "all" matching. However, this regex is /// not one-pass if taken literally. The equivalent regex, `Sam(?:|wise)` /// is one-pass and `Sam|Samwise` may be optimized to it. /// /// The other main difference is that "all" match semantics don't support /// non-greedy matches. "All" match semantics always try to match as much /// as possible. pub fn match_kind(mut self, kind: MatchKind) -> Config { self.match_kind = Some(kind); self } /// Whether to compile a separate start state for each pattern in the /// one-pass DFA. /// /// When enabled, a separate **anchored** start state is added for each /// pattern in the DFA. When this start state is used, then the DFA will /// only search for matches for the pattern specified, even if there are /// other patterns in the DFA. /// /// The main downside of this option is that it can potentially increase /// the size of the DFA and/or increase the time it takes to build the DFA. /// /// You might want to enable this option when you want to both search for /// anchored matches of any pattern or to search for anchored matches of /// one particular pattern while using the same DFA. (Otherwise, you would /// need to compile a new DFA for each pattern.) /// /// By default this is disabled. 
/// /// # Example /// /// This example shows how to build a multi-regex and then search for /// matches for a any of the patterns or matches for a specific pattern. /// /// ``` /// use regex_automata::{ /// dfa::onepass::DFA, Anchored, Input, Match, PatternID, /// }; /// /// let re = DFA::builder() /// .configure(DFA::config().starts_for_each_pattern(true)) /// .build_many(&["[a-z]+", "[0-9]+"])?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "123abc"; /// let input = Input::new(haystack).anchored(Anchored::Yes); /// /// // A normal multi-pattern search will show pattern 1 matches. /// re.try_search(&mut cache, &input, &mut caps)?; /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match()); /// /// // If we only want to report pattern 0 matches, then we'll get no /// // match here. /// let input = input.anchored(Anchored::Pattern(PatternID::must(0))); /// re.try_search(&mut cache, &input, &mut caps)?; /// assert_eq!(None, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn starts_for_each_pattern(mut self, yes: bool) -> Config { self.starts_for_each_pattern = Some(yes); self } /// Whether to attempt to shrink the size of the DFA's alphabet or not. /// /// This option is enabled by default and should never be disabled unless /// one is debugging a one-pass DFA. /// /// When enabled, the DFA will use a map from all possible bytes to their /// corresponding equivalence class. Each equivalence class represents a /// set of bytes that does not discriminate between a match and a non-match /// in the DFA. For example, the pattern `[ab]+` has at least two /// equivalence classes: a set containing `a` and `b` and a set containing /// every byte except for `a` and `b`. `a` and `b` are in the same /// equivalence class because they never discriminate between a match and a /// non-match. 
/// /// The advantage of this map is that the size of the transition table /// can be reduced drastically from (approximately) `#states * 256 * /// sizeof(StateID)` to `#states * k * sizeof(StateID)` where `k` is the /// number of equivalence classes (rounded up to the nearest power of 2). /// As a result, total space usage can decrease substantially. Moreover, /// since a smaller alphabet is used, DFA compilation becomes faster as /// well. /// /// **WARNING:** This is only useful for debugging DFAs. Disabling this /// does not yield any speed advantages. Namely, even when this is /// disabled, a byte class map is still used while searching. The only /// difference is that every byte will be forced into its own distinct /// equivalence class. This is useful for debugging the actual generated /// transitions because it lets one see the transitions defined on actual /// bytes instead of the equivalence classes. pub fn byte_classes(mut self, yes: bool) -> Config { self.byte_classes = Some(yes); self } /// Set a size limit on the total heap used by a one-pass DFA. /// /// This size limit is expressed in bytes and is applied during /// construction of a one-pass DFA. If the DFA's heap usage exceeds /// this configured limit, then construction is stopped and an error is /// returned. /// /// The default is no limit. /// /// # Example /// /// This example shows a one-pass DFA that fails to build because of /// a configured size limit. This particular example also serves as a /// cautionary tale demonstrating just how big DFAs with large Unicode /// character classes can get. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{dfa::onepass::DFA, Match}; /// /// // 6MB isn't enough! /// DFA::builder() /// .configure(DFA::config().size_limit(Some(6_000_000))) /// .build(r"\w{20}") /// .unwrap_err(); /// /// // ... but 7MB probably is! /// // (Note that DFA sizes aren't necessarily stable between releases.) 
/// let re = DFA::builder() /// .configure(DFA::config().size_limit(Some(7_000_000))) /// .build(r"\w{20}")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "A".repeat(20); /// re.captures(&mut cache, &haystack, &mut caps); /// assert_eq!(Some(Match::must(0, 0..20)), caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// While one needs a little more than 3MB to represent `\w{20}`, it /// turns out that you only need a little more than 4KB to represent /// `(?-u:\w{20})`. So only use Unicode if you need it! pub fn size_limit(mut self, limit: Option) -> Config { self.size_limit = Some(limit); self } /// Returns the match semantics set in this configuration. pub fn get_match_kind(&self) -> MatchKind { self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } /// Returns whether this configuration has enabled anchored starting states /// for every pattern in the DFA. pub fn get_starts_for_each_pattern(&self) -> bool { self.starts_for_each_pattern.unwrap_or(false) } /// Returns whether this configuration has enabled byte classes or not. /// This is typically a debugging oriented option, as disabling it confers /// no speed benefit. pub fn get_byte_classes(&self) -> bool { self.byte_classes.unwrap_or(true) } /// Returns the DFA size limit of this configuration if one was set. /// The size limit is total number of bytes on the heap that a DFA is /// permitted to use. If the DFA exceeds this limit during construction, /// then construction is stopped and an error is returned. pub fn get_size_limit(&self) -> Option { self.size_limit.unwrap_or(None) } /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. 
pub(crate) fn overwrite(&self, o: Config) -> Config { Config { match_kind: o.match_kind.or(self.match_kind), starts_for_each_pattern: o .starts_for_each_pattern .or(self.starts_for_each_pattern), byte_classes: o.byte_classes.or(self.byte_classes), size_limit: o.size_limit.or(self.size_limit), } } } /// A builder for a [one-pass DFA](DFA). /// /// This builder permits configuring options for the syntax of a pattern, the /// NFA construction and the DFA construction. This builder is different from a /// general purpose regex builder in that it permits fine grain configuration /// of the construction process. The trade off for this is complexity, and /// the possibility of setting a configuration that might not make sense. For /// example, there are two different UTF-8 modes: /// /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls /// whether the pattern itself can contain sub-expressions that match invalid /// UTF-8. /// * [`thompson::Config::utf8`] controls whether empty matches that split a /// Unicode codepoint are reported or not. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. /// /// # Example /// /// This example shows how to disable UTF-8 mode in the syntax and the NFA. /// This is generally what you want for matching on arbitrary bytes. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::onepass::DFA, /// nfa::thompson, /// util::syntax, /// Match, /// }; /// /// let re = DFA::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n"; /// re.captures(&mut cache, haystack, &mut caps); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! 
Disabling UTF-8 /// // on the syntax permits this. /// // /// // N.B. This example does not show the impact of /// // disabling UTF-8 mode on a one-pass DFA Config, /// // since that only impacts regexes that can /// // produce matches of length 0. /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { config: Config, #[cfg(feature = "syntax")] thompson: thompson::Compiler, } impl Builder { /// Create a new one-pass DFA builder with the default configuration. pub fn new() -> Builder { Builder { config: Config::default(), #[cfg(feature = "syntax")] thompson: thompson::Compiler::new(), } } /// Build a one-pass DFA from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. #[cfg(feature = "syntax")] pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Build a one-pass DFA from the given patterns. /// /// When matches are returned, the pattern ID corresponds to the index of /// the pattern in the slice given. #[cfg(feature = "syntax")] pub fn build_many>( &self, patterns: &[P], ) -> Result { let nfa = self.thompson.build_many(patterns).map_err(BuildError::nfa)?; self.build_from_nfa(nfa) } /// Build a DFA from the given NFA. /// /// # Example /// /// This example shows how to build a DFA if you already have an NFA in /// hand. /// /// ``` /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Match}; /// /// // This shows how to set non-default options for building an NFA. 
/// let nfa = NFA::compiler() /// .configure(NFA::config().shrink(true)) /// .build(r"[a-z0-9]+")?; /// let re = DFA::builder().build_from_nfa(nfa)?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// re.captures(&mut cache, "foo123bar", &mut caps); /// assert_eq!(Some(Match::must(0, 0..9)), caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn build_from_nfa(&self, nfa: NFA) -> Result { // Why take ownership if we're just going to pass a reference to the // NFA to our internal builder? Well, the first thing to note is that // an NFA uses reference counting internally, so either choice is going // to be cheap. So there isn't much cost either way. // // The real reason is that a one-pass DFA, semantically, shares // ownership of an NFA. This is unlike other DFAs that don't share // ownership of an NFA at all, primarily because they want to be // self-contained in order to support cheap (de)serialization. // // But then why pass a '&nfa' below if we want to share ownership? // Well, it turns out that using a '&NFA' in our internal builder // separates its lifetime from the DFA we're building, and this turns // out to make code a bit more composable. e.g., We can iterate over // things inside the NFA while borrowing the builder as mutable because // we know the NFA cannot be mutated. So TL;DR --- this weirdness is // "because borrow checker." InternalBuilder::new(self.config.clone(), &nfa).build() } /// Apply the given one-pass DFA configuration options to this builder. pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a one-pass DFA directly /// from a pattern. 
#[cfg(feature = "syntax")] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). /// /// This permits setting things like whether additional time should be /// spent shrinking the size of the NFA. /// /// These settings only apply when constructing a DFA directly from a /// pattern. #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } /// An internal builder for encapsulating the state necessary to build a /// one-pass DFA. Typical use is just `InternalBuilder::new(..).build()`. /// /// There is no separate pass for determining whether the NFA is one-pass or /// not. We just try to build the DFA. If during construction we discover that /// it is not one-pass, we bail out. This is likely to lead to some undesirable /// expense in some cases, so it might make sense to try an identify common /// patterns in the NFA that make it definitively not one-pass. That way, we /// can avoid ever trying to build a one-pass DFA in the first place. For /// example, '\w*\s' is not one-pass, and since '\w' is Unicode-aware by /// default, it's probably not a trivial cost to try and build a one-pass DFA /// for it and then fail. /// /// Note that some (immutable) fields are duplicated here. For example, the /// 'nfa' and 'classes' fields are both in the 'DFA'. They are the same thing, /// but we duplicate them because it makes composition easier below. Otherwise, /// since the borrow checker can't see through method calls, the mutable borrow /// we use to mutate the DFA winds up preventing borrowing from any other part /// of the DFA, even though we aren't mutating those parts. We only do this /// because the duplication is cheap. #[derive(Debug)] struct InternalBuilder<'a> { /// The DFA we're building. 
dfa: DFA,
    /// An unordered collection of NFA state IDs that we haven't yet tried to
    /// build into a DFA state yet.
    ///
    /// This collection does not ultimately wind up including every NFA state
    /// ID. Instead, each ID represents a "start" state for a sub-graph of the
    /// NFA. The set of NFA states we then use to build a DFA state consists
    /// of that "start" state and all states reachable from it via epsilon
    /// transitions.
    uncompiled_nfa_ids: Vec<StateID>,
    /// A map from NFA state ID to DFA state ID. This is useful for easily
    /// determining whether an NFA state has been used as a "starting" point
    /// to build a DFA state yet. If it hasn't, then it is mapped to DEAD,
    /// and since DEAD is specially added and never corresponds to any NFA
    /// state, it follows that a mapping to DEAD implies the NFA state has
    /// no corresponding DFA state yet.
    nfa_to_dfa_id: Vec<StateID>,
    /// A stack used to traverse the NFA states that make up a single DFA
    /// state. Traversal occurs until the stack is empty, and we only push to
    /// the stack when the state ID isn't in 'seen'. Actually, even more than
    /// that, if we try to push something on to this stack that is already in
    /// 'seen', then we bail out on construction completely, since it implies
    /// that the NFA is not one-pass.
    stack: Vec<(StateID, Epsilons)>,
    /// The set of NFA states that we've visited via 'stack'.
    seen: SparseSet,
    /// Whether a match NFA state has been observed while constructing a
    /// one-pass DFA state. Once a match state is seen, assuming we are using
    /// leftmost-first match semantics, then we don't add any more transitions
    /// to the DFA state we're building.
    matched: bool,
    /// The config passed to the builder.
    ///
    /// This is duplicated in dfa.config.
    config: Config,
    /// The NFA we're building a one-pass DFA from.
    ///
    /// This is duplicated in dfa.nfa.
    nfa: &'a NFA,
    /// The equivalence classes that make up the alphabet for this DFA.
    ///
    /// This is duplicated in dfa.classes.
    classes: ByteClasses,
}

impl<'a> InternalBuilder<'a> {
    /// Create a new builder with an initial empty DFA.
    fn new(config: Config, nfa: &'a NFA) -> InternalBuilder<'a> {
        let classes = if !config.get_byte_classes() {
            // A one-pass DFA will always use the equivalence class map, but
            // disabling byte classes is useful for debugging. Namely, this
            // will cause all transitions to be defined over their actual bytes
            // instead of an opaque equivalence class identifier. The former is
            // much easier to grok as a human.
            ByteClasses::singletons()
        } else {
            nfa.byte_classes().clone()
        };
        // Normally a DFA alphabet includes the EOI symbol, but we don't need
        // that in the one-pass DFA since we handle look-around explicitly
        // without encoding it into the DFA. Thus, we don't need to delay
        // matches by 1 byte. However, we reuse the space that *would* be used
        // by the EOI transition by putting match information there (like which
        // pattern matches and which look-around assertions need to hold). So
        // this means our real alphabet length is 1 fewer than what the byte
        // classes report, since we don't use EOI.
        let alphabet_len = classes.alphabet_len().checked_sub(1).unwrap();
        let stride2 = classes.stride2();
        let dfa = DFA {
            config: config.clone(),
            nfa: nfa.clone(),
            table: vec![],
            starts: vec![],
            // Since one-pass DFAs have a smaller state ID max than
            // StateID::MAX, it follows that StateID::MAX is a valid initial
            // value for min_match_id since no state ID can ever be greater
            // than it. In the case of a one-pass DFA with no match states, the
            // min_match_id will keep this sentinel value.
            min_match_id: StateID::MAX,
            classes: classes.clone(),
            alphabet_len,
            stride2,
            pateps_offset: alphabet_len,
            // OK because PatternID::MAX*2 is guaranteed not to overflow.
            explicit_slot_start: nfa.pattern_len().checked_mul(2).unwrap(),
        };
        InternalBuilder {
            dfa,
            uncompiled_nfa_ids: vec![],
            nfa_to_dfa_id: vec![DEAD; nfa.states().len()],
            stack: vec![],
            seen: SparseSet::new(nfa.states().len()),
            matched: false,
            config,
            nfa,
            classes,
        }
    }

    /// Build the DFA from the NFA given to this builder. If the NFA is not
    /// one-pass, then return an error. An error may also be returned if a
    /// particular limit is exceeded. (Some limits, like the total heap memory
    /// used, are configurable. Others, like the total patterns or slots, are
    /// hard-coded based on representational limitations.)
    fn build(mut self) -> Result<DFA, BuildError> {
        self.nfa.look_set_any().available().map_err(BuildError::word)?;
        for look in self.nfa.look_set_any().iter() {
            // This is a future incompatibility check where if we add any
            // more look-around assertions, then the one-pass DFA either
            // needs to reject them (what we do here) or it needs to have its
            // Transition representation modified to be capable of storing the
            // new assertions.
            if look.as_repr() > Look::WordUnicodeNegate.as_repr() {
                return Err(BuildError::unsupported_look(look));
            }
        }
        if self.nfa.pattern_len().as_u64() > PatternEpsilons::PATTERN_ID_LIMIT
        {
            return Err(BuildError::too_many_patterns(
                PatternEpsilons::PATTERN_ID_LIMIT,
            ));
        }
        if self.nfa.group_info().explicit_slot_len() > Slots::LIMIT {
            return Err(BuildError::not_one_pass(
                "too many explicit capturing groups (max is 16)",
            ));
        }
        assert_eq!(DEAD, self.add_empty_state()?);

        // This is where the explicit slots start. We care about this because
        // we only need to track explicit slots. The implicit slots---two for
        // each pattern---are tracked as part of the search routine itself.
        let explicit_slot_start = self.nfa.pattern_len() * 2;
        self.add_start_state(None, self.nfa.start_anchored())?;
        if self.config.get_starts_for_each_pattern() {
            for pid in self.nfa.patterns() {
                self.add_start_state(
                    Some(pid),
                    self.nfa.start_pattern(pid).unwrap(),
                )?;
            }
        }
        // NOTE: One wonders what the effects of treating 'uncompiled_nfa_ids'
        // as a stack are. It is really an unordered *set* of NFA state IDs.
        // If it, for example, in practice led to discovering whether a regex
        // was or wasn't one-pass later than if we processed NFA state IDs in
        // ascending order, then that would make this routine more costly in
        // the somewhat common case of a regex that isn't one-pass.
        while let Some(nfa_id) = self.uncompiled_nfa_ids.pop() {
            let dfa_id = self.nfa_to_dfa_id[nfa_id];
            // Once we see a match, we keep going, but don't add any new
            // transitions. Normally we'd just stop, but we have to keep
            // going in order to verify that our regex is actually one-pass.
            self.matched = false;
            // The NFA states we've already explored for this DFA state.
            self.seen.clear();
            // The NFA states to explore via epsilon transitions. If we ever
            // try to push an NFA state that we've already seen, then the NFA
            // is not one-pass because it implies there are multiple epsilon
            // transition paths that lead to the same NFA state. In other
            // words, there is ambiguity.
            self.stack_push(nfa_id, Epsilons::empty())?;
            while let Some((id, epsilons)) = self.stack.pop() {
                match *self.nfa.state(id) {
                    thompson::State::ByteRange { ref trans } => {
                        self.compile_transition(dfa_id, trans, epsilons)?;
                    }
                    thompson::State::Sparse(ref sparse) => {
                        for trans in sparse.transitions.iter() {
                            self.compile_transition(dfa_id, trans, epsilons)?;
                        }
                    }
                    thompson::State::Dense(ref dense) => {
                        for trans in dense.iter() {
                            self.compile_transition(dfa_id, &trans, epsilons)?;
                        }
                    }
                    thompson::State::Look { look, next } => {
                        let looks = epsilons.looks().insert(look);
                        self.stack_push(next, epsilons.set_looks(looks))?;
                    }
                    thompson::State::Union { ref alternates } => {
                        for &sid in alternates.iter().rev() {
                            self.stack_push(sid, epsilons)?;
                        }
                    }
                    thompson::State::BinaryUnion { alt1, alt2 } => {
                        self.stack_push(alt2, epsilons)?;
                        self.stack_push(alt1, epsilons)?;
                    }
                    thompson::State::Capture { next, slot, .. } => {
                        let slot = slot.as_usize();
                        let epsilons = if slot < explicit_slot_start {
                            // If this is an implicit slot, we don't care
                            // about it, since we handle implicit slots in
                            // the search routine. We can get away with that
                            // because there are 2 implicit slots for every
                            // pattern.
                            epsilons
                        } else {
                            // Offset our explicit slots so that they start
                            // at index 0.
                            let offset = slot - explicit_slot_start;
                            epsilons.set_slots(epsilons.slots().insert(offset))
                        };
                        self.stack_push(next, epsilons)?;
                    }
                    thompson::State::Fail => {
                        continue;
                    }
                    thompson::State::Match { pattern_id } => {
                        // If we found two different paths to a match state
                        // for the same DFA state, then we have ambiguity.
                        // Thus, it's not one-pass.
                        if self.matched {
                            return Err(BuildError::not_one_pass(
                                "multiple epsilon transitions to match state",
                            ));
                        }
                        self.matched = true;
                        // Shove the matching pattern ID and the 'epsilons'
                        // into the current DFA state's pattern epsilons. The
                        // 'epsilons' includes the slots we need to capture
                        // before reporting the match and also the conditional
                        // epsilon transitions we need to check before we can
                        // report a match.
                        self.dfa.set_pattern_epsilons(
                            dfa_id,
                            PatternEpsilons::empty()
                                .set_pattern_id(pattern_id)
                                .set_epsilons(epsilons),
                        );
                        // N.B. It is tempting to just bail out here when
                        // compiling a leftmost-first DFA, since we will never
                        // compile any more transitions in that case. But we
                        // actually need to keep going in order to verify that
                        // we actually have a one-pass regex. e.g., We might
                        // see more Match states (e.g., for other patterns)
                        // that imply that we don't have a one-pass regex.
                        // So instead, we mark that we've found a match and
                        // continue on. When we go to compile a new DFA state,
                        // we just skip that part. But otherwise check that the
                        // one-pass property is upheld.
                    }
                }
            }
        }
        self.shuffle_states();
        Ok(self.dfa)
    }

    /// Shuffle all match states to the end of the transition table and set
    /// 'min_match_id' to the ID of the first such match state.
    ///
    /// The point of this is to make it extremely cheap to determine whether
    /// a state is a match state or not. We need to check on this on every
    /// transition during a search, so it being cheap is important. This
    /// permits us to check it by simply comparing two state identifiers, as
    /// opposed to looking for the pattern ID in the state's `PatternEpsilons`.
    /// (Which requires a memory load and some light arithmetic.)
    fn shuffle_states(&mut self) {
        let mut remapper = Remapper::new(&self.dfa);
        let mut next_dest = self.dfa.last_state_id();
        for i in (0..self.dfa.state_len()).rev() {
            let id = StateID::must(i);
            let is_match =
                self.dfa.pattern_epsilons(id).pattern_id().is_some();
            if !is_match {
                continue;
            }
            remapper.swap(&mut self.dfa, next_dest, id);
            self.dfa.min_match_id = next_dest;
            next_dest = self.dfa.prev_state_id(next_dest).expect(
                "match states should be a proper subset of all states",
            );
        }
        remapper.remap(&mut self.dfa);
    }

    /// Compile the given NFA transition into the DFA state given.
    ///
    /// 'Epsilons' corresponds to any conditional epsilon transitions that need
    /// to be satisfied to follow this transition, and any slots that need to
    /// be saved if the transition is followed.
    ///
    /// If this transition indicates that the NFA is not one-pass, then
    /// this returns an error. (This occurs, for example, if the DFA state
    /// already has a transition defined for the same input symbols as the
    /// given transition, *and* the result of the old and new transitions is
    /// different.)
    fn compile_transition(
        &mut self,
        dfa_id: StateID,
        trans: &thompson::Transition,
        epsilons: Epsilons,
    ) -> Result<(), BuildError> {
        let next_dfa_id = self.add_dfa_state_for_nfa_state(trans.next)?;
        for byte in self
            .classes
            .representatives(trans.start..=trans.end)
            .filter_map(|r| r.as_u8())
        {
            let oldtrans = self.dfa.transition(dfa_id, byte);
            let newtrans =
                Transition::new(self.matched, next_dfa_id, epsilons);
            // If the old transition points to the DEAD state, then we know
            // 'byte' has not been mapped to any transition for this DFA state
            // yet. So set it unconditionally. Otherwise, we require that the
            // old and new transitions are equivalent. Otherwise, there is
            // ambiguity and thus the regex is not one-pass.
            if oldtrans.state_id() == DEAD {
                self.dfa.set_transition(dfa_id, byte, newtrans);
            } else if oldtrans != newtrans {
                return Err(BuildError::not_one_pass(
                    "conflicting transition",
                ));
            }
        }
        Ok(())
    }

    /// Add a start state to the DFA corresponding to the given NFA starting
    /// state ID.
    ///
    /// If adding a state would blow any limits (configured or hard-coded),
    /// then an error is returned.
    ///
    /// If the starting state is an anchored state for a particular pattern,
    /// then callers must provide the pattern ID for that starting state.
    /// Callers must also ensure that the first starting state added is the
    /// start state for all patterns, and then each anchored starting state for
    /// each pattern (if necessary) added in order. Otherwise, this panics.
    fn add_start_state(
        &mut self,
        pid: Option<PatternID>,
        nfa_id: StateID,
    ) -> Result<StateID, BuildError> {
        match pid {
            // With no pid, this should be the start state for all patterns
            // and thus be the first one.
            None => assert!(self.dfa.starts.is_empty()),
            // With a pid, we want it to be at self.dfa.starts[pid+1].
            Some(pid) => assert!(self.dfa.starts.len() == pid.one_more()),
        }
        let dfa_id = self.add_dfa_state_for_nfa_state(nfa_id)?;
        self.dfa.starts.push(dfa_id);
        Ok(dfa_id)
    }

    /// Add a new DFA state corresponding to the given NFA state. If adding a
    /// state would blow any limits (configured or hard-coded), then an error
    /// is returned. If a DFA state already exists for the given NFA state,
    /// then that DFA state's ID is returned and no new states are added.
    ///
    /// It is not expected that this routine is called for every NFA state.
    /// Instead, an NFA state ID will usually correspond to the "start" state
    /// for a sub-graph of the NFA, where all states in the sub-graph are
    /// reachable via epsilon transitions (conditional or unconditional). That
    /// sub-graph of NFA states is ultimately what produces a single DFA state.
fn add_dfa_state_for_nfa_state( &mut self, nfa_id: StateID, ) -> Result { // If we've already built a DFA state for the given NFA state, then // just return that. We definitely do not want to have more than one // DFA state in existence for the same NFA state, since all but one of // them will likely become unreachable. And at least some of them are // likely to wind up being incomplete. let existing_dfa_id = self.nfa_to_dfa_id[nfa_id]; if existing_dfa_id != DEAD { return Ok(existing_dfa_id); } // If we don't have any DFA state yet, add it and then add the given // NFA state to the list of states to explore. let dfa_id = self.add_empty_state()?; self.nfa_to_dfa_id[nfa_id] = dfa_id; self.uncompiled_nfa_ids.push(nfa_id); Ok(dfa_id) } /// Unconditionally add a new empty DFA state. If adding it would exceed /// any limits (configured or hard-coded), then an error is returned. The /// ID of the new state is returned on success. /// /// The added state is *not* a match state. fn add_empty_state(&mut self) -> Result { let state_limit = Transition::STATE_ID_LIMIT; // Note that unlike dense and lazy DFAs, we specifically do NOT // premultiply our state IDs here. The reason is that we want to pack // our state IDs into 64-bit transitions with other info, so the fewer // the bits we use for state IDs the better. If we premultiply, then // our state ID space shrinks. We justify this by the assumption that // a one-pass DFA is just already doing a fair bit more work than a // normal DFA anyway, so an extra multiplication to compute a state // transition doesn't seem like a huge deal. 
let next_id = self.dfa.table.len() >> self.dfa.stride2(); let id = StateID::new(next_id) .map_err(|_| BuildError::too_many_states(state_limit))?; if id.as_u64() > Transition::STATE_ID_LIMIT { return Err(BuildError::too_many_states(state_limit)); } self.dfa .table .extend(core::iter::repeat(Transition(0)).take(self.dfa.stride())); // The default empty value for 'PatternEpsilons' is sadly not all // zeroes. Instead, a special sentinel is used to indicate that there // is no pattern. So we need to explicitly set the pattern epsilons to // the correct "empty" PatternEpsilons. self.dfa.set_pattern_epsilons(id, PatternEpsilons::empty()); if let Some(size_limit) = self.config.get_size_limit() { if self.dfa.memory_usage() > size_limit { return Err(BuildError::exceeded_size_limit(size_limit)); } } Ok(id) } /// Push the given NFA state ID and its corresponding epsilons (slots and /// conditional epsilon transitions) on to a stack for use in a depth first /// traversal of a sub-graph of the NFA. /// /// If the given NFA state ID has already been pushed on to the stack, then /// it indicates the regex is not one-pass and this correspondingly returns /// an error. fn stack_push( &mut self, nfa_id: StateID, epsilons: Epsilons, ) -> Result<(), BuildError> { // If we already have seen a match and we are compiling a leftmost // first DFA, then we shouldn't add any more states to look at. This is // effectively how preference order and non-greediness is implemented. // if !self.config.get_match_kind().continue_past_first_match() // && self.matched // { // return Ok(()); // } if !self.seen.insert(nfa_id) { return Err(BuildError::not_one_pass( "multiple epsilon transitions to same state", )); } self.stack.push((nfa_id, epsilons)); Ok(()) } } /// A one-pass DFA for executing a subset of anchored regex searches while /// resolving capturing groups. /// /// A one-pass DFA can be built from an NFA that is one-pass. 
/// An NFA is
/// one-pass when there is never any ambiguity about how to continue a search.
/// For example, `a*a` is not one-pass because during a search, it's not
/// possible to know whether to continue matching the `a*` or to move on to
/// the single `a`. However, `a*b` is one-pass, because for every byte in the
/// input, it's always clear when to move on from `a*` to `b`.
///
/// # Only anchored searches are supported
///
/// In this crate, especially for DFAs, unanchored searches are implemented by
/// treating the pattern as if it had a `(?s-u:.)*?` prefix. While the prefix
/// is one-pass on its own, adding anything after it, e.g., `(?s-u:.)*?a` will
/// make the overall pattern not one-pass. Why? Because the `(?s-u:.)` matches
/// any byte, and there is therefore ambiguity as to when the prefix should
/// stop matching and something else should start matching.
///
/// Therefore, one-pass DFAs do not support unanchored searches. In addition
/// to many regexes simply not being one-pass, it implies that one-pass DFAs
/// have limited utility. With that said, when a one-pass DFA can be used, it
/// can potentially provide a dramatic speed up over alternatives like the
/// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker)
/// and the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). In particular,
/// a one-pass DFA is the only DFA capable of reporting the spans of matching
/// capturing groups.
///
/// To clarify, when we say that unanchored searches are not supported, what
/// that actually means is:
///
/// * The high level routines, [`DFA::is_match`] and [`DFA::captures`], always
/// do anchored searches.
/// * Since iterators are most useful in the context of unanchored searches,
/// there is no `DFA::captures_iter` method.
/// * For lower level routines like [`DFA::try_search`], an error will be
/// returned if the given [`Input`] is configured to do an unanchored search or
/// search for an invalid pattern ID.
(Note that an [`Input`] is configured to /// do an unanchored search by default, so just giving a `Input::new` is /// guaranteed to return an error.) /// /// # Other limitations /// /// In addition to the [configurable heap limit](Config::size_limit) and /// the requirement that a regex pattern be one-pass, there are some other /// limitations: /// /// * There is an internal limit on the total number of explicit capturing /// groups that appear across all patterns. It is somewhat small and there is /// no way to configure it. If your pattern(s) exceed this limit, then building /// a one-pass DFA will fail. /// * If the number of patterns exceeds an internal unconfigurable limit, then /// building a one-pass DFA will fail. This limit is quite large and you're /// unlikely to hit it. /// * If the total number of states exceeds an internal unconfigurable limit, /// then building a one-pass DFA will fail. This limit is quite large and /// you're unlikely to hit it. /// /// # Other examples of regexes that aren't one-pass /// /// One particularly unfortunate example is that enabling Unicode can cause /// regexes that were one-pass to no longer be one-pass. Consider the regex /// `(?-u)\w*\s` for example. It is one-pass because there is exactly no /// overlap between the ASCII definitions of `\w` and `\s`. But `\w*\s` /// (i.e., with Unicode enabled) is *not* one-pass because `\w` and `\s` get /// translated to UTF-8 automatons. And while the *codepoints* in `\w` and `\s` /// do not overlap, the underlying UTF-8 encodings do. Indeed, because of the /// overlap between UTF-8 automata, the use of Unicode character classes will /// tend to vastly increase the likelihood of a regex not being one-pass. /// /// # How does one know if a regex is one-pass or not? /// /// At the time of writing, the only way to know is to try and build a one-pass /// DFA. The one-pass property is checked while constructing the DFA. 
/// /// This does mean that you might potentially waste some CPU cycles and memory /// by optimistically trying to build a one-pass DFA. But this is currently the /// only way. In the future, building a one-pass DFA might be able to use some /// heuristics to detect common violations of the one-pass property and bail /// more quickly. /// /// # Resource usage /// /// Unlike a general DFA, a one-pass DFA has stricter bounds on its resource /// usage. Namely, construction of a one-pass DFA has a time and space /// complexity of `O(n)`, where `n ~ nfa.states().len()`. (A general DFA's time /// and space complexity is `O(2^n)`.) This smaller time bound is achieved /// because there is at most one DFA state created for each NFA state. If /// additional DFA states would be required, then the pattern is not one-pass /// and construction will fail. /// /// Note though that currently, this DFA uses a fully dense representation. /// This means that while its space complexity is no worse than an NFA, it may /// in practice use more memory because of higher constant factors. The reason /// for this trade off is two-fold. Firstly, a dense representation makes the /// search faster. Secondly, the bigger an NFA, the more unlikely it is to be /// one-pass. Therefore, most one-pass DFAs are usually pretty small. /// /// # Example /// /// This example shows that the one-pass DFA implements Unicode word boundaries /// correctly while simultaneously reporting spans for capturing groups that /// participate in a match. (This is the only DFA that implements full support /// for Unicode word boundaries.) 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{dfa::onepass::DFA, Match, Span}; /// /// let re = DFA::new(r"\b(?P\w+)[[:space:]]+(?P\w+)\b")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "Шерлок Холмс", &mut caps); /// assert_eq!(Some(Match::must(0, 0..23)), caps.get_match()); /// assert_eq!(Some(Span::from(0..12)), caps.get_group_by_name("first")); /// assert_eq!(Some(Span::from(13..23)), caps.get_group_by_name("last")); /// # Ok::<(), Box>(()) /// ``` /// /// # Example: iteration /// /// Unlike other regex engines in this crate, this one does not provide /// iterator search functions. This is because a one-pass DFA only supports /// anchored searches, and so iterator functions are generally not applicable. /// /// However, if you know that all of your matches are /// directly adjacent, then an iterator can be used. The /// [`util::iter::Searcher`](crate::util::iter::Searcher) type can be used for /// this purpose: /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::onepass::DFA, /// util::iter::Searcher, /// Anchored, Input, Span, /// }; /// /// let re = DFA::new(r"\w(\d)\w")?; /// let (mut cache, caps) = (re.create_cache(), re.create_captures()); /// let input = Input::new("a1zb2yc3x").anchored(Anchored::Yes); /// /// let mut it = Searcher::new(input).into_captures_iter(caps, |input, caps| { /// Ok(re.try_search(&mut cache, input, caps)?) /// }).infallible(); /// let caps0 = it.next().unwrap(); /// assert_eq!(Some(Span::from(1..2)), caps0.get_group(1)); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone)] pub struct DFA { /// The configuration provided by the caller. config: Config, /// The NFA used to build this DFA. /// /// NOTE: We probably don't need to store the NFA here, but we use enough /// bits from it that it's convenient to do so. 
And there really isn't much /// cost to doing so either, since an NFA is reference counted internally. nfa: NFA, /// The transition table. Given a state ID 's' and a byte of haystack 'b', /// the next state is `table[sid + classes[byte]]`. /// /// The stride of this table (i.e., the number of columns) is always /// a power of 2, even if the alphabet length is smaller. This makes /// converting between state IDs and state indices very cheap. /// /// Note that the stride always includes room for one extra "transition" /// that isn't actually a transition. It is a 'PatternEpsilons' that is /// used for match states only. Because of this, the maximum number of /// active columns in the transition table is 257, which means the maximum /// stride is 512 (the next power of 2 greater than or equal to 257). table: Vec, /// The DFA state IDs of the starting states. /// /// `starts[0]` is always present and corresponds to the starting state /// when searching for matches of any pattern in the DFA. /// /// `starts[i]` where i>0 corresponds to the starting state for the pattern /// ID 'i-1'. These starting states are optional. starts: Vec, /// Every state ID >= this value corresponds to a match state. /// /// This is what a search uses to detect whether a state is a match state /// or not. It requires only a simple comparison instead of bit-unpacking /// the PatternEpsilons from every state. min_match_id: StateID, /// The alphabet of this DFA, split into equivalence classes. Bytes in the /// same equivalence class can never discriminate between a match and a /// non-match. classes: ByteClasses, /// The number of elements in each state in the transition table. This may /// be less than the stride, since the stride is always a power of 2 and /// the alphabet length can be anything up to and including 256. alphabet_len: usize, /// The number of columns in the transition table, expressed as a power of /// 2. 
stride2: usize, /// The offset at which the PatternEpsilons for a match state is stored in /// the transition table. /// /// PERF: One wonders whether it would be better to put this in a separate /// allocation, since only match states have a non-empty PatternEpsilons /// and the number of match states tends be dwarfed by the number of /// non-match states. So this would save '8*len(non_match_states)' for each /// DFA. The question is whether moving this to a different allocation will /// lead to a perf hit during searches. You might think dealing with match /// states is rare, but some regexes spend a lot of time in match states /// gobbling up input. But... match state handling is already somewhat /// expensive, so maybe this wouldn't do much? Either way, it's worth /// experimenting. pateps_offset: usize, /// The first explicit slot index. This refers to the first slot appearing /// immediately after the last implicit slot. It is always 'patterns.len() /// * 2'. /// /// We record this because we only store the explicit slots in our DFA /// transition table that need to be saved. Implicit slots are handled /// automatically as part of the search. explicit_slot_start: usize, } impl DFA { /// Parse the given regular expression using the default configuration and /// return the corresponding one-pass DFA. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{dfa::onepass::DFA, Match}; /// /// let re = DFA::new("foo[0-9]+bar")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "foo12345barzzz", &mut caps); /// assert_eq!(Some(Match::must(0, 0..11)), caps.get_match()); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] #[inline] pub fn new(pattern: &str) -> Result { DFA::builder().build(pattern) } /// Like `new`, but parses multiple patterns into a single "multi regex." 
/// This similarly uses the default regex configuration.
///
/// # Example
///
/// ```
/// use regex_automata::{dfa::onepass::DFA, Match};
///
/// let re = DFA::new_many(&["[a-z]+", "[0-9]+"])?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "abc123", &mut caps);
/// assert_eq!(Some(Match::must(0, 0..3)), caps.get_match());
///
/// re.captures(&mut cache, "123abc", &mut caps);
/// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
#[inline]
pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
    DFA::builder().build_many(patterns)
}

/// Like `new`, but builds a one-pass DFA directly from an NFA. This is
/// useful if you already have an NFA, or even if you hand-assembled the
/// NFA.
///
/// # Example
///
/// This shows how to hand assemble a regular expression via its HIR,
/// compile an NFA from it and build a one-pass DFA from the NFA.
///
/// ```
/// use regex_automata::{
///     dfa::onepass::DFA,
///     nfa::thompson::NFA,
///     Match,
/// };
/// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
///
/// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
///     ClassBytesRange::new(b'0', b'9'),
///     ClassBytesRange::new(b'A', b'Z'),
///     ClassBytesRange::new(b'_', b'_'),
///     ClassBytesRange::new(b'a', b'z'),
/// ])));
///
/// let config = NFA::config().nfa_size_limit(Some(1_000));
/// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
///
/// let re = DFA::new_from_nfa(nfa)?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let expected = Some(Match::must(0, 0..1));
/// re.captures(&mut cache, "A", &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn new_from_nfa(nfa: NFA) -> Result<DFA, BuildError> {
    DFA::builder().build_from_nfa(nfa)
}

/// Create a new one-pass DFA that matches every input.
///
/// # Example
///
/// ```
/// use regex_automata::{dfa::onepass::DFA, Match};
///
/// let dfa = DFA::always_match()?;
/// let mut cache = dfa.create_cache();
/// let mut caps = dfa.create_captures();
///
/// let expected = Match::must(0, 0..0);
/// dfa.captures(&mut cache, "", &mut caps);
/// assert_eq!(Some(expected), caps.get_match());
/// dfa.captures(&mut cache, "foo", &mut caps);
/// assert_eq!(Some(expected), caps.get_match());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn always_match() -> Result<DFA, BuildError> {
    let nfa = thompson::NFA::always_match();
    Builder::new().build_from_nfa(nfa)
}

/// Create a new one-pass DFA that never matches any input.
///
/// # Example
///
/// ```
/// use regex_automata::dfa::onepass::DFA;
///
/// let dfa = DFA::never_match()?;
/// let mut cache = dfa.create_cache();
/// let mut caps = dfa.create_captures();
///
/// dfa.captures(&mut cache, "", &mut caps);
/// assert_eq!(None, caps.get_match());
/// dfa.captures(&mut cache, "foo", &mut caps);
/// assert_eq!(None, caps.get_match());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn never_match() -> Result<DFA, BuildError> {
    let nfa = thompson::NFA::never_match();
    Builder::new().build_from_nfa(nfa)
}

/// Return a default configuration for a DFA.
///
/// This is a convenience routine to avoid needing to import the `Config`
/// type when customizing the construction of a DFA.
///
/// # Example
///
/// This example shows how to change the match semantics of this DFA from
/// its default "leftmost first" to "all." When using "all," non-greediness
/// doesn't apply and neither does preference order matching. Instead, the
/// longest match possible is always returned.
/// (Although, by construction,
/// it's impossible for a one-pass DFA to have a different answer for
/// "preference order" vs "longest match.")
///
/// ```
/// use regex_automata::{dfa::onepass::DFA, Match, MatchKind};
///
/// let re = DFA::builder()
///     .configure(DFA::config().match_kind(MatchKind::All))
///     .build(r"(abc)+?")?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
///
/// re.captures(&mut cache, "abcabc", &mut caps);
/// // Normally, the non-greedy repetition would give us a 0..3 match.
/// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn config() -> Config {
    Config::new()
}

/// Return a builder for configuring the construction of a DFA.
///
/// This is a convenience routine to avoid needing to import the
/// [`Builder`] type in common cases.
///
/// # Example
///
/// This example shows how to use the builder to disable UTF-8 mode.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{
///     dfa::onepass::DFA,
///     nfa::thompson,
///     util::syntax,
///     Match,
/// };
///
/// let re = DFA::builder()
///     .syntax(syntax::Config::new().utf8(false))
///     .thompson(thompson::Config::new().utf8(false))
///     .build(r"foo(?-u:[^b])ar.*")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// let haystack = b"foo\xFFarzz\xE2\x98\xFF\n";
/// let expected = Some(Match::must(0, 0..8));
/// re.captures(&mut cache, haystack, &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn builder() -> Builder {
    Builder::new()
}

/// Create a new empty set of capturing groups that is guaranteed to be
/// valid for the search APIs on this DFA.
///
/// A `Captures` value created for a specific DFA cannot be used with any
/// other DFA.
///
/// This is a convenience function for [`Captures::all`].
/// See the
/// [`Captures`] documentation for an explanation of its alternative
/// constructors that permit the DFA to do less work during a search, and
/// thus might make it faster.
#[inline]
pub fn create_captures(&self) -> Captures {
    Captures::all(self.nfa.group_info().clone())
}

/// Create a new cache for this DFA.
///
/// The cache returned should only be used for searches for this
/// DFA. If you want to reuse the cache for another DFA, then you
/// must call [`Cache::reset`] with that DFA (or, equivalently,
/// [`DFA::reset_cache`]).
#[inline]
pub fn create_cache(&self) -> Cache {
    Cache::new(self)
}

/// Reset the given cache such that it can be used for searching with
/// this DFA (and only this DFA).
///
/// A cache reset permits reusing memory already allocated in this cache
/// with a different DFA.
///
/// # Example
///
/// This shows how to re-purpose a cache for use with a different DFA.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{dfa::onepass::DFA, Match};
///
/// let re1 = DFA::new(r"\w")?;
/// let re2 = DFA::new(r"\W")?;
/// let mut caps1 = re1.create_captures();
/// let mut caps2 = re2.create_captures();
///
/// let mut cache = re1.create_cache();
/// assert_eq!(
///     Some(Match::must(0, 0..2)),
///     { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
/// );
///
/// // Using 'cache' with re2 is not allowed. It may result in panics or
/// // incorrect results. In order to re-purpose the cache, we must reset
/// // it with the one-pass DFA we'd like to use it with.
/// //
/// // Similarly, after this reset, using the cache with 're1' is also not
/// // allowed.
/// re2.reset_cache(&mut cache);
/// assert_eq!(
///     Some(Match::must(0, 0..3)),
///     { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn reset_cache(&self, cache: &mut Cache) {
    cache.reset(self);
}

/// Return the config for this one-pass DFA.
#[inline]
pub fn get_config(&self) -> &Config {
    &self.config
}

/// Returns a reference to the underlying NFA.
#[inline]
pub fn get_nfa(&self) -> &NFA {
    &self.nfa
}

/// Returns the total number of patterns compiled into this DFA.
///
/// In the case of a DFA that contains no patterns, this returns `0`.
#[inline]
pub fn pattern_len(&self) -> usize {
    self.get_nfa().pattern_len()
}

/// Returns the total number of states in this one-pass DFA.
///
/// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
/// a low level DFA API. Therefore, this routine has little use other than
/// being informational.
#[inline]
pub fn state_len(&self) -> usize {
    // Each state occupies exactly 'stride' (a power of 2) entries in the
    // table, so dividing the table length by the stride is a shift.
    self.table.len() >> self.stride2()
}

/// Returns the total number of elements in the alphabet for this DFA.
///
/// That is, this returns the total number of transitions that each
/// state in this DFA must have. The maximum alphabet size is 256, which
/// corresponds to each possible byte value.
///
/// The alphabet size may be less than 256 though, and unless
/// [`Config::byte_classes`] is disabled, it is typically much less than
/// 256. Namely, bytes are grouped into equivalence classes such that no
/// two bytes in the same class can distinguish a match from a non-match.
/// For example, in the regex `^[a-z]+$`, the ASCII bytes `a-z` could
/// all be in the same equivalence class. This leads to a massive space
/// savings.
///
/// Note though that the alphabet length does _not_ necessarily equal the
/// total stride space taken up by a single DFA state in the transition
/// table. Namely, for performance reasons, the stride is always the
/// smallest power of two that is greater than or equal to the alphabet
/// length. For this reason, [`DFA::stride`] or [`DFA::stride2`] are
/// often more useful. The alphabet length is typically useful only for
/// informational purposes.
///
/// Note also that unlike dense or sparse DFAs, a one-pass DFA does
/// not have a special end-of-input (EOI) transition.
/// This is because
/// a one-pass DFA handles look-around assertions explicitly (like the
/// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM)) and does not build
/// them into the transitions of the DFA.
#[inline]
pub fn alphabet_len(&self) -> usize {
    self.alphabet_len
}

/// Returns the total stride for every state in this DFA, expressed as the
/// exponent of a power of 2. The stride is the amount of space each state
/// takes up in the transition table, expressed as a number of transitions.
/// (Unused transitions map to dead states.)
///
/// The stride of a DFA is always equivalent to the smallest power of
/// 2 that is greater than or equal to the DFA's alphabet length. This
/// definition uses extra space, but possibly permits faster translation
/// between state identifiers and their corresponding offsets in this DFA's
/// transition table.
///
/// For example, if the DFA's stride is 16 transitions, then its `stride2`
/// is `4` since `2^4 = 16`.
///
/// The minimum `stride2` value is `1` (corresponding to a stride of `2`)
/// while the maximum `stride2` value is `9` (corresponding to a stride
/// of `512`). The maximum in theory should be `8`, but because of some
/// implementation quirks that may be relaxed in the future, it is one more
/// than `8`. (Do note that a maximal stride is incredibly rare, as it
/// would imply that there is almost no redundancy in the regex pattern.)
///
/// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose
/// a low level DFA API. Therefore, this routine has little use other than
/// being informational.
#[inline]
pub fn stride2(&self) -> usize {
    self.stride2
}

/// Returns the total stride for every state in this DFA. This corresponds
/// to the total number of transitions used by each state in this DFA's
/// transition table.
///
/// Please see [`DFA::stride2`] for more information.
In particular, this /// returns the stride as the number of transitions, where as `stride2` /// returns it as the exponent of a power of 2. /// /// Note that unlike dense or sparse DFAs, a one-pass DFA does not expose /// a low level DFA API. Therefore, this routine has little use other than /// being informational. #[inline] pub fn stride(&self) -> usize { 1 << self.stride2() } /// Returns the memory usage, in bytes, of this DFA. /// /// The memory usage is computed based on the number of bytes used to /// represent this DFA. /// /// This does **not** include the stack size used up by this DFA. To /// compute that, use `std::mem::size_of::()`. #[inline] pub fn memory_usage(&self) -> usize { use core::mem::size_of; self.table.len() * size_of::() + self.starts.len() * size_of::() } } impl DFA { /// Executes an anchored leftmost forward search, and returns true if and /// only if this one-pass DFA matches the given haystack. /// /// This routine may short circuit if it knows that scanning future /// input will never lead to a different result. In particular, if the /// underlying DFA enters a match state, then this routine will return /// `true` immediately without inspecting any future input. (Consider how /// this might make a difference given the regex `a+` on the haystack /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`, /// but routines like `find` need to continue searching because `+` is /// greedy by default.) /// /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the /// given configuration was [`Anchored::No`] (which is the default). /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in the following circumstances: /// /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. Concretely, /// this occurs when using [`Anchored::Pattern`] without enabling /// [`Config::starts_for_each_pattern`]. 
///
    /// When a search panics, callers cannot know whether a match exists or
    /// not.
    ///
    /// Use [`DFA::try_search`] if you want to handle these panics as error
    /// values instead.
    ///
    /// # Example
    ///
    /// This shows basic usage:
    ///
    /// ```
    /// use regex_automata::dfa::onepass::DFA;
    ///
    /// let re = DFA::new("foo[0-9]+bar")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.is_match(&mut cache, "foo12345bar"));
    /// assert!(!re.is_match(&mut cache, "foobar"));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: consistency with search APIs
    ///
    /// `is_match` is guaranteed to return `true` whenever `captures` returns
    /// a match. This includes searches that are executed entirely within a
    /// codepoint:
    ///
    /// ```
    /// use regex_automata::{dfa::onepass::DFA, Input};
    ///
    /// let re = DFA::new("a*")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Notice that when UTF-8 mode is disabled, then the above reports a
    /// match because the restriction against zero-width matches that split a
    /// codepoint has been lifted:
    ///
    /// ```
    /// use regex_automata::{dfa::onepass::DFA, nfa::thompson::NFA, Input};
    ///
    /// let re = DFA::builder()
    ///     .thompson(NFA::config().utf8(false))
    ///     .build("a*")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn is_match<'h, I: Into<Input<'h>>>(
        &self,
        cache: &mut Cache,
        input: I,
    ) -> bool {
        // Only existence matters here, so request the earliest match and
        // pass an empty slot slice.
        let mut input = input.into().earliest(true);
        if matches!(input.get_anchored(), Anchored::No) {
            input.set_anchored(Anchored::Yes);
        }
        self.try_search_slots(cache, &input, &mut []).unwrap().is_some()
    }

    /// Executes an anchored leftmost forward search, and returns a `Match` if
    /// and only if this one-pass DFA matches the given haystack.
    ///
    /// This routine only includes the overall match span.
To get access to the
    /// individual spans of each capturing group, use [`DFA::captures`].
    ///
    /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
    /// given configuration was [`Anchored::No`] (which is the default).
    ///
    /// # Panics
    ///
    /// This routine panics if the search could not complete. This can occur
    /// in the following circumstances:
    ///
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode. Concretely,
    /// this occurs when using [`Anchored::Pattern`] without enabling
    /// [`Config::starts_for_each_pattern`].
    ///
    /// When a search panics, callers cannot know whether a match exists or
    /// not.
    ///
    /// Use [`DFA::try_search`] if you want to handle these panics as error
    /// values instead.
    ///
    /// # Example
    ///
    /// Leftmost first match semantics corresponds to the match with the
    /// smallest starting offset, but where the end offset is determined by
    /// preferring earlier branches in the original regular expression. For
    /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam`
    /// will match `Samwise` in `Samwise`.
    ///
    /// Generally speaking, the "leftmost first" match is how most backtracking
    /// regular expressions tend to work. This is in contrast to POSIX-style
    /// regular expressions that yield "leftmost longest" matches. Namely,
    /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using
    /// leftmost longest semantics. (This crate does not currently support
    /// leftmost longest semantics.)
    ///
    /// ```
    /// use regex_automata::{dfa::onepass::DFA, Match};
    ///
    /// let re = DFA::new("foo[0-9]+")?;
    /// let mut cache = re.create_cache();
    /// let expected = Match::must(0, 0..8);
    /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345"));
    ///
    /// // Even though a match is found after reading the first byte (`a`),
    /// // the leftmost first match semantics demand that we find the earliest
    /// // match that prefers earlier parts of the pattern over later parts.
    /// let re = DFA::new("abc|a")?;
    /// let mut cache = re.create_cache();
    /// let expected = Match::must(0, 0..3);
    /// assert_eq!(Some(expected), re.find(&mut cache, "abc"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn find<'h, I: Into<Input<'h>>>(
        &self,
        cache: &mut Cache,
        input: I,
    ) -> Option<Match> {
        let mut input = input.into();
        if matches!(input.get_anchored(), Anchored::No) {
            input.set_anchored(Anchored::Yes);
        }
        // With exactly one pattern, the two implicit slots fit on the stack,
        // so no allocation is needed.
        if self.get_nfa().pattern_len() == 1 {
            let mut slots = [None, None];
            let pid =
                self.try_search_slots(cache, &input, &mut slots).unwrap()?;
            let start = slots[0].unwrap().get();
            let end = slots[1].unwrap().get();
            return Some(Match::new(pid, Span { start, end }));
        }
        // Otherwise, allocate one start/end slot pair per pattern and read
        // back the pair belonging to the pattern that matched.
        let ginfo = self.get_nfa().group_info();
        let slots_len = ginfo.implicit_slot_len();
        let mut slots = vec![None; slots_len];
        let pid = self.try_search_slots(cache, &input, &mut slots).unwrap()?;
        let start = slots[pid.as_usize() * 2].unwrap().get();
        let end = slots[pid.as_usize() * 2 + 1].unwrap().get();
        Some(Match::new(pid, Span { start, end }))
    }

    /// Executes an anchored leftmost forward search and writes the spans
    /// of capturing groups that participated in a match into the provided
    /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
    /// is guaranteed to return `false`.
    ///
    /// The given `Input` is forcefully set to use [`Anchored::Yes`] if the
    /// given configuration was [`Anchored::No`] (which is the default).
    ///
    /// # Panics
    ///
    /// This routine panics if the search could not complete.
This can occur
    /// in the following circumstances:
    ///
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode. Concretely,
    /// this occurs when using [`Anchored::Pattern`] without enabling
    /// [`Config::starts_for_each_pattern`].
    ///
    /// When a search panics, callers cannot know whether a match exists or
    /// not.
    ///
    /// Use [`DFA::try_search`] if you want to handle these panics as error
    /// values instead.
    ///
    /// # Example
    ///
    /// This shows a simple example of a one-pass regex that extracts
    /// capturing group spans.
    ///
    /// ```
    /// use regex_automata::{dfa::onepass::DFA, Match, Span};
    ///
    /// let re = DFA::new(
    ///     // Notice that we use ASCII here. The corresponding Unicode regex
    ///     // is sadly not one-pass.
    ///     "(?P<first>[[:alpha:]]+)[[:space:]]+(?P<last>[[:alpha:]]+)",
    /// )?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    ///
    /// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
    /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match());
    /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1));
    /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn captures<'h, I: Into<Input<'h>>>(
        &self,
        cache: &mut Cache,
        input: I,
        caps: &mut Captures,
    ) {
        let mut input = input.into();
        if matches!(input.get_anchored(), Anchored::No) {
            input.set_anchored(Anchored::Yes);
        }
        self.try_search(cache, &input, caps).unwrap();
    }

    /// Executes an anchored leftmost forward search and writes the spans
    /// of capturing groups that participated in a match into the provided
    /// [`Captures`] value. If no match was found, then [`Captures::is_match`]
    /// is guaranteed to return `false`.
    ///
    /// The differences with [`DFA::captures`] are:
    ///
    /// 1. This returns an error instead of panicking if the search fails.
    /// 2. Accepts an `&Input` instead of a `Into<Input>`.
This permits reusing
    /// the same input for multiple searches, which _may_ be important for
    /// latency.
    /// 3. This does not automatically change the [`Anchored`] mode from `No`
    /// to `Yes`. Instead, if [`Input::anchored`] is `Anchored::No`, then an
    /// error is returned.
    ///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in the following circumstances:
    ///
    /// * When the provided `Input` configuration is not supported. For
    /// example, by providing an unsupported anchor mode. Concretely,
    /// this occurs when using [`Anchored::Pattern`] without enabling
    /// [`Config::starts_for_each_pattern`].
    ///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    ///
    /// # Example: specific pattern search
    ///
    /// This example shows how to build a multi-regex that permits searching
    /// for specific patterns. Note that this is somewhat less useful than
    /// in other regex engines, since a one-pass DFA by definition has no
    /// ambiguity about which pattern can match at a position. That is, if it
    /// were possible for two different patterns to match at the same starting
    /// position, then the multi-regex would not be one-pass and construction
    /// would have failed.
    ///
    /// Nevertheless, this can still be useful if you only care about matches
    /// for a specific pattern, and want the DFA to report "no match" even if
    /// some other pattern would have matched.
    ///
    /// Note that in order to make use of this functionality,
    /// [`Config::starts_for_each_pattern`] must be enabled. It is disabled
    /// by default since it may result in higher memory usage.
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::onepass::DFA, Anchored, Input, Match, PatternID,
    /// };
    ///
    /// let re = DFA::builder()
    ///     .configure(DFA::config().starts_for_each_pattern(true))
    ///     .build_many(&["[a-z]+", "[0-9]+"])?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let haystack = "123abc";
    /// let input = Input::new(haystack).anchored(Anchored::Yes);
    ///
    /// // A normal multi-pattern search will show pattern 1 matches.
    /// re.try_search(&mut cache, &input, &mut caps)?;
    /// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
    ///
    /// // If we only want to report pattern 0 matches, then we'll get no
    /// // match here.
    /// let input = input.anchored(Anchored::Pattern(PatternID::must(0)));
    /// re.try_search(&mut cache, &input, &mut caps)?;
    /// assert_eq!(None, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: specifying the bounds of a search
    ///
    /// This example shows how providing the bounds of a search can produce
    /// different results than simply sub-slicing the haystack.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, Match};
    ///
    /// // one-pass DFAs fully support Unicode word boundaries!
    /// // A sad joke is that a Unicode aware regex like \w+\s is not one-pass.
    /// // :-(
    /// let re = DFA::new(r"\b[0-9]{3}\b")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let haystack = "foo123bar";
    ///
    /// // Since we sub-slice the haystack, the search doesn't know about
    /// // the larger context and assumes that `123` is surrounded by word
    /// // boundaries. And of course, the match position is reported relative
    /// // to the sub-slice as well, which means we get `0..3` instead of
    /// // `3..6`.
    /// let expected = Some(Match::must(0, 0..3));
    /// let input = Input::new(&haystack[3..6]).anchored(Anchored::Yes);
    /// re.try_search(&mut cache, &input, &mut caps)?;
    /// assert_eq!(expected, caps.get_match());
    ///
    /// // But if we provide the bounds of the search within the context of the
    /// // entire haystack, then the search can take the surrounding context
    /// // into account. (And if we did find a match, it would be reported
    /// // as a valid offset into `haystack` instead of its sub-slice.)
    /// let expected = None;
    /// let input = Input::new(haystack).range(3..6).anchored(Anchored::Yes);
    /// re.try_search(&mut cache, &input, &mut caps)?;
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn try_search(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        caps: &mut Captures,
    ) -> Result<(), MatchError> {
        // Search directly into the slots owned by 'caps', then record which
        // pattern (if any) matched.
        let pid = self.try_search_slots(cache, input, caps.slots_mut())?;
        caps.set_pattern(pid);
        Ok(())
    }

    /// Executes an anchored leftmost forward search and writes the spans
    /// of capturing groups that participated in a match into the provided
    /// `slots`, and returns the matching pattern ID. The contents of the
    /// slots for patterns other than the matching pattern are unspecified. If
    /// no match was found, then `None` is returned and the contents of all
    /// `slots` is unspecified.
    ///
    /// This is like [`DFA::try_search`], but it accepts a raw slots slice
    /// instead of a `Captures` value. This is useful in contexts where you
    /// don't want or need to allocate a `Captures`.
    ///
    /// It is legal to pass _any_ number of slots to this routine. If the regex
    /// engine would otherwise write a slot offset that doesn't fit in the
    /// provided slice, then it is simply skipped. In general though, there are
    /// usually three slice lengths you might want to use:
    ///
    /// * An empty slice, if you only care about which pattern matched.
/// * A slice with /// [`pattern_len() * 2`](crate::dfa::onepass::DFA::pattern_len) /// slots, if you only care about the overall match spans for each matching /// pattern. /// * A slice with /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which /// permits recording match offsets for every capturing group in every /// pattern. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in the following circumstances: /// /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. Concretely, /// this occurs when using [`Anchored::Pattern`] without enabling /// [`Config::starts_for_each_pattern`]. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example /// /// This example shows how to find the overall match offsets in a /// multi-pattern search without allocating a `Captures` value. Indeed, we /// can put our slots right on the stack. /// /// ``` /// use regex_automata::{dfa::onepass::DFA, Anchored, Input, PatternID}; /// /// let re = DFA::new_many(&[ /// r"[a-zA-Z]+", /// r"[0-9]+", /// ])?; /// let mut cache = re.create_cache(); /// let input = Input::new("123").anchored(Anchored::Yes); /// /// // We only care about the overall match offsets here, so we just /// // allocate two slots for each pattern. Each slot records the start /// // and end of the match. /// let mut slots = [None; 4]; /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?; /// assert_eq!(Some(PatternID::must(1)), pid); /// /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. /// // See 'GroupInfo' for more details on the mapping between groups and /// // slot indices. 
/// let slot_start = pid.unwrap().as_usize() * 2; /// let slot_end = slot_start + 1; /// assert_eq!(Some(0), slots[slot_start].map(|s| s.get())); /// assert_eq!(Some(3), slots[slot_end].map(|s| s.get())); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { return self.try_search_slots_imp(cache, input, slots); } // See PikeVM::try_search_slots for why we do this. let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { return self.try_search_slots_imp(cache, input, slots); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; let got = self.try_search_slots_imp(cache, input, &mut enough)?; // This is OK because we know `enough_slots` is strictly bigger // than `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); return Ok(got); } let mut enough = vec![None; min]; let got = self.try_search_slots_imp(cache, input, &mut enough)?; // This is OK because we know `enough_slots` is strictly bigger than // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); Ok(got) } #[inline(never)] fn try_search_slots_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); match self.search_imp(cache, input, slots)? { None => return Ok(None), Some(pid) if !utf8empty => return Ok(Some(pid)), Some(pid) => { // These slot indices are always correct because we know our // 'pid' is valid and thus we know that the slot indices for it // are valid. 
let slot_start = pid.as_usize().wrapping_mul(2); let slot_end = slot_start.wrapping_add(1); // OK because we know we have a match and we know our caller // provided slots are big enough (which we make true above if // the caller didn't). Namely, we're only here when 'utf8empty' // is true, and when that's true, we require slots for every // pattern. let start = slots[slot_start].unwrap().get(); let end = slots[slot_end].unwrap().get(); // If our match splits a codepoint, then we cannot report is // as a match. And since one-pass DFAs only support anchored // searches, we don't try to skip ahead to find the next match. // We can just quit with nothing. if start == end && !input.is_char_boundary(start) { return Ok(None); } Ok(Some(pid)) } } } } impl DFA { fn search_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Result, MatchError> { // PERF: Some ideas. I ran out of steam after my initial impl to try // many of these. // // 1) Try doing more state shuffling. Right now, all we do is push // match states to the end of the transition table so that we can do // 'if sid >= self.min_match_id' to know whether we're in a match // state or not. But what about doing something like dense DFAs and // pushing dead, match and states with captures/looks all toward the // beginning of the transition table. Then we could do 'if sid <= // self.max_special_id', in which case, we need to do some special // handling of some sort. Otherwise, we get the happy path, just // like in a DFA search. The main argument against this is that the // one-pass DFA is likely to be used most often with capturing groups // and if capturing groups are common, then this might wind up being a // pessimization. // // 2) Consider moving 'PatternEpsilons' out of the transition table. // It is only needed for match states and usually a small minority of // states are match states. Therefore, we're using an extra 'u64' for // most states. 
// // 3) I played around with the match state handling and it seems like // there is probably a lot left on the table for improvement. The // key tension is that the 'find_match' routine is a giant mess, but // splitting it out into a non-inlineable function is a non-starter // because the match state might consume input, so 'find_match' COULD // be called quite a lot, and a function call at that point would trash // perf. In theory, we could detect whether a match state consumes // input and then specialize our search routine based on that. In that // case, maybe an extra function call is OK, but even then, it might be // too much of a latency hit. Another idea is to just try and figure // out how to reduce the code size of 'find_match'. RE2 has a trick // here where the match handling isn't done if we know the next byte of // input yields a match too. Maybe we adopt that? // // This just might be a tricky DFA to optimize. if input.is_done() { return Ok(None); } // We unfortunately have a bit of book-keeping to do to set things // up. We do have to setup our cache and clear all of our slots. In // particular, clearing the slots is necessary for the case where we // report a match, but one of the capturing groups didn't participate // in the match but had a span set from a previous search. That would // be bad. In theory, we could avoid all this slot clearing if we knew // that every slot was always activated for every match. Then we would // know they would always be overwritten when a match is found. let explicit_slots_len = core::cmp::min( Slots::LIMIT, slots.len().saturating_sub(self.explicit_slot_start), ); cache.setup_search(explicit_slots_len); for slot in cache.explicit_slots() { *slot = None; } for slot in slots.iter_mut() { *slot = None; } // We set the starting slots for every pattern up front. 
This does // increase our latency somewhat, but it avoids having to do it every // time we see a match state (which could be many times in a single // search if the match state consumes input). for pid in self.nfa.patterns() { let i = pid.as_usize() * 2; if i >= slots.len() { break; } slots[i] = NonMaxUsize::new(input.start()); } let mut pid = None; let mut next_sid = match input.get_anchored() { Anchored::Yes => self.start(), Anchored::Pattern(pid) => self.start_pattern(pid)?, Anchored::No => { // If the regex is itself always anchored, then we're fine, // even if the search is configured to be unanchored. if !self.nfa.is_always_start_anchored() { return Err(MatchError::unsupported_anchored( Anchored::No, )); } self.start() } }; let leftmost_first = matches!(self.config.get_match_kind(), MatchKind::LeftmostFirst); for at in input.start()..input.end() { let sid = next_sid; let trans = self.transition(sid, input.haystack()[at]); next_sid = trans.state_id(); let epsilons = trans.epsilons(); if sid >= self.min_match_id { if self.find_match(cache, input, at, sid, slots, &mut pid) { if input.get_earliest() || (leftmost_first && trans.match_wins()) { return Ok(pid); } } } if sid == DEAD || (!epsilons.looks().is_empty() && !self.nfa.look_matcher().matches_set_inline( epsilons.looks(), input.haystack(), at, )) { return Ok(pid); } epsilons.slots().apply(at, cache.explicit_slots()); } if next_sid >= self.min_match_id { self.find_match( cache, input, input.end(), next_sid, slots, &mut pid, ); } Ok(pid) } /// Assumes 'sid' is a match state and looks for whether a match can /// be reported. If so, appropriate offsets are written to 'slots' and /// 'matched_pid' is set to the matching pattern ID. /// /// Even when 'sid' is a match state, it's possible that a match won't /// be reported. For example, when the conditional epsilon transitions /// leading to the match state aren't satisfied at the given position in /// the haystack. 
#[cfg_attr(feature = "perf-inline", inline(always))] fn find_match( &self, cache: &mut Cache, input: &Input<'_>, at: usize, sid: StateID, slots: &mut [Option], matched_pid: &mut Option, ) -> bool { debug_assert!(sid >= self.min_match_id); let pateps = self.pattern_epsilons(sid); let epsilons = pateps.epsilons(); if !epsilons.looks().is_empty() && !self.nfa.look_matcher().matches_set_inline( epsilons.looks(), input.haystack(), at, ) { return false; } let pid = pateps.pattern_id_unchecked(); // This calculation is always correct because we know our 'pid' is // valid and thus we know that the slot indices for it are valid. let slot_end = pid.as_usize().wrapping_mul(2).wrapping_add(1); // Set the implicit 'end' slot for the matching pattern. (The 'start' // slot was set at the beginning of the search.) if slot_end < slots.len() { slots[slot_end] = NonMaxUsize::new(at); } // If the caller provided enough room, copy the previously recorded // explicit slots from our scratch space to the caller provided slots. // We *also* need to set any explicit slots that are active as part of // the path to the match state. if self.explicit_slot_start < slots.len() { // NOTE: The 'cache.explicit_slots()' slice is setup at the // beginning of every search such that it is guaranteed to return a // slice of length equivalent to 'slots[explicit_slot_start..]'. slots[self.explicit_slot_start..] .copy_from_slice(cache.explicit_slots()); epsilons.slots().apply(at, &mut slots[self.explicit_slot_start..]); } *matched_pid = Some(pid); true } } impl DFA { /// Returns the anchored start state for matching any pattern in this DFA. fn start(&self) -> StateID { self.starts[0] } /// Returns the anchored start state for matching the given pattern. If /// 'starts_for_each_pattern' /// was not enabled, then this returns an error. If the given pattern is /// not in this DFA, then `Ok(None)` is returned. 
fn start_pattern(&self, pid: PatternID) -> Result { if !self.config.get_starts_for_each_pattern() { return Err(MatchError::unsupported_anchored(Anchored::Pattern( pid, ))); } // 'starts' always has non-zero length. The first entry is always the // anchored starting state for all patterns, and the following entries // are optional and correspond to the anchored starting states for // patterns at pid+1. Thus, starts.len()-1 corresponds to the total // number of patterns that one can explicitly search for. (And it may // be zero.) Ok(self.starts.get(pid.one_more()).copied().unwrap_or(DEAD)) } /// Returns the transition from the given state ID and byte of input. The /// transition includes the next state ID, the slots that should be saved /// and any conditional epsilon transitions that must be satisfied in order /// to take this transition. fn transition(&self, sid: StateID, byte: u8) -> Transition { let offset = sid.as_usize() << self.stride2(); let class = self.classes.get(byte).as_usize(); self.table[offset + class] } /// Set the transition from the given state ID and byte of input to the /// transition given. fn set_transition(&mut self, sid: StateID, byte: u8, to: Transition) { let offset = sid.as_usize() << self.stride2(); let class = self.classes.get(byte).as_usize(); self.table[offset + class] = to; } /// Return an iterator of "sparse" transitions for the given state ID. /// "sparse" in this context means that consecutive transitions that are /// equivalent are returned as one group, and transitions to the DEAD state /// are ignored. /// /// This winds up being useful for debug printing, since it's much terser /// to display runs of equivalent transitions than the transition for every /// possible byte value. Indeed, in practice, it's very common for runs /// of equivalent transitions to appear. 
fn sparse_transitions(&self, sid: StateID) -> SparseTransitionIter<'_> {
        let start = sid.as_usize() << self.stride2();
        let end = start + self.alphabet_len();
        SparseTransitionIter {
            it: self.table[start..end].iter().enumerate(),
            cur: None,
        }
    }

    /// Return the pattern epsilons for the given state ID.
    ///
    /// If the given state ID does not correspond to a match state ID, then the
    /// pattern epsilons returned is empty.
    fn pattern_epsilons(&self, sid: StateID) -> PatternEpsilons {
        let offset = sid.as_usize() << self.stride2();
        PatternEpsilons(self.table[offset + self.pateps_offset].0)
    }

    /// Set the pattern epsilons for the given state ID.
    fn set_pattern_epsilons(&mut self, sid: StateID, pateps: PatternEpsilons) {
        let offset = sid.as_usize() << self.stride2();
        self.table[offset + self.pateps_offset] = Transition(pateps.0);
    }

    /// Returns the state ID prior to the one given. This returns None if the
    /// given ID is the first DFA state.
    fn prev_state_id(&self, id: StateID) -> Option<StateID> {
        if id == DEAD {
            None
        } else {
            // CORRECTNESS: Since 'id' is not the first state, subtracting 1
            // is always valid.
            Some(StateID::new_unchecked(id.as_usize().checked_sub(1).unwrap()))
        }
    }

    /// Returns the state ID of the last state in this DFA's transition table.
    /// "last" in this context means the last state to appear in memory, i.e.,
    /// the one with the greatest ID.
    fn last_state_id(&self) -> StateID {
        // CORRECTNESS: A DFA table is always non-empty since it always at
        // least contains a DEAD state. Since every state has the same stride,
        // we can just compute what the "next" state ID would have been and
        // then subtract 1 from it.
        StateID::new_unchecked(
            (self.table.len() >> self.stride2()).checked_sub(1).unwrap(),
        )
    }

    /// Move the transitions from 'id1' to 'id2' and vice versa.
    ///
    /// WARNING: This does not update the rest of the transition table to have
    /// transitions to 'id1' changed to 'id2' and vice versa. This merely moves
    /// the states in memory.
pub(super) fn swap_states(&mut self, id1: StateID, id2: StateID) { let o1 = id1.as_usize() << self.stride2(); let o2 = id2.as_usize() << self.stride2(); for b in 0..self.stride() { self.table.swap(o1 + b, o2 + b); } } /// Map all state IDs in this DFA (transition table + start states) /// according to the closure given. pub(super) fn remap(&mut self, map: impl Fn(StateID) -> StateID) { for i in 0..self.state_len() { let offset = i << self.stride2(); for b in 0..self.alphabet_len() { let next = self.table[offset + b].state_id(); self.table[offset + b].set_state_id(map(next)); } } for i in 0..self.starts.len() { self.starts[i] = map(self.starts[i]); } } } impl core::fmt::Debug for DFA { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { fn debug_state_transitions( f: &mut core::fmt::Formatter, dfa: &DFA, sid: StateID, ) -> core::fmt::Result { for (i, (start, end, trans)) in dfa.sparse_transitions(sid).enumerate() { let next = trans.state_id(); if i > 0 { write!(f, ", ")?; } if start == end { write!( f, "{:?} => {:?}", DebugByte(start), next.as_usize(), )?; } else { write!( f, "{:?}-{:?} => {:?}", DebugByte(start), DebugByte(end), next.as_usize(), )?; } if trans.match_wins() { write!(f, " (MW)")?; } if !trans.epsilons().is_empty() { write!(f, " ({:?})", trans.epsilons())?; } } Ok(()) } writeln!(f, "onepass::DFA(")?; for index in 0..self.state_len() { let sid = StateID::must(index); let pateps = self.pattern_epsilons(sid); if sid == DEAD { write!(f, "D ")?; } else if pateps.pattern_id().is_some() { write!(f, "* ")?; } else { write!(f, " ")?; } write!(f, "{:06?}", sid.as_usize())?; if !pateps.is_empty() { write!(f, " ({:?})", pateps)?; } write!(f, ": ")?; debug_state_transitions(f, self, sid)?; write!(f, "\n")?; } writeln!(f, "")?; for (i, &sid) in self.starts.iter().enumerate() { if i == 0 { writeln!(f, "START(ALL): {:?}", sid.as_usize())?; } else { writeln!( f, "START(pattern: {:?}): {:?}", i - 1, sid.as_usize(), )?; } } writeln!(f, "state length: 
{:?}", self.state_len())?; writeln!(f, "pattern length: {:?}", self.pattern_len())?; writeln!(f, ")")?; Ok(()) } }

/// An iterator over groups of consecutive equivalent transitions in a single
/// state.
///
/// Each item is `(start, end, transition)`, where `start..=end` is a run of
/// consecutive transition indices that all hold the same `transition`. Runs
/// whose transition points at the `DEAD` state are never yielded.
#[derive(Debug)]
struct SparseTransitionIter<'a> {
    /// The transitions of one state, paired with their index.
    it: core::iter::Enumerate<core::slice::Iter<'a, Transition>>,
    /// The run currently being accumulated, if any.
    cur: Option<(u8, u8, Transition)>,
}

impl<'a> Iterator for SparseTransitionIter<'a> {
    type Item = (u8, u8, Transition);

    fn next(&mut self) -> Option<(u8, u8, Transition)> {
        while let Some((b, &trans)) = self.it.next() {
            // Fine because we'll never have more than u8::MAX transitions in
            // one state.
            let b = b.as_u8();
            let (prev_start, prev_end, prev_trans) = match self.cur {
                Some(t) => t,
                None => {
                    // First transition seen: start a new run and keep going.
                    self.cur = Some((b, b, trans));
                    continue;
                }
            };
            if prev_trans == trans {
                // Same transition as the run in progress: extend the run.
                self.cur = Some((prev_start, b, prev_trans));
            } else {
                // A different transition ends the previous run. Start a new
                // one and report the old one unless it led to DEAD.
                self.cur = Some((b, b, trans));
                if prev_trans.state_id() != DEAD {
                    return Some((prev_start, prev_end, prev_trans));
                }
            }
        }
        // The underlying iterator is exhausted; flush the final run, if any.
        if let Some((start, end, trans)) = self.cur.take() {
            if trans.state_id() != DEAD {
                return Some((start, end, trans));
            }
        }
        None
    }
}

/// A cache represents mutable state that a one-pass [`DFA`] requires during a
/// search.
///
/// For a given one-pass DFA, its corresponding cache may be created either via
/// [`DFA::create_cache`], or via [`Cache::new`]. They are equivalent in every
/// way, except the former does not require explicitly importing `Cache`.
///
/// A particular `Cache` is coupled with the one-pass DFA from which it was
/// created. It may only be used with that one-pass DFA. A cache and its
/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
/// only be used with the new one-pass DFA (and not the old one).
#[derive(Clone, Debug)]
pub struct Cache {
    /// Scratch space used to store slots during a search. Basically, we use
    /// the caller provided slots to store slots known when a match occurs.
    /// But after a match occurs, we might continue a search but ultimately
    /// fail to extend the match. When continuing the search, we need some
    /// place to store candidate capture offsets without overwriting the slot
    /// offsets recorded for the most recently seen match.
    explicit_slots: Vec<Option<NonMaxUsize>>,
    /// The number of slots in the caller-provided 'Captures' value for the
    /// current search. This is always at most 'explicit_slots.len()', but
    /// might be less than it, if the caller provided fewer slots to fill.
    explicit_slot_len: usize,
}

impl Cache {
    /// Create a new [`onepass::DFA`](DFA) cache.
    ///
    /// A potentially more convenient routine to create a cache is
    /// [`DFA::create_cache`], as it does not require also importing the
    /// `Cache` type.
    ///
    /// If you want to reuse the returned `Cache` with some other one-pass DFA,
    /// then you must call [`Cache::reset`] with the desired one-pass DFA.
    pub fn new(re: &DFA) -> Cache {
        let mut cache = Cache { explicit_slots: vec![], explicit_slot_len: 0 };
        cache.reset(re);
        cache
    }

    /// Reset this cache such that it can be used for searching with a
    /// different [`onepass::DFA`](DFA).
    ///
    /// A cache reset permits reusing memory already allocated in this cache
    /// with a different one-pass DFA.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different one-pass
    /// DFA.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{dfa::onepass::DFA, Match};
    ///
    /// let re1 = DFA::new(r"\w")?;
    /// let re2 = DFA::new(r"\W")?;
    /// let mut caps1 = re1.create_captures();
    /// let mut caps2 = re2.create_captures();
    ///
    /// let mut cache = re1.create_cache();
    /// assert_eq!(
    ///     Some(Match::must(0, 0..2)),
    ///     { re1.captures(&mut cache, "Δ", &mut caps1); caps1.get_match() },
    /// );
    ///
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the one-pass DFA we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 're1' is also not
    /// // allowed.
    /// re2.reset_cache(&mut cache);
    /// assert_eq!(
    ///     Some(Match::must(0, 0..3)),
    ///     { re2.captures(&mut cache, "☃", &mut caps2); caps2.get_match() },
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset(&mut self, re: &DFA) {
        let explicit_slot_len = re.get_nfa().group_info().explicit_slot_len();
        self.explicit_slots.resize(explicit_slot_len, None);
        self.explicit_slot_len = explicit_slot_len;
    }

    /// Returns the heap memory usage, in bytes, of this cache.
    ///
    /// This does **not** include the stack size used up by this cache. To
    /// compute that, use `std::mem::size_of::<Cache>()`.
    pub fn memory_usage(&self) -> usize {
        self.explicit_slots.len() * core::mem::size_of::<Option<NonMaxUsize>>()
    }

    /// Returns the scratch slots for the current search, truncated to the
    /// length most recently given to `setup_search` (or `reset`).
    fn explicit_slots(&mut self) -> &mut [Option<NonMaxUsize>] {
        &mut self.explicit_slots[..self.explicit_slot_len]
    }

    /// Records how many explicit slots the current search should use.
    fn setup_search(&mut self, explicit_slot_len: usize) {
        self.explicit_slot_len = explicit_slot_len;
    }
}

/// Represents a single transition in a one-pass DFA.
///
/// The high 21 bits correspond to the state ID. The bit following corresponds
/// to the special "match wins" flag. The remaining low 42 bits correspond to
/// the transition epsilons, which contain the slots that should be saved when
/// this transition is followed and the conditional epsilon transitions that
/// must be satisfied in order to follow this transition.
#[derive(Clone, Copy, Eq, PartialEq)]
struct Transition(u64);

impl Transition {
    const STATE_ID_BITS: u64 = 21;
    const STATE_ID_SHIFT: u64 = 64 - Transition::STATE_ID_BITS;
    const STATE_ID_LIMIT: u64 = 1 << Transition::STATE_ID_BITS;
    const MATCH_WINS_SHIFT: u64 = 64 - (Transition::STATE_ID_BITS + 1);
    const INFO_MASK: u64 = 0x000003FF_FFFFFFFF;

    /// Return a new transition to the given state ID with the given epsilons.
fn new(match_wins: bool, sid: StateID, epsilons: Epsilons) -> Transition {
    // Shift the state ID and the "match wins" flag into their positions and
    // combine them with the raw epsilons bits.
    let state_bits = sid.as_u64() << Transition::STATE_ID_SHIFT;
    let flag_bit = u64::from(match_wins) << Transition::MATCH_WINS_SHIFT;
    Transition(state_bits | flag_bit | epsilons.0)
}

/// Reports whether the state this transition leads to is the DEAD state.
fn is_dead(self) -> bool {
    self.state_id() == DEAD
}

/// Reports whether the "match wins" flag is set on this transition.
///
/// When a transition has this property, it means that if a match has been
/// found and the search uses leftmost-first semantics, then that match
/// should be returned immediately instead of continuing on.
///
/// The "match wins" name comes from RE2, which uses a pretty much
/// identical mechanism for implementing leftmost-first semantics.
fn match_wins(&self) -> bool {
    let flag = (self.0 >> Transition::MATCH_WINS_SHIFT) & 1;
    flag == 1
}

/// Extracts the ID of the state this transition leads to.
fn state_id(&self) -> StateID {
    // OK because a Transition has a valid StateID in its upper bits by
    // construction. The cast to usize is also correct, even on 16-bit
    // targets because, again, we know the upper bits is a valid StateID,
    // which can never overflow usize on any supported target.
    let raw = self.0 >> Transition::STATE_ID_SHIFT;
    StateID::new_unchecked(raw.as_usize())
}

/// Replaces the target state ID, keeping the "match wins" flag and the
/// epsilons of this transition as they are.
fn set_state_id(&mut self, sid: StateID) {
    let wins = self.match_wins();
    let eps = self.epsilons();
    *self = Transition::new(wins, sid, eps);
}

/// Return the epsilons embedded in this transition.
fn epsilons(&self) -> Epsilons {
    let info = self.0 & Transition::INFO_MASK;
    Epsilons(info)
}
}

impl core::fmt::Debug for Transition {
    /// Formats as `0` for a dead transition; otherwise as the target state
    /// ID, followed by `-MW` when "match wins" is set and by `-<epsilons>`
    /// when the epsilons are non-empty.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        if self.is_dead() {
            return f.write_str("0");
        }
        write!(f, "{}", self.state_id().as_usize())?;
        if self.match_wins() {
            f.write_str("-MW")?;
        }
        let eps = self.epsilons();
        if !eps.is_empty() {
            write!(f, "-{:?}", eps)?;
        }
        Ok(())
    }
}

/// A match state's pattern ID bundled with the epsilons that apply when the
/// match occurs.
///
/// A match state in a one-pass DFA, unlike in a more general DFA, has exactly
/// one pattern ID. If it had more, then the original NFA would not have been
/// one-pass.
///
/// The "epsilons" part of this corresponds to what was found in the epsilon
/// transitions between the transition taken in the last byte of input and the
/// ultimate match state. This might include saving slots and/or conditional
/// epsilon transitions that must be satisfied before one can report the match.
///
/// Technically, every state has room for a 'PatternEpsilons', but it is only
/// ever non-empty for match states.
#[derive(Clone, Copy)]
struct PatternEpsilons(u64);

impl PatternEpsilons {
    const PATTERN_ID_BITS: u64 = 22;
    const PATTERN_ID_SHIFT: u64 = 64 - Self::PATTERN_ID_BITS;
    // A sentinel value indicating that this is not a match state. We don't
    // use 0 since 0 is a valid pattern ID.
    const PATTERN_ID_NONE: u64 = 0x00000000_003FFFFF;
    const PATTERN_ID_LIMIT: u64 = Self::PATTERN_ID_NONE;
    const PATTERN_ID_MASK: u64 = 0xFFFFFC00_00000000;
    const EPSILONS_MASK: u64 = 0x000003FF_FFFFFFFF;

    /// Builds the value used for non-match states: the "no pattern"
    /// sentinel in the pattern ID bits and no epsilons.
    fn empty() -> PatternEpsilons {
        let none = Self::PATTERN_ID_NONE << Self::PATTERN_ID_SHIFT;
        PatternEpsilons(none)
    }

    /// Whether this pattern epsilons is empty or not. It's empty when it has
    /// no pattern ID and an empty epsilons.
fn is_empty(self) -> bool {
    self.pattern_id().is_none() && self.epsilons().is_empty()
}

/// Return the pattern ID in this pattern epsilons if one exists.
///
/// `None` is returned when the pattern ID bits hold the "no pattern"
/// sentinel (`PATTERN_ID_LIMIT`).
fn pattern_id(self) -> Option<PatternID> {
    let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
    if pid == PatternEpsilons::PATTERN_ID_LIMIT {
        None
    } else {
        Some(PatternID::new_unchecked(pid.as_usize()))
    }
}

/// Returns the pattern ID without checking whether it's valid. If this is
/// called and there is no pattern ID in this `PatternEpsilons`, then this
/// will likely produce an incorrect result or possibly even a panic or
/// an overflow. But safety will not be violated.
///
/// This is useful when you know a particular state is a match state. If
/// it's a match state, then it must have a pattern ID.
fn pattern_id_unchecked(self) -> PatternID {
    let pid = self.0 >> PatternEpsilons::PATTERN_ID_SHIFT;
    PatternID::new_unchecked(pid.as_usize())
}

/// Return a new pattern epsilons with the given pattern ID, but the same
/// epsilons.
fn set_pattern_id(self, pid: PatternID) -> PatternEpsilons {
    PatternEpsilons(
        (pid.as_u64() << PatternEpsilons::PATTERN_ID_SHIFT)
            | (self.0 & PatternEpsilons::EPSILONS_MASK),
    )
}

/// Return the epsilons part of this pattern epsilons.
fn epsilons(self) -> Epsilons {
    Epsilons(self.0 & PatternEpsilons::EPSILONS_MASK)
}

/// Return a new pattern epsilons with the given epsilons, but the same
/// pattern ID.
fn set_epsilons(self, epsilons: Epsilons) -> PatternEpsilons {
    // `epsilons.0` is already a `u64`, so no conversion is needed before
    // masking it into the low bits.
    PatternEpsilons(
        (self.0 & PatternEpsilons::PATTERN_ID_MASK)
            | (epsilons.0 & PatternEpsilons::EPSILONS_MASK),
    )
}
}

impl core::fmt::Debug for PatternEpsilons {
    /// Formats as `N/A` when empty; otherwise as the pattern ID and/or the
    /// epsilons, separated by `/` when both are present.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        if self.is_empty() {
            return write!(f, "N/A");
        }
        let pid = self.pattern_id();
        let epsilons = self.epsilons();
        if let Some(pid) = pid {
            write!(f, "{}", pid.as_usize())?;
        }
        if !epsilons.is_empty() {
            if pid.is_some() {
                write!(f, "/")?;
            }
            write!(f, "{:?}", epsilons)?;
        }
        Ok(())
    }
}

/// Epsilons represents all of the NFA epsilon transitions that went into a
/// single transition in a single DFA state. In this case, it only represents
/// the epsilon transitions that have some kind of non-consuming side effect:
/// either the transition requires storing the current position of the search
/// into a slot, or the transition is conditional and requires the current
/// position in the input to satisfy an assertion before the transition may be
/// taken.
///
/// This folds the cumulative effect of a group of NFA states (all connected
/// by epsilon transitions) down into a single set of bits. While these bits
/// can represent all possible conditional epsilon transitions, it only permits
/// storing up to a somewhat small number of slots.
///
/// Epsilons is represented as a 42-bit integer. For example, it is packed into
/// the lower 42 bits of a `Transition`. (Where the high 22 bits contain a
/// `StateID` and a special "match wins" property.)
#[derive(Clone, Copy)]
struct Epsilons(u64);

impl Epsilons {
    const SLOT_MASK: u64 = 0x000003FF_FFFFFC00;
    const SLOT_SHIFT: u64 = 10;
    const LOOK_MASK: u64 = 0x00000000_000003FF;

    /// Create a new empty epsilons. It has no slots and no assertions that
    /// need to be satisfied.
    fn empty() -> Epsilons {
        Epsilons(0)
    }

    /// Returns true if this epsilons contains no slots and no assertions.
fn is_empty(self) -> bool { self.0 == 0 } /// Returns the slot epsilon transitions. fn slots(self) -> Slots { Slots((self.0 >> Epsilons::SLOT_SHIFT).low_u32()) } /// Set the slot epsilon transitions. fn set_slots(self, slots: Slots) -> Epsilons { Epsilons( (u64::from(slots.0) << Epsilons::SLOT_SHIFT) | (self.0 & Epsilons::LOOK_MASK), ) } /// Return the set of look-around assertions in these epsilon transitions. fn looks(self) -> LookSet { LookSet { bits: (self.0 & Epsilons::LOOK_MASK).low_u32() } } /// Set the look-around assertions on these epsilon transitions. fn set_looks(self, look_set: LookSet) -> Epsilons { Epsilons( (self.0 & Epsilons::SLOT_MASK) | (u64::from(look_set.bits) & Epsilons::LOOK_MASK), ) } } impl core::fmt::Debug for Epsilons { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let mut wrote = false; if !self.slots().is_empty() { write!(f, "{:?}", self.slots())?; wrote = true; } if !self.looks().is_empty() { if wrote { write!(f, "/")?; } write!(f, "{:?}", self.looks())?; wrote = true; } if !wrote { write!(f, "N/A")?; } Ok(()) } } /// The set of epsilon transitions indicating that the current position in a /// search should be saved to a slot. /// /// This *only* represents explicit slots. So for example, the pattern /// `[a-z]+([0-9]+)([a-z]+)` has: /// /// * 3 capturing groups, thus 6 slots. /// * 1 implicit capturing group, thus 2 implicit slots. /// * 2 explicit capturing groups, thus 4 explicit slots. /// /// While implicit slots are represented by epsilon transitions in an NFA, we /// do not explicitly represent them here. Instead, implicit slots are assumed /// to be present and handled automatically in the search code. Therefore, /// that means we only need to represent explicit slots in our epsilon /// transitions. /// /// Its representation is a bit set. The bit 'i' is set if and only if there /// exists an explicit slot at index 'c', where 'c = (#patterns * 2) + i'. 
That /// is, the bit 'i' corresponds to the first explicit slot and the first /// explicit slot appears immediately following the last implicit slot. (If /// this is confusing, see `GroupInfo` for more details on how slots works.) /// /// A single `Slots` represents all the active slots in a sub-graph of an NFA, /// where all the states are connected by epsilon transitions. In effect, when /// traversing the one-pass DFA during a search, all slots set in a particular /// transition must be captured by recording the current search position. /// /// The API of `Slots` requires the caller to handle the explicit slot offset. /// That is, a `Slots` doesn't know where the explicit slots start for a /// particular NFA. Thus, if the callers see's the bit 'i' is set, then they /// need to do the arithmetic above to find 'c', which is the real actual slot /// index in the corresponding NFA. #[derive(Clone, Copy)] struct Slots(u32); impl Slots { const LIMIT: usize = 32; /// Insert the slot at the given bit index. fn insert(self, slot: usize) -> Slots { debug_assert!(slot < Slots::LIMIT); Slots(self.0 | (1 << slot.as_u32())) } /// Remove the slot at the given bit index. fn remove(self, slot: usize) -> Slots { debug_assert!(slot < Slots::LIMIT); Slots(self.0 & !(1 << slot.as_u32())) } /// Returns true if and only if this set contains no slots. fn is_empty(self) -> bool { self.0 == 0 } /// Returns an iterator over all of the set bits in this set. fn iter(self) -> SlotsIter { SlotsIter { slots: self } } /// For the position `at` in the current haystack, copy it to /// `caller_explicit_slots` for all slots that are in this set. /// /// Callers may pass a slice of any length. Slots in this set bigger than /// the length of the given explicit slots are simply skipped. /// /// The slice *must* correspond only to the explicit slots and the first /// element of the slice must always correspond to the first explicit slot /// in the corresponding NFA. 
fn apply(
    self,
    at: usize,
    caller_explicit_slots: &mut [Option<NonMaxUsize>],
) {
    if self.is_empty() {
        return;
    }
    let at = NonMaxUsize::new(at);
    let len = caller_explicit_slots.len();
    for slot in self.iter() {
        // The iterator yields bit indices in ascending order, so once one
        // falls outside the caller's slice, all remaining ones do too.
        if slot >= len {
            break;
        }
        caller_explicit_slots[slot] = at;
    }
}
}

impl core::fmt::Debug for Slots {
    /// Formats as `S` followed by `-<index>` for every bit that is set.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "S")?;
        for slot in self.iter() {
            write!(f, "-{:?}", slot)?;
        }
        Ok(())
    }
}

/// An iterator over all of the bits set in a slot set.
///
/// This returns the bit index that is set, so callers may need to offset it
/// to get the actual NFA slot index.
#[derive(Debug)]
struct SlotsIter {
    /// The bits that have not been yielded yet.
    slots: Slots,
}

impl Iterator for SlotsIter {
    type Item = usize;

    fn next(&mut self) -> Option<usize> {
        // Number of zeroes here is always <= u8::MAX, and so fits in a usize.
        let slot = self.slots.0.trailing_zeros().as_usize();
        // No bit left: trailing_zeros() of zero is the full bit width.
        if slot >= Slots::LIMIT {
            return None;
        }
        // Clear the bit so the next call moves on to the next set bit.
        self.slots = self.slots.remove(slot);
        Some(slot)
    }
}

/// An error that occurred during the construction of a one-pass DFA.
///
/// This error does not provide many introspection capabilities. There are
/// generally only two things you can do with it:
///
/// * Obtain a human readable message via its `std::fmt::Display` impl.
/// * Access an underlying [`thompson::BuildError`] type from its `source`
/// method via the `std::error::Error` trait. This error only occurs when using
/// convenience routines for building a one-pass DFA directly from a pattern
/// string.
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug)]
pub struct BuildError {
    kind: BuildErrorKind,
}

/// The kind of error that occurred during the construction of a one-pass DFA.
#[derive(Clone, Debug)] enum BuildErrorKind { NFA(crate::nfa::thompson::BuildError), Word(UnicodeWordBoundaryError), TooManyStates { limit: u64 }, TooManyPatterns { limit: u64 }, UnsupportedLook { look: Look }, ExceededSizeLimit { limit: usize }, NotOnePass { msg: &'static str }, } impl BuildError { fn nfa(err: crate::nfa::thompson::BuildError) -> BuildError { BuildError { kind: BuildErrorKind::NFA(err) } } fn word(err: UnicodeWordBoundaryError) -> BuildError { BuildError { kind: BuildErrorKind::Word(err) } } fn too_many_states(limit: u64) -> BuildError { BuildError { kind: BuildErrorKind::TooManyStates { limit } } } fn too_many_patterns(limit: u64) -> BuildError { BuildError { kind: BuildErrorKind::TooManyPatterns { limit } } } fn unsupported_look(look: Look) -> BuildError { BuildError { kind: BuildErrorKind::UnsupportedLook { look } } } fn exceeded_size_limit(limit: usize) -> BuildError { BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } } fn not_one_pass(msg: &'static str) -> BuildError { BuildError { kind: BuildErrorKind::NotOnePass { msg } } } } #[cfg(feature = "std")] impl std::error::Error for BuildError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { use self::BuildErrorKind::*; match self.kind { NFA(ref err) => Some(err), Word(ref err) => Some(err), _ => None, } } } impl core::fmt::Display for BuildError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::BuildErrorKind::*; match self.kind { NFA(_) => write!(f, "error building NFA"), Word(_) => write!(f, "NFA contains Unicode word boundary"), TooManyStates { limit } => write!( f, "one-pass DFA exceeded a limit of {:?} for number of states", limit, ), TooManyPatterns { limit } => write!( f, "one-pass DFA exceeded a limit of {:?} for number of patterns", limit, ), UnsupportedLook { look } => write!( f, "one-pass DFA does not support the {:?} assertion", look, ), ExceededSizeLimit { limit } => write!( f, "one-pass DFA exceeded size limit of {:?} 
during building", limit, ), NotOnePass { msg } => write!( f, "one-pass DFA could not be built because \ pattern is not one-pass: {}", msg, ), } } } #[cfg(all(test, feature = "syntax"))] mod tests { use alloc::string::ToString; use super::*; #[test] fn fail_conflicting_transition() { let predicate = |err: &str| err.contains("conflicting transition"); let err = DFA::new(r"a*[ab]").unwrap_err().to_string(); assert!(predicate(&err), "{}", err); } #[test] fn fail_multiple_epsilon() { let predicate = |err: &str| { err.contains("multiple epsilon transitions to same state") }; let err = DFA::new(r"(^|$)a").unwrap_err().to_string(); assert!(predicate(&err), "{}", err); } #[test] fn fail_multiple_match() { let predicate = |err: &str| { err.contains("multiple epsilon transitions to match state") }; let err = DFA::new_many(&[r"^", r"$"]).unwrap_err().to_string(); assert!(predicate(&err), "{}", err); } // This test is meant to build a one-pass regex with the maximum number of // possible slots. // // NOTE: Remember that the slot limit only applies to explicit capturing // groups. Any number of implicit capturing groups is supported (up to the // maximum number of supported patterns), since implicit groups are handled // by the search loop itself. #[test] fn max_slots() { // One too many... let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)(q)"; assert!(DFA::new(pat).is_err()); // Just right. let pat = r"(a)(b)(c)(d)(e)(f)(g)(h)(i)(j)(k)(l)(m)(n)(o)(p)"; assert!(DFA::new(pat).is_ok()); } // This test ensures that the one-pass DFA works with all look-around // assertions that we expect it to work with. // // The utility of this test is that each one-pass transition has a small // amount of space to store look-around assertions. Currently, there is // logic in the one-pass constructor to ensure there aren't more than ten // possible assertions. And indeed, there are only ten possible assertions // (at time of writing), so this is okay. 
But conceivably, more assertions // could be added. So we check that things at least work with what we // expect them to work with. #[test] fn assertions() { // haystack anchors assert!(DFA::new(r"^").is_ok()); assert!(DFA::new(r"$").is_ok()); // line anchors assert!(DFA::new(r"(?m)^").is_ok()); assert!(DFA::new(r"(?m)$").is_ok()); assert!(DFA::new(r"(?Rm)^").is_ok()); assert!(DFA::new(r"(?Rm)$").is_ok()); // word boundaries if cfg!(feature = "unicode-word-boundary") { assert!(DFA::new(r"\b").is_ok()); assert!(DFA::new(r"\B").is_ok()); } assert!(DFA::new(r"(?-u)\b").is_ok()); assert!(DFA::new(r"(?-u)\B").is_ok()); } #[cfg(not(miri))] // takes too long on miri #[test] fn is_one_pass() { use crate::util::syntax; assert!(DFA::new(r"a*b").is_ok()); if cfg!(feature = "unicode-perl") { assert!(DFA::new(r"\w").is_ok()); } assert!(DFA::new(r"(?-u)\w*\s").is_ok()); assert!(DFA::new(r"(?s:.)*?").is_ok()); assert!(DFA::builder() .syntax(syntax::Config::new().utf8(false)) .build(r"(?s-u:.)*?") .is_ok()); } #[test] fn is_not_one_pass() { assert!(DFA::new(r"a*a").is_err()); assert!(DFA::new(r"(?s-u:.)*?").is_err()); assert!(DFA::new(r"(?s:.)*?a").is_err()); } #[cfg(not(miri))] #[test] fn is_not_one_pass_bigger() { assert!(DFA::new(r"\w*\s").is_err()); } } regex-automata-0.4.9/src/dfa/regex.rs000064400000000000000000001012721046102023000155650ustar 00000000000000/*! A DFA-backed `Regex`. This module provides [`Regex`], which is defined generically over the [`Automaton`] trait. A `Regex` implements convenience routines you might have come to expect, such as finding the start/end of a match and iterating over all non-overlapping matches. This `Regex` type is limited in its capabilities to what a DFA can provide. Therefore, APIs involving capturing groups, for example, are not provided. Internally, a `Regex` is composed of two DFAs. One is a "forward" DFA that finds the end offset of a match, where as the other is a "reverse" DFA that find the start offset of a match. 
See the [parent module](crate::dfa) for examples. */ #[cfg(feature = "alloc")] use alloc::vec::Vec; #[cfg(feature = "dfa-build")] use crate::dfa::dense::BuildError; use crate::{ dfa::{automaton::Automaton, dense}, util::{iter, search::Input}, Anchored, Match, MatchError, }; #[cfg(feature = "alloc")] use crate::{ dfa::{sparse, StartKind}, util::search::MatchKind, }; // When the alloc feature is enabled, the regex type sets its A type parameter // to default to an owned dense DFA. But without alloc, we set no default. This // makes things a lot more convenient in the common case, since writing out the // DFA types is pretty annoying. // // Since we have two different definitions but only want to write one doc // string, we use a macro to capture the doc and other attributes once and then // repeat them for each definition. macro_rules! define_regex_type { ($(#[$doc:meta])*) => { #[cfg(feature = "alloc")] $(#[$doc])* pub struct Regex { forward: A, reverse: A, } #[cfg(not(feature = "alloc"))] $(#[$doc])* pub struct Regex { forward: A, reverse: A, } }; } define_regex_type!( /// A regular expression that uses deterministic finite automata for fast /// searching. /// /// A regular expression is comprised of two DFAs, a "forward" DFA and a /// "reverse" DFA. The forward DFA is responsible for detecting the end of /// a match while the reverse DFA is responsible for detecting the start /// of a match. Thus, in order to find the bounds of any given match, a /// forward search must first be run followed by a reverse search. A match /// found by the forward DFA guarantees that the reverse DFA will also find /// a match. /// /// The type of the DFA used by a `Regex` corresponds to the `A` type /// parameter, which must satisfy the [`Automaton`] trait. 
Typically, /// `A` is either a [`dense::DFA`](crate::dfa::dense::DFA) or a /// [`sparse::DFA`](crate::dfa::sparse::DFA), where dense DFAs use more /// memory but search faster, while sparse DFAs use less memory but search /// more slowly. /// /// # Crate features /// /// Note that despite what the documentation auto-generates, the _only_ /// crate feature needed to use this type is `dfa-search`. You do _not_ /// need to enable the `alloc` feature. /// /// By default, a regex's automaton type parameter is set to /// `dense::DFA>` when the `alloc` feature is enabled. For most /// in-memory work loads, this is the most convenient type that gives the /// best search performance. When the `alloc` feature is disabled, no /// default type is used. /// /// # When should I use this? /// /// Generally speaking, if you can afford the overhead of building a full /// DFA for your regex, and you don't need things like capturing groups, /// then this is a good choice if you're looking to optimize for matching /// speed. Note however that its speed may be worse than a general purpose /// regex engine if you don't provide a [`dense::Config::prefilter`] to the /// underlying DFA. /// /// # Sparse DFAs /// /// Since a `Regex` is generic over the [`Automaton`] trait, it can be /// used with any kind of DFA. While this crate constructs dense DFAs by /// default, it is easy enough to build corresponding sparse DFAs, and then /// build a regex from them: /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// // First, build a regex that uses dense DFAs. /// let dense_re = Regex::new("foo[0-9]+")?; /// /// // Second, build sparse DFAs from the forward and reverse dense DFAs. /// let fwd = dense_re.forward().to_sparse()?; /// let rev = dense_re.reverse().to_sparse()?; /// /// // Third, build a new regex from the constituent sparse DFAs. /// let sparse_re = Regex::builder().build_from_dfas(fwd, rev); /// /// // A regex that uses sparse DFAs can be used just like with dense DFAs. 
/// assert_eq!(true, sparse_re.is_match(b"foo123")); /// /// # Ok::<(), Box>(()) /// ``` /// /// Alternatively, one can use a [`Builder`] to construct a sparse DFA /// more succinctly. (Note though that dense DFAs are still constructed /// first internally, and then converted to sparse DFAs, as in the example /// above.) /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// let sparse_re = Regex::builder().build_sparse(r"foo[0-9]+")?; /// // A regex that uses sparse DFAs can be used just like with dense DFAs. /// assert!(sparse_re.is_match(b"foo123")); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Fallibility /// /// Most of the search routines defined on this type will _panic_ when the /// underlying search fails. This might be because the DFA gave up because /// it saw a quit byte, whether configured explicitly or via heuristic /// Unicode word boundary support, although neither are enabled by default. /// Or it might fail because an invalid `Input` configuration is given, /// for example, with an unsupported [`Anchored`] mode. /// /// If you need to handle these error cases instead of allowing them to /// trigger a panic, then the lower level [`Regex::try_search`] provides /// a fallible API that never panics. /// /// # Example /// /// This example shows how to cause a search to terminate if it sees a /// `\n` byte, and handle the error returned. This could be useful if, for /// example, you wanted to prevent a user supplied pattern from matching /// across a line boundary. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{dfa::{self, regex::Regex}, Input, MatchError}; /// /// let re = Regex::builder() /// .dense(dfa::dense::Config::new().quit(b'\n', true)) /// .build(r"foo\p{any}+bar")?; /// /// let input = Input::new("foo\nbar"); /// // Normally this would produce a match, since \p{any} contains '\n'. 
/// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead. /// let expected = MatchError::quit(b'\n', 3); /// let got = re.try_search(&input).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] ); #[cfg(all(feature = "syntax", feature = "dfa-build"))] impl Regex { /// Parse the given regular expression using the default configuration and /// return the corresponding regex. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+bar")?; /// assert_eq!( /// Some(Match::must(0, 3..14)), /// re.find(b"zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box>(()) /// ``` pub fn new(pattern: &str) -> Result { Builder::new().build(pattern) } /// Like `new`, but parses multiple patterns into a single "regex set." /// This similarly uses the default regex configuration. /// /// # Example /// /// ``` /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; /// /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` pub fn new_many>( patterns: &[P], ) -> Result { Builder::new().build_many(patterns) } } #[cfg(all(feature = "syntax", feature = "dfa-build"))] impl Regex>> { /// Parse the given regular expression using the default configuration, /// except using sparse DFAs, and return the corresponding regex. 
/// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_sparse("foo[0-9]+bar")?; /// assert_eq!( /// Some(Match::must(0, 3..14)), /// re.find(b"zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box>(()) /// ``` pub fn new_sparse( pattern: &str, ) -> Result>>, BuildError> { Builder::new().build_sparse(pattern) } /// Like `new`, but parses multiple patterns into a single "regex set" /// using sparse DFAs. This otherwise similarly uses the default regex /// configuration. /// /// # Example /// /// ``` /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new_many_sparse(&["[a-z]+", "[0-9]+"])?; /// /// let mut it = re.find_iter(b"abc 1 foo 4567 0 quux"); /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` pub fn new_many_sparse>( patterns: &[P], ) -> Result>>, BuildError> { Builder::new().build_many_sparse(patterns) } } /// Convenience routines for regex construction. impl Regex> { /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the /// [`Builder`] type in common cases. /// /// # Example /// /// This example shows how to use the builder to disable UTF-8 mode /// everywhere. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// let got = re.find(haystack); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` pub fn builder() -> Builder { Builder::new() } } /// Standard search routines for finding and iterating over matches. impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if the underlying /// DFA enters a match state or a dead state, then this routine will return /// `true` or `false`, respectively, without inspecting any future input. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the DFA quitting. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// Use [`Regex::try_search`] if you want to handle these error conditions. 
/// /// # Example /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// let re = Regex::new("foo[0-9]+bar")?; /// assert_eq!(true, re.is_match("foo12345bar")); /// assert_eq!(false, re.is_match("foobar")); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn is_match<'h, I: Into>>(&self, input: I) -> bool { // Not only can we do an "earliest" search, but we can avoid doing a // reverse scan too. let input = input.into().earliest(true); self.forward().try_search_fwd(&input).map(|x| x.is_some()).unwrap() } /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the DFA quitting. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// Use [`Regex::try_search`] if you want to handle these error conditions. /// /// # Example /// /// ``` /// use regex_automata::{Match, dfa::regex::Regex}; /// /// // Greediness is applied appropriately. /// let re = Regex::new("foo[0-9]+")?; /// assert_eq!(Some(Match::must(0, 3..11)), re.find("zzzfoo12345zzz")); /// /// // Even though a match is found after reading the first byte (`a`), /// // the default leftmost-first match semantics demand that we find the /// // earliest match that prefers earlier parts of the pattern over latter /// // parts. 
/// let re = Regex::new("abc|a")?; /// assert_eq!(Some(Match::must(0, 0..3)), re.find("abc")); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] pub fn find<'h, I: Into>>(&self, input: I) -> Option { self.try_search(&input.into()).unwrap() } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// This corresponds to the "standard" regex search iterator. /// /// # Panics /// /// If the search returns an error during iteration, then iteration /// panics. See [`Regex::find`] for the panic conditions. /// /// Use [`Regex::try_search`] with /// [`util::iter::Searcher`](crate::util::iter::Searcher) if you want to /// handle these error conditions. /// /// # Example /// /// ``` /// use regex_automata::{Match, dfa::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+")?; /// let text = "foo1 foo12 foo123"; /// let matches: Vec<Match> = re.find_iter(text).collect(); /// assert_eq!(matches, vec![ /// Match::must(0, 0..4), /// Match::must(0, 5..10), /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] pub fn find_iter<'r, 'h, I: Into>>( &'r self, input: I, ) -> FindMatches<'r, 'h, A> { let it = iter::Searcher::new(input.into()); FindMatches { re: self, it } } } /// Lower level fallible search routines that permit controlling where the /// search starts and ends in a particular sequence. impl Regex { /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// This is like [`Regex::find`] but with two differences: /// /// 1. It is not generic over `Into<Input>` and instead accepts a /// `&Input`. This permits reusing the same `Input` for multiple searches /// without needing to create a new one. This _may_ help with latency. /// 2. It returns an error if the search could not complete, whereas /// [`Regex::find`] will panic. /// /// # Errors /// /// This routine errors if the search could not complete.
This can occur /// in the following circumstances: /// /// * The configuration of the DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the DFA quitting. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. #[inline] pub fn try_search( &self, input: &Input<'_>, ) -> Result, MatchError> { let (fwd, rev) = (self.forward(), self.reverse()); let end = match fwd.try_search_fwd(input)? { None => return Ok(None), Some(end) => end, }; // This special cases an empty match at the beginning of the search. If // our end matches our start, then since a reverse DFA can't match past // the start, it must follow that our starting position is also our end // position. So short circuit and skip the reverse search. if input.start() == end.offset() { return Ok(Some(Match::new( end.pattern(), end.offset()..end.offset(), ))); } // We can also skip the reverse search if we know our search was // anchored. This occurs either when the input config is anchored or // when we know the regex itself is anchored. In this case, we know the // start of the match, if one is found, must be the start of the // search. if self.is_anchored(input) { return Ok(Some(Match::new( end.pattern(), input.start()..end.offset(), ))); } // N.B. I have tentatively convinced myself that it isn't necessary // to specify the specific pattern for the reverse search since the // reverse search will always find the same pattern to match as the // forward search. But I lack a rigorous proof. Why not just provide // the pattern anyway? Well, if it is needed, then leaving it out // gives us a chance to find a witness. 
(Also, if we don't need to // specify the pattern, then we don't need to build the reverse DFA // with 'starts_for_each_pattern' enabled.) // // We also need to be careful to disable 'earliest' for the reverse // search, since it could be enabled for the forward search. In the // reverse case, to satisfy "leftmost" criteria, we need to match // as much as we can. We also need to be careful to make the search // anchored. We don't want the reverse search to report any matches // other than the one beginning at the end of our forward search. let revsearch = input .clone() .span(input.start()..end.offset()) .anchored(Anchored::Yes) .earliest(false); let start = rev .try_search_rev(&revsearch)? .expect("reverse search must match if forward search does"); assert_eq!( start.pattern(), end.pattern(), "forward and reverse search must match same pattern", ); assert!(start.offset() <= end.offset()); Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) } /// Returns true if either the given input specifies an anchored search /// or if the underlying DFA is always anchored. fn is_anchored(&self, input: &Input<'_>) -> bool { match input.get_anchored() { Anchored::No => self.forward().is_always_start_anchored(), Anchored::Yes | Anchored::Pattern(_) => true, } } } /// Non-search APIs for querying information about the regex and setting a /// prefilter. impl Regex { /// Return the underlying DFA responsible for forward matching. /// /// This is useful for accessing the underlying DFA and converting it to /// some other format or size. See the [`Builder::build_from_dfas`] docs /// for an example of where this might be useful. pub fn forward(&self) -> &A { &self.forward } /// Return the underlying DFA responsible for reverse matching. /// /// This is useful for accessing the underlying DFA and converting it to /// some other format or size. See the [`Builder::build_from_dfas`] docs /// for an example of where this might be useful. 
pub fn reverse(&self) -> &A { &self.reverse } /// Returns the total number of patterns matched by this regex. /// /// # Example /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::dfa::regex::Regex; /// /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; /// assert_eq!(3, re.pattern_len()); /// # Ok::<(), Box>(()) /// ``` pub fn pattern_len(&self) -> usize { assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); self.forward().pattern_len() } } /// An iterator over all non-overlapping matches for an infallible search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// If the underlying regex engine returns an error, then a panic occurs. /// /// The type parameters are as follows: /// /// * `A` represents the type of the underlying DFA that implements the /// [`Automaton`] trait. /// /// The lifetime parameters are as follows: /// /// * `'h` represents the lifetime of the haystack being searched. /// * `'r` represents the lifetime of the regex object itself. /// /// This iterator can be created with the [`Regex::find_iter`] method. #[derive(Debug)] pub struct FindMatches<'r, 'h, A> { re: &'r Regex, it: iter::Searcher<'h>, } impl<'r, 'h, A: Automaton> Iterator for FindMatches<'r, 'h, A> { type Item = Match; #[inline] fn next(&mut self) -> Option { let FindMatches { re, ref mut it } = *self; it.advance(|input| re.try_search(input)) } } /// A builder for a regex based on deterministic finite automatons. /// /// This builder permits configuring options for the syntax of a pattern, the /// NFA construction, the DFA construction and finally the regex searching /// itself. This builder is different from a general purpose regex builder in /// that it permits fine grain configuration of the construction process. The /// trade off for this is complexity, and the possibility of setting a /// configuration that might not make sense. 
For example, there are two /// different UTF-8 modes: /// /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls /// whether the pattern itself can contain sub-expressions that match invalid /// UTF-8. /// * [`thompson::Config::utf8`](crate::nfa::thompson::Config::utf8) controls /// how the regex iterators themselves advance the starting position of the /// next search when a match with zero length is found. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. /// /// Internally, building a regex requires building two DFAs, where one is /// responsible for finding the end of a match and the other is responsible /// for finding the start of a match. If you only need to detect whether /// something matched, or only the end of a match, then you should use a /// [`dense::Builder`] to construct a single DFA, which is cheaper than /// building two DFAs. /// /// # Build methods /// /// This builder has a few "build" methods. In general, it's the result of /// combining the following parameters: /// /// * Building one or many regexes. /// * Building a regex with dense or sparse DFAs. /// /// The simplest "build" method is [`Builder::build`]. It accepts a single /// pattern and builds a dense DFA using `usize` for the state identifier /// representation. /// /// The most general "build" method is [`Builder::build_many`], which permits /// building a regex that searches for multiple patterns simultaneously while /// using a specific state identifier representation. /// /// The most flexible "build" method, but hardest to use, is /// [`Builder::build_from_dfas`]. This exposes the fact that a [`Regex`] is /// just a pair of DFAs, and this method allows you to specify those DFAs /// exactly. /// /// # Example /// /// This example shows how to disable UTF-8 mode in the syntax and the regex /// itself. This is generally what you want for matching on arbitrary bytes. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// let got = re.find(haystack); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 /// // on the syntax permits this. /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { #[cfg(feature = "dfa-build")] dfa: dense::Builder, } impl Builder { /// Create a new regex builder with the default configuration. pub fn new() -> Builder { Builder { #[cfg(feature = "dfa-build")] dfa: dense::Builder::new(), } } /// Build a regex from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Build a regex from the given pattern using sparse DFAs. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_sparse( &self, pattern: &str, ) -> Result>>, BuildError> { self.build_many_sparse(&[pattern]) } /// Build a regex from the given patterns. 
#[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_many>( &self, patterns: &[P], ) -> Result { let forward = self.dfa.build_many(patterns)?; let reverse = self .dfa .clone() .configure( dense::Config::new() .prefilter(None) .specialize_start_states(false) .start_kind(StartKind::Anchored) .match_kind(MatchKind::All), ) .thompson(crate::nfa::thompson::Config::new().reverse(true)) .build_many(patterns)?; Ok(self.build_from_dfas(forward, reverse)) } /// Build a sparse regex from the given patterns. #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn build_many_sparse>( &self, patterns: &[P], ) -> Result>>, BuildError> { let re = self.build_many(patterns)?; let forward = re.forward().to_sparse()?; let reverse = re.reverse().to_sparse()?; Ok(self.build_from_dfas(forward, reverse)) } /// Build a regex from its component forward and reverse DFAs. /// /// This is useful when deserializing a regex from some arbitrary /// memory region. This is also useful for building regexes from other /// types of DFAs. /// /// If you're building the DFAs from scratch instead of building new DFAs /// from other DFAs, then you'll need to make sure that the reverse DFA is /// configured correctly to match the intended semantics. Namely: /// /// * It should be anchored. /// * It should use [`MatchKind::All`] semantics. /// * It should match in reverse. /// * Otherwise, its configuration should match the forward DFA. /// /// If these conditions aren't satisfied, then the behavior of searches is /// unspecified. /// /// Note that when using this constructor, no configuration is applied. /// Since this routine provides the DFAs to the builder, there is no /// opportunity to apply other configuration options. /// /// # Example /// /// This example is a bit contrived. The usual use of these methods /// would involve serializing `initial_re` somewhere and then deserializing /// it later to build a regex. But in this case, we do everything in /// memory.
/// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// let initial_re = Regex::new("foo[0-9]+")?; /// assert_eq!(true, initial_re.is_match(b"foo123")); /// /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); /// let re = Regex::builder().build_from_dfas(fwd, rev); /// assert_eq!(true, re.is_match(b"foo123")); /// # Ok::<(), Box>(()) /// ``` /// /// This example shows how to build a `Regex` that uses sparse DFAs instead /// of dense DFAs without using one of the convenience `build_sparse` /// routines: /// /// ``` /// use regex_automata::dfa::regex::Regex; /// /// let initial_re = Regex::new("foo[0-9]+")?; /// assert_eq!(true, initial_re.is_match(b"foo123")); /// /// let fwd = initial_re.forward().to_sparse()?; /// let rev = initial_re.reverse().to_sparse()?; /// let re = Regex::builder().build_from_dfas(fwd, rev); /// assert_eq!(true, re.is_match(b"foo123")); /// # Ok::<(), Box>(()) /// ``` pub fn build_from_dfas( &self, forward: A, reverse: A, ) -> Regex { Regex { forward, reverse } } /// Set the syntax configuration for this builder using /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.dfa.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). /// /// This permits setting things like whether additional time should be /// spent shrinking the size of the NFA. #[cfg(all(feature = "syntax", feature = "dfa-build"))] pub fn thompson( &mut self, config: crate::nfa::thompson::Config, ) -> &mut Builder { self.dfa.thompson(config); self } /// Set the dense DFA compilation configuration for this builder using /// [`dense::Config`]. /// /// This permits setting things like whether the underlying DFAs should /// be minimized. 
#[cfg(feature = "dfa-build")] pub fn dense(&mut self, config: dense::Config) -> &mut Builder { self.dfa.configure(config); self } } impl Default for Builder { fn default() -> Builder { Builder::new() } } regex-automata-0.4.9/src/dfa/remapper.rs000064400000000000000000000233461046102023000162730ustar 00000000000000use alloc::vec::Vec; use crate::util::primitives::StateID; /// Remappable is a tightly coupled abstraction that facilitates remapping /// state identifiers in DFAs. /// /// The main idea behind remapping state IDs is that DFAs often need to check /// if a certain state is a "special" state of some kind (like a match state) /// during a search. Since this is extremely perf critical code, we want this /// check to be as fast as possible. Partitioning state IDs into, for example, /// "non-match" and "match" states means one can tell if a state is a /// match state via a simple comparison of the state ID. /// /// The issue is that during the DFA construction process, it's not /// particularly easy to partition the states. Instead, the simplest thing is /// to often just do a pass over all of the states and shuffle them into their /// desired partitionings. To do that, we need a mechanism for swapping states. /// Hence, this abstraction. /// /// Normally, for such little code, I would just duplicate it. But this is a /// key optimization and the implementation is a bit subtle. So the abstraction /// is basically a ham-fisted attempt at DRY. The only place we use this is in /// the dense and one-pass DFAs. /// /// See also src/dfa/special.rs for a more detailed explanation of how dense /// DFAs are partitioned. pub(super) trait Remappable: core::fmt::Debug { /// Return the total number of states. fn state_len(&self) -> usize; /// Return the power-of-2 exponent that yields the stride. The pertinent /// laws here are, where N=stride2: 2^N=stride and len(alphabet) <= stride. fn stride2(&self) -> usize; /// Swap the states pointed to by the given IDs.
The underlying finite /// state machine should be mutated such that all of the transitions in /// `id1` are now in the memory region where the transitions for `id2` /// were, and all of the transitions in `id2` are now in the memory region /// where the transitions for `id1` were. /// /// Essentially, this "moves" `id1` to `id2` and `id2` to `id1`. /// /// It is expected that, after calling this, the underlying value will be /// left in an inconsistent state, since any other transitions pointing to, /// e.g., `id1` need to be updated to point to `id2`, since that's where /// `id1` moved to. /// /// In order to "fix" the underlying inconsistent state, a `Remapper` /// should be used to guarantee that `remap` is called at the appropriate /// time. fn swap_states(&mut self, id1: StateID, id2: StateID); /// This must remap every single state ID in the underlying value according /// to the function given. For example, in a DFA, this should remap every /// transition and every starting state ID. fn remap(&mut self, map: impl Fn(StateID) -> StateID); } /// Remapper is an abstraction that manages the remapping of state IDs in a /// finite state machine. This is useful when one wants to shuffle states into /// different positions in the machine. /// /// One of the key complexities this manages is the ability to correctly move /// one state multiple times. /// /// Once shuffling is complete, `remap` must be called, which will rewrite /// all pertinent transitions to updated state IDs. Neglecting to call `remap` /// will almost certainly result in a corrupt machine. #[derive(Debug)] pub(super) struct Remapper { /// A map from the index of a state to its pre-multiplied identifier. /// /// When a state is swapped with another, then their corresponding /// locations in this map are also swapped. Thus, its new position will /// still point to its old pre-multiplied StateID.
/// /// While there is a bit more to it, this then allows us to rewrite the /// state IDs in a DFA's transition table in a single pass. This is done /// by iterating over every ID in this map, then iterating over each /// transition for the state at that ID and re-mapping the transition from /// `old_id` to `map[dfa.to_index(old_id)]`. That is, we find the position /// in this map where `old_id` *started*, and set it to where it ended up /// after all swaps have been completed. map: Vec, /// A mapper from state index to state ID (and back). idxmap: IndexMapper, } impl Remapper { /// Create a new remapper from the given remappable implementation. The /// remapper can then be used to swap states. The remappable value given /// here must be the same one given to `swap` and `remap`. pub(super) fn new(r: &impl Remappable) -> Remapper { let idxmap = IndexMapper { stride2: r.stride2() }; let map = (0..r.state_len()).map(|i| idxmap.to_state_id(i)).collect(); Remapper { map, idxmap } } /// Swap two states. Once this is called, callers must follow through to /// call `remap`, or else it's possible for the underlying remappable /// value to be in a corrupt state. pub(super) fn swap( &mut self, r: &mut impl Remappable, id1: StateID, id2: StateID, ) { if id1 == id2 { return; } r.swap_states(id1, id2); self.map.swap(self.idxmap.to_index(id1), self.idxmap.to_index(id2)); } /// Complete the remapping process by rewriting all state IDs in the /// remappable value according to the swaps performed. pub(super) fn remap(mut self, r: &mut impl Remappable) { // Update the map to account for states that have been swapped // multiple times. For example, if (A, C) and (C, G) are swapped, then // transitions previously pointing to A should now point to G. But if // we don't update our map, they will erroneously be set to C. All we // do is follow the swaps in our map until we see our original state // ID.
// // The intuition here is to think about how changes are made to the // map: only through pairwise swaps. That means that starting at any // given state, it is always possible to find the loop back to that // state by following the swaps represented in the map (which might be // 0 swaps). // // We are also careful to clone the map before starting in order to // freeze it. We use the frozen map to find our loops, since we need to // update our map as well. Without freezing it, our updates could break // the loops referenced above and produce incorrect results. let oldmap = self.map.clone(); for i in 0..r.state_len() { let cur_id = self.idxmap.to_state_id(i); let mut new_id = oldmap[i]; if cur_id == new_id { continue; } loop { let id = oldmap[self.idxmap.to_index(new_id)]; if cur_id == id { self.map[i] = new_id; break; } new_id = id; } } r.remap(|next| self.map[self.idxmap.to_index(next)]); } } /// A simple type for mapping between state indices and state IDs. /// /// The reason why this exists is because state IDs are "premultiplied." That /// is, in order to get to the transitions for a particular state, one need /// only use the state ID as-is, instead of having to multiply it by the /// transition table's stride. /// /// The downside of this is that it's inconvenient to map between state IDs /// using a dense map, e.g., `Vec<StateID>`. That's because state IDs look like /// `0`, `0+stride`, `0+2*stride`, `0+3*stride`, etc., instead of `0`, `1`, /// `2`, `3`, etc. /// /// Since our state IDs are premultiplied, we can convert back-and-forth /// between IDs and indices by simply unmultiplying the IDs and multiplying the /// indices. #[derive(Debug)] struct IndexMapper { /// The power of 2 corresponding to the stride of the corresponding /// transition table. 'id >> stride2' de-multiplies an ID while 'index << /// stride2' pre-multiplies an index to an ID. stride2: usize, } impl IndexMapper { /// Convert a state ID to a state index.
fn to_index(&self, id: StateID) -> usize { id.as_usize() >> self.stride2 } /// Convert a state index to a state ID. fn to_state_id(&self, index: usize) -> StateID { // CORRECTNESS: If the given index is not valid, then it is not // required for this to panic or return a valid state ID. We'll "just" // wind up with panics or silent logic errors at some other point. StateID::new_unchecked(index << self.stride2) } } #[cfg(feature = "dfa-build")] mod dense { use crate::{dfa::dense::OwnedDFA, util::primitives::StateID}; use super::Remappable; impl Remappable for OwnedDFA { fn state_len(&self) -> usize { OwnedDFA::state_len(self) } fn stride2(&self) -> usize { OwnedDFA::stride2(self) } fn swap_states(&mut self, id1: StateID, id2: StateID) { OwnedDFA::swap_states(self, id1, id2) } fn remap(&mut self, map: impl Fn(StateID) -> StateID) { OwnedDFA::remap(self, map) } } } #[cfg(feature = "dfa-onepass")] mod onepass { use crate::{dfa::onepass::DFA, util::primitives::StateID}; use super::Remappable; impl Remappable for DFA { fn state_len(&self) -> usize { DFA::state_len(self) } fn stride2(&self) -> usize { // We don't do pre-multiplication for the one-pass DFA, so // returning 0 has the effect of making state IDs and state indices // equivalent. 
0 } fn swap_states(&mut self, id1: StateID, id2: StateID) { DFA::swap_states(self, id1, id2) } fn remap(&mut self, map: impl Fn(StateID) -> StateID) { DFA::remap(self, map) } } } regex-automata-0.4.9/src/dfa/search.rs000064400000000000000000000574701046102023000157320ustar 00000000000000use crate::{ dfa::{ accel, automaton::{Automaton, OverlappingState}, }, util::{ prefilter::Prefilter, primitives::StateID, search::{Anchored, HalfMatch, Input, Span}, }, MatchError, }; #[inline(never)] pub fn find_fwd( dfa: &A, input: &Input<'_>, ) -> Result, MatchError> { if input.is_done() { return Ok(None); } let pre = if input.get_anchored().is_anchored() { None } else { dfa.get_prefilter() }; // Searching with a pattern ID is always anchored, so we should never use // a prefilter. if pre.is_some() { if input.get_earliest() { find_fwd_imp(dfa, input, pre, true) } else { find_fwd_imp(dfa, input, pre, false) } } else { if input.get_earliest() { find_fwd_imp(dfa, input, None, true) } else { find_fwd_imp(dfa, input, None, false) } } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_fwd_imp( dfa: &A, input: &Input<'_>, pre: Option<&'_ Prefilter>, earliest: bool, ) -> Result, MatchError> { // See 'prefilter_restart' docs for explanation. let universal_start = dfa.universal_start_state(Anchored::No).is_some(); let mut mat = None; let mut sid = init_fwd(dfa, input)?; let mut at = input.start(); // This could just be a closure, but then I think it would be unsound // because it would need to be safe to invoke. This way, the lack of safety // is clearer in the code below. macro_rules! next_unchecked { ($sid:expr, $at:expr) => {{ let byte = *input.haystack().get_unchecked($at); dfa.next_state_unchecked($sid, byte) }}; } if let Some(ref pre) = pre { let span = Span::from(at..input.end()); // If a prefilter doesn't report false positives, then we don't need to // touch the DFA at all. 
However, since all matches include the pattern // ID, and the prefilter infrastructure doesn't report pattern IDs, we // limit this optimization to cases where there is exactly one pattern. // In that case, any match must be the 0th pattern. match pre.find(input.haystack(), span) { None => return Ok(mat), Some(ref span) => { at = span.start; if !universal_start { sid = prefilter_restart(dfa, &input, at)?; } } } } while at < input.end() { // SAFETY: There are two safety invariants we need to uphold here in // the loops below: that 'sid' and 'prev_sid' are valid state IDs // for this DFA, and that 'at' is a valid index into 'haystack'. // For the former, we rely on the invariant that next_state* and // start_state_forward always returns a valid state ID (given a valid // state ID in the former case). For the latter safety invariant, we // always guard unchecked access with a check that 'at' is less than // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we // ensure that 'at' is always in bounds. // // PERF: See a similar comment in src/hybrid/search.rs that justifies // this extra work to make the search loop fast. The same reasoning and // benchmarks apply here. 
let mut prev_sid; while at < input.end() { prev_sid = unsafe { next_unchecked!(sid, at) }; if dfa.is_special_state(prev_sid) || at + 3 >= input.end() { core::mem::swap(&mut prev_sid, &mut sid); break; } at += 1; sid = unsafe { next_unchecked!(prev_sid, at) }; if dfa.is_special_state(sid) { break; } at += 1; prev_sid = unsafe { next_unchecked!(sid, at) }; if dfa.is_special_state(prev_sid) { core::mem::swap(&mut prev_sid, &mut sid); break; } at += 1; sid = unsafe { next_unchecked!(prev_sid, at) }; if dfa.is_special_state(sid) { break; } at += 1; } if dfa.is_special_state(sid) { if dfa.is_start_state(sid) { if let Some(ref pre) = pre { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => return Ok(mat), Some(ref span) => { // We want to skip any update to 'at' below // at the end of this iteration and just // jump immediately back to the next state // transition at the leading position of the // candidate match. // // ... but only if we actually made progress // with our prefilter, otherwise if the start // state has a self-loop, we can get stuck. 
if span.start > at { at = span.start; if !universal_start { sid = prefilter_restart(dfa, &input, at)?; } continue; } } } } else if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); at = accel::find_fwd(needles, input.haystack(), at + 1) .unwrap_or(input.end()); continue; } } else if dfa.is_match_state(sid) { let pattern = dfa.match_pattern(sid, 0); mat = Some(HalfMatch::new(pattern, at)); if earliest { return Ok(mat); } if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); at = accel::find_fwd(needles, input.haystack(), at + 1) .unwrap_or(input.end()); continue; } } else if dfa.is_accel_state(sid) { let needs = dfa.accelerator(sid); at = accel::find_fwd(needs, input.haystack(), at + 1) .unwrap_or(input.end()); continue; } else if dfa.is_dead_state(sid) { return Ok(mat); } else { // It's important that this is a debug_assert, since this can // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA. return Err(MatchError::quit(input.haystack()[at], at)); } } at += 1; } eoi_fwd(dfa, input, &mut sid, &mut mat)?; Ok(mat) } #[inline(never)] pub fn find_rev( dfa: &A, input: &Input<'_>, ) -> Result, MatchError> { if input.is_done() { return Ok(None); } if input.get_earliest() { find_rev_imp(dfa, input, true) } else { find_rev_imp(dfa, input, false) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_rev_imp( dfa: &A, input: &Input<'_>, earliest: bool, ) -> Result, MatchError> { let mut mat = None; let mut sid = init_rev(dfa, input)?; // In reverse search, the loop below can't handle the case of searching an // empty slice. Ideally we could write something congruent to the forward // search, i.e., 'while at >= start', but 'start' might be 0. Since we use // an unsigned offset, 'at >= 0' is trivially always true. We could avoid // this extra case handling by using a signed offset, but Rust makes it // annoying to do. So... We just handle the empty case separately. 
if input.start() == input.end() { eoi_rev(dfa, input, &mut sid, &mut mat)?; return Ok(mat); } let mut at = input.end() - 1; macro_rules! next_unchecked { ($sid:expr, $at:expr) => {{ let byte = *input.haystack().get_unchecked($at); dfa.next_state_unchecked($sid, byte) }}; } loop { // SAFETY: See comments in 'find_fwd' for a safety argument. let mut prev_sid; while at >= input.start() { prev_sid = unsafe { next_unchecked!(sid, at) }; if dfa.is_special_state(prev_sid) || at <= input.start().saturating_add(3) { core::mem::swap(&mut prev_sid, &mut sid); break; } at -= 1; sid = unsafe { next_unchecked!(prev_sid, at) }; if dfa.is_special_state(sid) { break; } at -= 1; prev_sid = unsafe { next_unchecked!(sid, at) }; if dfa.is_special_state(prev_sid) { core::mem::swap(&mut prev_sid, &mut sid); break; } at -= 1; sid = unsafe { next_unchecked!(prev_sid, at) }; if dfa.is_special_state(sid) { break; } at -= 1; } if dfa.is_special_state(sid) { if dfa.is_start_state(sid) { if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); at = accel::find_rev(needles, input.haystack(), at) .map(|i| i + 1) .unwrap_or(input.start()); } } else if dfa.is_match_state(sid) { let pattern = dfa.match_pattern(sid, 0); // Since reverse searches report the beginning of a match // and the beginning is inclusive (not exclusive like the // end of a match), we add 1 to make it inclusive. mat = Some(HalfMatch::new(pattern, at + 1)); if earliest { return Ok(mat); } if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); at = accel::find_rev(needles, input.haystack(), at) .map(|i| i + 1) .unwrap_or(input.start()); } } else if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); // If the accelerator returns nothing, why don't we quit the // search? Well, if the accelerator doesn't find anything, that // doesn't mean we don't have a match. It just means that we // can't leave the current state given one of the 255 possible // byte values. However, there might be an EOI transition. 
So // we set 'at' to the end of the haystack, which will cause // this loop to stop and fall down into the EOI transition. at = accel::find_rev(needles, input.haystack(), at) .map(|i| i + 1) .unwrap_or(input.start()); } else if dfa.is_dead_state(sid) { return Ok(mat); } else { return Err(MatchError::quit(input.haystack()[at], at)); } } if at == input.start() { break; } at -= 1; } eoi_rev(dfa, input, &mut sid, &mut mat)?; Ok(mat) } #[inline(never)] pub fn find_overlapping_fwd( dfa: &A, input: &Input<'_>, state: &mut OverlappingState, ) -> Result<(), MatchError> { state.mat = None; if input.is_done() { return Ok(()); } let pre = if input.get_anchored().is_anchored() { None } else { dfa.get_prefilter() }; if pre.is_some() { find_overlapping_fwd_imp(dfa, input, pre, state) } else { find_overlapping_fwd_imp(dfa, input, None, state) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_overlapping_fwd_imp( dfa: &A, input: &Input<'_>, pre: Option<&'_ Prefilter>, state: &mut OverlappingState, ) -> Result<(), MatchError> { // See 'prefilter_restart' docs for explanation. let universal_start = dfa.universal_start_state(Anchored::No).is_some(); let mut sid = match state.id { None => { state.at = input.start(); init_fwd(dfa, input)? } Some(sid) => { if let Some(match_index) = state.next_match_index { let match_len = dfa.match_len(sid); if match_index < match_len { state.next_match_index = Some(match_index + 1); let pattern = dfa.match_pattern(sid, match_index); state.mat = Some(HalfMatch::new(pattern, state.at)); return Ok(()); } } // Once we've reported all matches at a given position, we need to // advance the search to the next position. state.at += 1; if state.at > input.end() { return Ok(()); } sid } }; // NOTE: We don't optimize the crap out of this routine primarily because // it seems like most find_overlapping searches will have higher match // counts, and thus, throughput is perhaps not as important. 
But if you // have a use case for something faster, feel free to file an issue. while state.at < input.end() { sid = dfa.next_state(sid, input.haystack()[state.at]); if dfa.is_special_state(sid) { state.id = Some(sid); if dfa.is_start_state(sid) { if let Some(ref pre) = pre { let span = Span::from(state.at..input.end()); match pre.find(input.haystack(), span) { None => return Ok(()), Some(ref span) => { if span.start > state.at { state.at = span.start; if !universal_start { sid = prefilter_restart( dfa, &input, state.at, )?; } continue; } } } } else if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); state.at = accel::find_fwd( needles, input.haystack(), state.at + 1, ) .unwrap_or(input.end()); continue; } } else if dfa.is_match_state(sid) { state.next_match_index = Some(1); let pattern = dfa.match_pattern(sid, 0); state.mat = Some(HalfMatch::new(pattern, state.at)); return Ok(()); } else if dfa.is_accel_state(sid) { let needs = dfa.accelerator(sid); // If the accelerator returns nothing, why don't we quit the // search? Well, if the accelerator doesn't find anything, that // doesn't mean we don't have a match. It just means that we // can't leave the current state given one of the 255 possible // byte values. However, there might be an EOI transition. So // we set 'at' to the end of the haystack, which will cause // this loop to stop and fall down into the EOI transition. state.at = accel::find_fwd(needs, input.haystack(), state.at + 1) .unwrap_or(input.end()); continue; } else if dfa.is_dead_state(sid) { return Ok(()); } else { return Err(MatchError::quit( input.haystack()[state.at], state.at, )); } } state.at += 1; } let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat); state.id = Some(sid); if state.mat.is_some() { // '1' is always correct here since if we get to this point, this // always corresponds to the first (index '0') match discovered at // this position. So the next match to report at this position (if // it exists) is at index '1'. 
state.next_match_index = Some(1); } result } #[inline(never)] pub(crate) fn find_overlapping_rev( dfa: &A, input: &Input<'_>, state: &mut OverlappingState, ) -> Result<(), MatchError> { state.mat = None; if input.is_done() { return Ok(()); } let mut sid = match state.id { None => { let sid = init_rev(dfa, input)?; state.id = Some(sid); if input.start() == input.end() { state.rev_eoi = true; } else { state.at = input.end() - 1; } sid } Some(sid) => { if let Some(match_index) = state.next_match_index { let match_len = dfa.match_len(sid); if match_index < match_len { state.next_match_index = Some(match_index + 1); let pattern = dfa.match_pattern(sid, match_index); state.mat = Some(HalfMatch::new(pattern, state.at)); return Ok(()); } } // Once we've reported all matches at a given position, we need // to advance the search to the next position. However, if we've // already followed the EOI transition, then we know we're done // with the search and there cannot be any more matches to report. if state.rev_eoi { return Ok(()); } else if state.at == input.start() { // At this point, we should follow the EOI transition. This // will cause us the skip the main loop below and fall through // to the final 'eoi_rev' transition. state.rev_eoi = true; } else { // We haven't hit the end of the search yet, so move on. 
state.at -= 1; } sid } }; while !state.rev_eoi { sid = dfa.next_state(sid, input.haystack()[state.at]); if dfa.is_special_state(sid) { state.id = Some(sid); if dfa.is_start_state(sid) { if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); state.at = accel::find_rev(needles, input.haystack(), state.at) .map(|i| i + 1) .unwrap_or(input.start()); } } else if dfa.is_match_state(sid) { state.next_match_index = Some(1); let pattern = dfa.match_pattern(sid, 0); state.mat = Some(HalfMatch::new(pattern, state.at + 1)); return Ok(()); } else if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); // If the accelerator returns nothing, why don't we quit the // search? Well, if the accelerator doesn't find anything, that // doesn't mean we don't have a match. It just means that we // can't leave the current state given one of the 255 possible // byte values. However, there might be an EOI transition. So // we set 'at' to the end of the haystack, which will cause // this loop to stop and fall down into the EOI transition. state.at = accel::find_rev(needles, input.haystack(), state.at) .map(|i| i + 1) .unwrap_or(input.start()); } else if dfa.is_dead_state(sid) { return Ok(()); } else { return Err(MatchError::quit( input.haystack()[state.at], state.at, )); } } if state.at == input.start() { break; } state.at -= 1; } let result = eoi_rev(dfa, input, &mut sid, &mut state.mat); state.rev_eoi = true; state.id = Some(sid); if state.mat.is_some() { // '1' is always correct here since if we get to this point, this // always corresponds to the first (index '0') match discovered at // this position. So the next match to report at this position (if // it exists) is at index '1'. state.next_match_index = Some(1); } result } #[cfg_attr(feature = "perf-inline", inline(always))] fn init_fwd( dfa: &A, input: &Input<'_>, ) -> Result { let sid = dfa.start_state_forward(input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. 
debug_assert!(!dfa.is_match_state(sid)); Ok(sid) } #[cfg_attr(feature = "perf-inline", inline(always))] fn init_rev( dfa: &A, input: &Input<'_>, ) -> Result { let sid = dfa.start_state_reverse(input)?; // Start states can never be match states, since all matches are delayed // by 1 byte. debug_assert!(!dfa.is_match_state(sid)); Ok(sid) } #[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_fwd( dfa: &A, input: &Input<'_>, sid: &mut StateID, mat: &mut Option, ) -> Result<(), MatchError> { let sp = input.get_span(); match input.haystack().get(sp.end) { Some(&b) => { *sid = dfa.next_state(*sid, b); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { return Err(MatchError::quit(b, sp.end)); } } None => { *sid = dfa.next_eoi_state(*sid); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } } } Ok(()) } #[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_rev( dfa: &A, input: &Input<'_>, sid: &mut StateID, mat: &mut Option, ) -> Result<(), MatchError> { let sp = input.get_span(); if sp.start > 0 { let byte = input.haystack()[sp.start - 1]; *sid = dfa.next_state(*sid, byte); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { return Err(MatchError::quit(byte, sp.start - 1)); } } else { *sid = dfa.next_eoi_state(*sid); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } } Ok(()) } /// Re-compute the starting state that a DFA should be in after finding a /// prefilter candidate match at the position `at`. /// /// The function with the same name has a bit more docs in hybrid/search.rs. 
#[cfg_attr(feature = "perf-inline", inline(always))] fn prefilter_restart( dfa: &A, input: &Input<'_>, at: usize, ) -> Result { let mut input = input.clone(); input.set_start(at); init_fwd(dfa, &input) } regex-automata-0.4.9/src/dfa/sparse.rs000064400000000000000000003067641046102023000157650ustar 00000000000000/*! Types and routines specific to sparse DFAs. This module is the home of [`sparse::DFA`](DFA). Unlike the [`dense`] module, this module does not contain a builder or configuration specific for sparse DFAs. Instead, the intended way to build a sparse DFA is either by using a default configuration with its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the construction of a dense DFA with [`dense::Builder`] and then calling [`dense::DFA::to_sparse`]. For example, this configures a sparse DFA to do an overlapping search: ``` use regex_automata::{ dfa::{Automaton, OverlappingState, dense}, HalfMatch, Input, MatchKind, }; let dense_re = dense::Builder::new() .configure(dense::Config::new().match_kind(MatchKind::All)) .build(r"Samwise|Sam")?; let sparse_re = dense_re.to_sparse()?; // Setup our haystack and initial start state. let input = Input::new("Samwise"); let mut state = OverlappingState::start(); // First, 'Sam' will match. sparse_re.try_search_overlapping_fwd(&input, &mut state)?; assert_eq!(Some(HalfMatch::must(0, 3)), state.get_match()); // And now 'Samwise' will match. 
sparse_re.try_search_overlapping_fwd(&input, &mut state)?; assert_eq!(Some(HalfMatch::must(0, 7)), state.get_match()); # Ok::<(), Box>(()) ``` */ #[cfg(feature = "dfa-build")] use core::iter; use core::{fmt, mem::size_of}; #[cfg(feature = "dfa-build")] use alloc::{vec, vec::Vec}; #[cfg(feature = "dfa-build")] use crate::dfa::dense::{self, BuildError}; use crate::{ dfa::{ automaton::{fmt_state_indicator, Automaton, StartError}, dense::Flags, special::Special, StartKind, DEAD, }, util::{ alphabet::{ByteClasses, ByteSet}, escape::DebugByte, int::{Pointer, Usize, U16, U32}, prefilter::Prefilter, primitives::{PatternID, StateID}, search::Anchored, start::{self, Start, StartByteMap}, wire::{self, DeserializeError, Endian, SerializeError}, }, }; const LABEL: &str = "rust-regex-automata-dfa-sparse"; const VERSION: u32 = 2; /// A sparse deterministic finite automaton (DFA) with variable sized states. /// /// In contrast to a [dense::DFA], a sparse DFA uses a more space efficient /// representation for its transitions. Consequently, sparse DFAs may use much /// less memory than dense DFAs, but this comes at a price. In particular, /// reading the more space efficient transitions takes more work, and /// consequently, searching using a sparse DFA is typically slower than a dense /// DFA. /// /// A sparse DFA can be built using the default configuration via the /// [`DFA::new`] constructor. Otherwise, one can configure various aspects of a /// dense DFA via [`dense::Builder`], and then convert a dense DFA to a sparse /// DFA using [`dense::DFA::to_sparse`]. /// /// In general, a sparse DFA supports all the same search operations as a dense /// DFA. /// /// Making the choice between a dense and sparse DFA depends on your specific /// work load. If you can sacrifice a bit of search time performance, then a /// sparse DFA might be the best choice. 
In particular, while sparse DFAs are /// probably always slower than dense DFAs, you may find that they are easily /// fast enough for your purposes! /// /// # Type parameters /// /// A `DFA` has one type parameter, `T`, which is used to represent the parts /// of a sparse DFA. `T` is typically a `Vec` or a `&[u8]`. /// /// # The `Automaton` trait /// /// This type implements the [`Automaton`] trait, which means it can be used /// for searching. For example: /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` #[derive(Clone)] pub struct DFA { // When compared to a dense DFA, a sparse DFA *looks* a lot simpler // representation-wise. In reality, it is perhaps more complicated. Namely, // in a dense DFA, all information needs to be very cheaply accessible // using only state IDs. In a sparse DFA however, each state uses a // variable amount of space because each state encodes more information // than just its transitions. Each state also includes an accelerator if // one exists, along with the matching pattern IDs if the state is a match // state. // // That is, a lot of the complexity is pushed down into how each state // itself is represented. tt: Transitions, st: StartTable, special: Special, pre: Option, quitset: ByteSet, flags: Flags, } #[cfg(feature = "dfa-build")] impl DFA> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. /// /// If you want a non-default configuration, then use the /// [`dense::Builder`] to set your own configuration, and then call /// [`dense::DFA::to_sparse`] to create a sparse DFA. 
/// /// # Example /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input}; /// /// let dfa = sparse::DFA::new("foo[0-9]+bar")?; /// /// let expected = Some(HalfMatch::must(0, 11)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result>, BuildError> { dense::Builder::new() .build(pattern) .and_then(|dense| dense.to_sparse()) } /// Parse the given regular expressions using a default configuration and /// return the corresponding multi-DFA. /// /// If you want a non-default configuration, then use the /// [`dense::Builder`] to set your own configuration, and then call /// [`dense::DFA::to_sparse`] to create a sparse DFA. /// /// # Example /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse}, HalfMatch, Input}; /// /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?; /// let expected = Some(HalfMatch::must(1, 3)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345bar"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new_many>( patterns: &[P], ) -> Result>, BuildError> { dense::Builder::new() .build_many(patterns) .and_then(|dense| dense.to_sparse()) } } #[cfg(feature = "dfa-build")] impl DFA> { /// Create a new DFA that matches every input. /// /// # Example /// /// ``` /// use regex_automata::{ /// dfa::{Automaton, sparse}, /// HalfMatch, Input, /// }; /// /// let dfa = sparse::DFA::always_match()?; /// /// let expected = Some(HalfMatch::must(0, 0)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(""))?); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box>(()) /// ``` pub fn always_match() -> Result>, BuildError> { dense::DFA::always_match()?.to_sparse() } /// Create a new sparse DFA that never matches any input. 
/// /// # Example /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse}, Input}; /// /// let dfa = sparse::DFA::never_match()?; /// assert_eq!(None, dfa.try_search_fwd(&Input::new(""))?); /// assert_eq!(None, dfa.try_search_fwd(&Input::new("foo"))?); /// # Ok::<(), Box>(()) /// ``` pub fn never_match() -> Result>, BuildError> { dense::DFA::never_match()?.to_sparse() } /// The implementation for constructing a sparse DFA from a dense DFA. pub(crate) fn from_dense>( dfa: &dense::DFA, ) -> Result>, BuildError> { // In order to build the transition table, we need to be able to write // state identifiers for each of the "next" transitions in each state. // Our state identifiers correspond to the byte offset in the // transition table at which the state is encoded. Therefore, we do not // actually know what the state identifiers are until we've allocated // exactly as much space as we need for each state. Thus, construction // of the transition table happens in two passes. // // In the first pass, we fill out the shell of each state, which // includes the transition length, the input byte ranges and // zero-filled space for the transitions and accelerators, if present. // In this first pass, we also build up a map from the state identifier // index of the dense DFA to the state identifier in this sparse DFA. // // In the second pass, we fill in the transitions based on the map // built in the first pass. // The capacity given here reflects a minimum. (Well, the true minimum // is likely even bigger, but hopefully this saves a few reallocs.) let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_len()); // This maps state indices from the dense DFA to StateIDs in the sparse // DFA. We build out this map on the first pass, and then use it in the // second pass to back-fill our transitions. 
let mut remap: Vec = vec![DEAD; dfa.state_len()]; for state in dfa.states() { let pos = sparse.len(); remap[dfa.to_index(state.id())] = StateID::new(pos) .map_err(|_| BuildError::too_many_states())?; // zero-filled space for the transition length sparse.push(0); sparse.push(0); let mut transition_len = 0; for (unit1, unit2, _) in state.sparse_transitions() { match (unit1.as_u8(), unit2.as_u8()) { (Some(b1), Some(b2)) => { transition_len += 1; sparse.push(b1); sparse.push(b2); } (None, None) => {} (Some(_), None) | (None, Some(_)) => { // can never occur because sparse_transitions never // groups EOI with any other transition. unreachable!() } } } // Add dummy EOI transition. This is never actually read while // searching, but having space equivalent to the total number // of transitions is convenient. Otherwise, we'd need to track // a different number of transitions for the byte ranges as for // the 'next' states. // // N.B. The loop above is not guaranteed to yield the EOI // transition, since it may point to a DEAD state. By putting // it here, we always write the EOI transition, and thus // guarantee that our transition length is >0. Why do we always // need the EOI transition? Because in order to implement // Automaton::next_eoi_state, this lets us just ask for the last // transition. There are probably other/better ways to do this. transition_len += 1; sparse.push(0); sparse.push(0); // Check some assumptions about transition length. assert_ne!( transition_len, 0, "transition length should be non-zero", ); assert!( transition_len <= 257, "expected transition length {} to be <= 257", transition_len, ); // Fill in the transition length. // Since transition length is always <= 257, we use the most // significant bit to indicate whether this is a match state or // not. let ntrans = if dfa.is_match_state(state.id()) { transition_len | (1 << 15) } else { transition_len }; wire::NE::write_u16(ntrans, &mut sparse[pos..]); // zero-fill the actual transitions. 
// Unwraps are OK since transition_length <= 257 and our minimum // support usize size is 16-bits. let zeros = usize::try_from(transition_len) .unwrap() .checked_mul(StateID::SIZE) .unwrap(); sparse.extend(iter::repeat(0).take(zeros)); // If this is a match state, write the pattern IDs matched by this // state. if dfa.is_match_state(state.id()) { let plen = dfa.match_pattern_len(state.id()); // Write the actual pattern IDs with a u32 length prefix. // First, zero-fill space. let mut pos = sparse.len(); // Unwraps are OK since it's guaranteed that plen <= // PatternID::LIMIT, which is in turn guaranteed to fit into a // u32. let zeros = size_of::() .checked_mul(plen) .unwrap() .checked_add(size_of::()) .unwrap(); sparse.extend(iter::repeat(0).take(zeros)); // Now write the length prefix. wire::NE::write_u32( // Will never fail since u32::MAX is invalid pattern ID. // Thus, the number of pattern IDs is representable by a // u32. plen.try_into().expect("pattern ID length fits in u32"), &mut sparse[pos..], ); pos += size_of::(); // Now write the pattern IDs. for &pid in dfa.pattern_id_slice(state.id()) { pos += wire::write_pattern_id::( pid, &mut sparse[pos..], ); } } // And now add the accelerator, if one exists. An accelerator is // at most 4 bytes and at least 1 byte. The first byte is the // length, N. N bytes follow the length. The set of bytes that // follow correspond (exhaustively) to the bytes that must be seen // to leave this state. let accel = dfa.accelerator(state.id()); sparse.push(accel.len().try_into().unwrap()); sparse.extend_from_slice(accel); } let mut new = DFA { tt: Transitions { sparse, classes: dfa.byte_classes().clone(), state_len: dfa.state_len(), pattern_len: dfa.pattern_len(), }, st: StartTable::from_dense_dfa(dfa, &remap)?, special: dfa.special().remap(|id| remap[dfa.to_index(id)]), pre: dfa.get_prefilter().map(|p| p.clone()), quitset: dfa.quitset().clone(), flags: dfa.flags().clone(), }; // And here's our second pass. 
Iterate over all of the dense states // again, and update the transitions in each of the states in the // sparse DFA. for old_state in dfa.states() { let new_id = remap[dfa.to_index(old_state.id())]; let mut new_state = new.tt.state_mut(new_id); let sparse = old_state.sparse_transitions(); for (i, (_, _, next)) in sparse.enumerate() { let next = remap[dfa.to_index(next)]; new_state.set_next_at(i, next); } } debug!( "created sparse DFA, memory usage: {} (dense memory usage: {})", new.memory_usage(), dfa.memory_usage(), ); Ok(new) } } impl> DFA { /// Cheaply return a borrowed version of this sparse DFA. Specifically, the /// DFA returned always uses `&[u8]` for its transitions. pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> { DFA { tt: self.tt.as_ref(), st: self.st.as_ref(), special: self.special, pre: self.pre.clone(), quitset: self.quitset, flags: self.flags, } } /// Return an owned version of this sparse DFA. Specifically, the DFA /// returned always uses `Vec` for its transitions. /// /// Effectively, this returns a sparse DFA whose transitions live on the /// heap. #[cfg(feature = "alloc")] pub fn to_owned(&self) -> DFA> { DFA { tt: self.tt.to_owned(), st: self.st.to_owned(), special: self.special, pre: self.pre.clone(), quitset: self.quitset, flags: self.flags, } } /// Returns the starting state configuration for this DFA. /// /// The default is [`StartKind::Both`], which means the DFA supports both /// unanchored and anchored searches. However, this can generally lead to /// bigger DFAs. Therefore, a DFA might be compiled with support for just /// unanchored or anchored searches. In that case, running a search with /// an unsupported configuration will panic. pub fn start_kind(&self) -> StartKind { self.st.kind } /// Returns true only if this DFA has starting states for each pattern. /// /// When a DFA has starting states for each pattern, then a search with the /// DFA can be configured to only look for anchored matches of a specific /// pattern. 
Specifically, APIs like [`Automaton::try_search_fwd`] can /// accept a [`Anchored::Pattern`] if and only if this method returns true. /// Otherwise, an error will be returned. /// /// Note that if the DFA is empty, this always returns false. pub fn starts_for_each_pattern(&self) -> bool { self.st.pattern_len.is_some() } /// Returns the equivalence classes that make up the alphabet for this DFA. /// /// Unless [`dense::Config::byte_classes`] was disabled, it is possible /// that multiple distinct bytes are grouped into the same equivalence /// class if it is impossible for them to discriminate between a match and /// a non-match. This has the effect of reducing the overall alphabet size /// and in turn potentially substantially reducing the size of the DFA's /// transition table. /// /// The downside of using equivalence classes like this is that every state /// transition will automatically use this map to convert an arbitrary /// byte to its corresponding equivalence class. In practice this has a /// negligible impact on performance. pub fn byte_classes(&self) -> &ByteClasses { &self.tt.classes } /// Returns the memory usage, in bytes, of this DFA. /// /// The memory usage is computed based on the number of bytes used to /// represent this DFA. /// /// This does **not** include the stack size used up by this DFA. To /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { self.tt.memory_usage() + self.st.memory_usage() } } /// Routines for converting a sparse DFA to other representations, such as raw /// bytes suitable for persistent storage. impl> DFA { /// Serialize this DFA as raw bytes to a `Vec` in little endian /// format. 
/// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// Note that unlike a [`dense::DFA`]'s serialization methods, this does /// not add any initial padding to the returned bytes. Padding isn't /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// /// This example shows how to serialize and deserialize a DFA: /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// // N.B. We use native endianness here to make the example work, but /// // using to_bytes_little_endian would work on a little endian target. /// let buf = original_dfa.to_bytes_native_endian(); /// // Even if buf has initial padding, DFA::from_bytes will automatically /// // ignore it. /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "dfa-build")] pub fn to_bytes_little_endian(&self) -> Vec { self.to_bytes::() } /// Serialize this DFA as raw bytes to a `Vec` in big endian /// format. /// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// Note that unlike a [`dense::DFA`]'s serialization methods, this does /// not add any initial padding to the returned bytes. 
Padding isn't /// required for sparse DFAs since they have no alignment requirements. /// /// # Example /// /// This example shows how to serialize and deserialize a DFA: /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// // N.B. We use native endianness here to make the example work, but /// // using to_bytes_big_endian would work on a big endian target. /// let buf = original_dfa.to_bytes_native_endian(); /// // Even if buf has initial padding, DFA::from_bytes will automatically /// // ignore it. /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "dfa-build")] pub fn to_bytes_big_endian(&self) -> Vec { self.to_bytes::() } /// Serialize this DFA as raw bytes to a `Vec` in native endian /// format. /// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// Note that unlike a [`dense::DFA`]'s serialization methods, this does /// not add any initial padding to the returned bytes. Padding isn't /// required for sparse DFAs since they have no alignment requirements. /// /// Generally speaking, native endian format should only be used when /// you know that the target you're compiling the DFA for matches the /// endianness of the target on which you're compiling DFA. For example, /// if serialization and deserialization happen in the same process or on /// the same machine. 
Otherwise, when serializing a DFA for use in a /// portable environment, you'll almost certainly want to serialize _both_ /// a little endian and a big endian version and then load the correct one /// based on the target's configuration. /// /// # Example /// /// This example shows how to serialize and deserialize a DFA: /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// let buf = original_dfa.to_bytes_native_endian(); /// // Even if buf has initial padding, DFA::from_bytes will automatically /// // ignore it. /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "dfa-build")] pub fn to_bytes_native_endian(&self) -> Vec { self.to_bytes::() } /// The implementation of the public `to_bytes` serialization methods, /// which is generic over endianness. #[cfg(feature = "dfa-build")] fn to_bytes(&self) -> Vec { let mut buf = vec![0; self.write_to_len()]; // This should always succeed since the only possible serialization // error is providing a buffer that's too small, but we've ensured that // `buf` is big enough here. self.write_to::(&mut buf).unwrap(); buf } /// Serialize this DFA as raw bytes to the given slice, in little endian /// format. Upon success, the total number of bytes written to `dst` is /// returned. /// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// # Errors /// /// This returns an error if the given destination slice is not big enough /// to contain the full serialized DFA. 
If an error occurs, then nothing /// is written to `dst`. /// /// # Example /// /// This example shows how to serialize and deserialize a DFA without /// dynamic memory allocation. /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// // Create a 4KB buffer on the stack to store our serialized DFA. /// let mut buf = [0u8; 4 * (1<<10)]; /// // N.B. We use native endianness here to make the example work, but /// // using write_to_little_endian would work on a little endian target. /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_little_endian( &self, dst: &mut [u8], ) -> Result { self.write_to::(dst) } /// Serialize this DFA as raw bytes to the given slice, in big endian /// format. Upon success, the total number of bytes written to `dst` is /// returned. /// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// # Errors /// /// This returns an error if the given destination slice is not big enough /// to contain the full serialized DFA. If an error occurs, then nothing /// is written to `dst`. /// /// # Example /// /// This example shows how to serialize and deserialize a DFA without /// dynamic memory allocation. /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. 
/// let original_dfa = DFA::new("foo[0-9]+")?; /// /// // Create a 4KB buffer on the stack to store our serialized DFA. /// let mut buf = [0u8; 4 * (1<<10)]; /// // N.B. We use native endianness here to make the example work, but /// // using write_to_big_endian would work on a big endian target. /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_big_endian( &self, dst: &mut [u8], ) -> Result { self.write_to::(dst) } /// Serialize this DFA as raw bytes to the given slice, in native endian /// format. Upon success, the total number of bytes written to `dst` is /// returned. /// /// The written bytes are guaranteed to be deserialized correctly and /// without errors in a semver compatible release of this crate by a /// `DFA`'s deserialization APIs (assuming all other criteria for the /// deserialization APIs has been satisfied): /// /// * [`DFA::from_bytes`] /// * [`DFA::from_bytes_unchecked`] /// /// Generally speaking, native endian format should only be used when /// you know that the target you're compiling the DFA for matches the /// endianness of the target on which you're compiling DFA. For example, /// if serialization and deserialization happen in the same process or on /// the same machine. Otherwise, when serializing a DFA for use in a /// portable environment, you'll almost certainly want to serialize _both_ /// a little endian and a big endian version and then load the correct one /// based on the target's configuration. /// /// # Errors /// /// This returns an error if the given destination slice is not big enough /// to contain the full serialized DFA. If an error occurs, then nothing /// is written to `dst`. 
/// /// # Example /// /// This example shows how to serialize and deserialize a DFA without /// dynamic memory allocation. /// /// ``` /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// // Create a 4KB buffer on the stack to store our serialized DFA. /// let mut buf = [0u8; 4 * (1<<10)]; /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_native_endian( &self, dst: &mut [u8], ) -> Result { self.write_to::(dst) } /// The implementation of the public `write_to` serialization methods, /// which is generic over endianness. fn write_to( &self, dst: &mut [u8], ) -> Result { let mut nw = 0; nw += wire::write_label(LABEL, &mut dst[nw..])?; nw += wire::write_endianness_check::(&mut dst[nw..])?; nw += wire::write_version::(VERSION, &mut dst[nw..])?; nw += { // Currently unused, intended for future flexibility E::write_u32(0, &mut dst[nw..]); size_of::() }; nw += self.flags.write_to::(&mut dst[nw..])?; nw += self.tt.write_to::(&mut dst[nw..])?; nw += self.st.write_to::(&mut dst[nw..])?; nw += self.special.write_to::(&mut dst[nw..])?; nw += self.quitset.write_to::(&mut dst[nw..])?; Ok(nw) } /// Return the total number of bytes required to serialize this DFA. /// /// This is useful for determining the size of the buffer required to pass /// to one of the serialization routines: /// /// * [`DFA::write_to_little_endian`] /// * [`DFA::write_to_big_endian`] /// * [`DFA::write_to_native_endian`] /// /// Passing a buffer smaller than the size returned by this method will /// result in a serialization error. /// /// # Example /// /// This example shows how to dynamically allocate enough room to serialize /// a sparse DFA. 
/// /// ``` /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input}; /// /// // Compile our original DFA. /// let original_dfa = DFA::new("foo[0-9]+")?; /// /// let mut buf = vec![0; original_dfa.write_to_len()]; /// let written = original_dfa.write_to_native_endian(&mut buf)?; /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0; /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?); /// # Ok::<(), Box>(()) /// ``` pub fn write_to_len(&self) -> usize { wire::write_label_len(LABEL) + wire::write_endianness_check_len() + wire::write_version_len() + size_of::() // unused, intended for future flexibility + self.flags.write_to_len() + self.tt.write_to_len() + self.st.write_to_len() + self.special.write_to_len() + self.quitset.write_to_len() } } impl<'a> DFA<&'a [u8]> { /// Safely deserialize a sparse DFA with a specific state identifier /// representation. Upon success, this returns both the deserialized DFA /// and the number of bytes read from the given slice. Namely, the contents /// of the slice beyond the DFA are not read. /// /// Deserializing a DFA using this routine will never allocate heap memory. /// For safety purposes, the DFA's transitions will be verified such that /// every transition points to a valid state. If this verification is too /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which /// will always execute in constant time. /// /// The bytes given must be generated by one of the serialization APIs /// of a `DFA` using a semver compatible release of this crate. Those /// include: /// /// * [`DFA::to_bytes_little_endian`] /// * [`DFA::to_bytes_big_endian`] /// * [`DFA::to_bytes_native_endian`] /// * [`DFA::write_to_little_endian`] /// * [`DFA::write_to_big_endian`] /// * [`DFA::write_to_native_endian`] /// /// The `to_bytes` methods allocate and return a `Vec` for you. 
/// The
    /// `write_to` methods do not allocate and write to an existing slice
    /// (which may be on the stack). Since deserialization always uses the
    /// native endianness of the target platform, the serialization API you use
    /// should match the endianness of the target platform. (It's often a good
    /// idea to generate serialized DFAs for both forms of endianness and then
    /// load the correct one based on endianness.)
    ///
    /// # Errors
    ///
    /// Generally speaking, it's easier to state the conditions in which an
    /// error is _not_ returned. All of the following must be true:
    ///
    /// * The bytes given must be produced by one of the serialization APIs
    ///   on this DFA, as mentioned above.
    /// * The endianness of the target platform matches the endianness used to
    ///   serialize the provided DFA.
    ///
    /// If any of the above are not true, then an error will be returned.
    ///
    /// Note that unlike deserializing a [`dense::DFA`], deserializing a sparse
    /// DFA has no alignment requirements. That is, an alignment of `1` is
    /// valid.
    ///
    /// # Panics
    ///
    /// This routine will never panic for any input.
    ///
    /// # Example
    ///
    /// This example shows how to serialize a DFA to raw bytes, deserialize it
    /// and then use it for searching.
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// let bytes = initial.to_bytes_native_endian();
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: loading a DFA from static memory
    ///
    /// One use case this library supports is the ability to serialize a
    /// DFA to disk and then use `include_bytes!` to store it in a compiled
    /// Rust program. Those bytes can then be cheaply deserialized into a
    /// `DFA` structure at runtime and used for searching without having to
    /// re-compile the DFA (which can be quite costly).
    ///
    /// We can show this in two parts. The first part is serializing the DFA to
    /// a file:
    ///
    /// ```no_run
    /// use regex_automata::dfa::sparse::DFA;
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    ///
    /// // Write a big endian serialized version of this DFA to a file.
    /// let bytes = dfa.to_bytes_big_endian();
    /// std::fs::write("foo.bigendian.dfa", &bytes)?;
    ///
    /// // Do it again, but this time for little endian.
    /// let bytes = dfa.to_bytes_little_endian();
    /// std::fs::write("foo.littleendian.dfa", &bytes)?;
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And now the second part is embedding the DFA into the compiled program
    /// and deserializing it at runtime on first use. We use conditional
    /// compilation to choose the correct endianness. We do not need to employ
    /// any special tricks to ensure a proper alignment, since a sparse DFA has
    /// no alignment requirements.
    ///
    /// ```no_run
    /// use regex_automata::{
    ///     dfa::{Automaton, sparse::DFA},
    ///     util::lazy::Lazy,
    ///     HalfMatch, Input,
    /// };
    ///
    /// // This crate provides its own "lazy" type, kind of like
    /// // lazy_static! or once_cell::sync::Lazy. But it works in no-alloc
    /// // no-std environments and lets us write this using completely
    /// // safe code.
    /// static RE: Lazy<DFA<&'static [u8]>> = Lazy::new(|| {
    /// # const _: &str = stringify! {
    ///     #[cfg(target_endian = "big")]
    ///     static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
    ///     #[cfg(target_endian = "little")]
    ///     static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
    /// # };
    /// # static BYTES: &[u8] = b"";
    ///
    ///     let (dfa, _) = DFA::from_bytes(BYTES)
    ///         .expect("serialized DFA should be valid");
    ///     dfa
    /// });
    ///
    /// let expected = Ok(Some(HalfMatch::must(0, 8)));
    /// assert_eq!(expected, RE.try_search_fwd(&Input::new("foo12345")));
    /// ```
    ///
    /// Alternatively, consider using
    /// [`lazy_static`](https://crates.io/crates/lazy_static)
    /// or
    /// [`once_cell`](https://crates.io/crates/once_cell),
    /// which will guarantee safety for you.
    pub fn from_bytes(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
        // SAFETY: This is safe because we validate both the sparse transitions
        // (by trying to decode every state) and start state ID list below. If
        // either validation fails, then we return an error.
        let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
        let seen = dfa.tt.validate(&dfa.special)?;
        dfa.st.validate(&dfa.special, &seen)?;
        // N.B. dfa.special doesn't have a way to do unchecked deserialization,
        // so it has already been validated.
        Ok((dfa, nread))
    }

    /// Deserialize a DFA with a specific state identifier representation in
    /// constant time by omitting the verification of the validity of the
    /// sparse transitions.
    ///
    /// This is just like [`DFA::from_bytes`], except it can potentially return
    /// a DFA that exhibits undefined behavior if its transitions contain
    /// invalid state identifiers.
    ///
    /// This routine is useful if you need to deserialize a DFA cheaply and
    /// cannot afford the transition validation performed by `from_bytes`.
    ///
    /// # Safety
    ///
    /// This routine is not safe because it permits callers to provide
    /// arbitrary transitions with possibly incorrect state identifiers. While
    /// the various serialization routines will never return an incorrect
    /// DFA, there is no guarantee that the bytes provided here are correct.
    /// While `from_bytes_unchecked` will still do several forms of basic
    /// validation, this routine does not check that the transitions themselves
    /// are correct. Given an incorrect transition table, it is possible for
    /// the search routines to access out-of-bounds memory because of explicit
    /// bounds check elision.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{dfa::{Automaton, sparse::DFA}, HalfMatch, Input};
    ///
    /// let initial = DFA::new("foo[0-9]+")?;
    /// let bytes = initial.to_bytes_native_endian();
    /// // SAFETY: This is guaranteed to be safe since the bytes given come
    /// // directly from a compatible serialization routine.
    /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
    ///
    /// let expected = Some(HalfMatch::must(0, 8));
    /// assert_eq!(expected, dfa.try_search_fwd(&Input::new("foo12345"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub unsafe fn from_bytes_unchecked(
        slice: &'a [u8],
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
        // `nr` tracks how many bytes of `slice` have been consumed so far.
        // The read order here mirrors the write order used by serialization.
        let mut nr = 0;

        nr += wire::read_label(&slice[nr..], LABEL)?;
        nr += wire::read_endianness_check(&slice[nr..])?;
        nr += wire::read_version(&slice[nr..], VERSION)?;

        // Reserved field: read (to verify it is present) and discard.
        let _unused = wire::try_read_u32(&slice[nr..], "unused space")?;
        nr += size_of::<u32>();

        let (flags, nread) = Flags::from_bytes(&slice[nr..])?;
        nr += nread;

        let (tt, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (st, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
        nr += nread;

        let (special, nread) = Special::from_bytes(&slice[nr..])?;
        nr += nread;
        // State IDs are used as offsets into the sparse transition bytes, so
        // the largest special ID must lie inside them.
        if special.max.as_usize() >= tt.sparse().len() {
            return Err(DeserializeError::generic(
                "max should not be greater than or equal to sparse bytes",
            ));
        }

        let (quitset, nread) = ByteSet::from_bytes(&slice[nr..])?;
        nr += nread;

        // Prefilters don't support serialization, so they're always absent.
        let pre = None;
        Ok((DFA { tt, st, special, pre, quitset, flags }, nr))
    }
}

impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
    /// Writes every state, then the start states grouped by anchored mode,
    /// then the state length, pattern length and flags.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "sparse::DFA(")?;
        for state in self.tt.states() {
            fmt_state_indicator(f, self, state.id())?;
            writeln!(f, "{:06?}: {:?}", state.id().as_usize(), state)?;
        }
        writeln!(f)?;
        for (i, (start_id, anchored, sty)) in self.st.iter().enumerate() {
            // A new group header is printed every `stride` start states.
            if i % self.st.stride == 0 {
                match anchored {
                    Anchored::No => writeln!(f, "START-GROUP(unanchored)")?,
                    Anchored::Yes => writeln!(f, "START-GROUP(anchored)")?,
                    Anchored::Pattern(pid) => writeln!(
                        f,
                        "START-GROUP(pattern: {:?})",
                        pid.as_usize()
                    )?,
                }
            }
            writeln!(f, " {:?} => {:06?}", sty, start_id.as_usize())?;
        }
        writeln!(f, "state length: {:?}", self.tt.state_len)?;
        writeln!(f, "pattern length: {:?}", self.pattern_len())?;
        writeln!(f, "flags: {:?}", self.flags)?;
        writeln!(f, ")")?;
        Ok(())
    }
}

// SAFETY: We assert that our implementation of each method is correct.
unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
    #[inline]
    fn is_special_state(&self, id: StateID) -> bool {
        self.special.is_special_state(id)
    }

    #[inline]
    fn is_dead_state(&self, id: StateID) -> bool {
        self.special.is_dead_state(id)
    }

    #[inline]
    fn is_quit_state(&self, id: StateID) -> bool {
        self.special.is_quit_state(id)
    }

    #[inline]
    fn is_match_state(&self, id: StateID) -> bool {
        self.special.is_match_state(id)
    }

    #[inline]
    fn is_start_state(&self, id: StateID) -> bool {
        self.special.is_start_state(id)
    }

    #[inline]
    fn is_accel_state(&self, id: StateID) -> bool {
        self.special.is_accel_state(id)
    }

    // This is marked as inline to help dramatically boost sparse searching,
    // which decodes each state it enters to follow the next transition.
#[cfg_attr(feature = "perf-inline", inline(always))] fn next_state(&self, current: StateID, input: u8) -> StateID { let input = self.tt.classes.get(input); self.tt.state(current).next(input) } #[inline] unsafe fn next_state_unchecked( &self, current: StateID, input: u8, ) -> StateID { self.next_state(current, input) } #[inline] fn next_eoi_state(&self, current: StateID) -> StateID { self.tt.state(current).next_eoi() } #[inline] fn pattern_len(&self) -> usize { self.tt.pattern_len } #[inline] fn match_len(&self, id: StateID) -> usize { self.tt.state(id).pattern_len() } #[inline] fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID { // This is an optimization for the very common case of a DFA with a // single pattern. This conditional avoids a somewhat more costly path // that finds the pattern ID from the state machine, which requires // a bit of slicing/pointer-chasing. This optimization tends to only // matter when matches are frequent. if self.tt.pattern_len == 1 { return PatternID::ZERO; } self.tt.state(id).pattern_id(match_index) } #[inline] fn has_empty(&self) -> bool { self.flags.has_empty } #[inline] fn is_utf8(&self) -> bool { self.flags.is_utf8 } #[inline] fn is_always_start_anchored(&self) -> bool { self.flags.is_always_start_anchored } #[inline] fn start_state( &self, config: &start::Config, ) -> Result { let anchored = config.get_anchored(); let start = match config.get_look_behind() { None => Start::Text, Some(byte) => { if !self.quitset.is_empty() && self.quitset.contains(byte) { return Err(StartError::quit(byte)); } self.st.start_map.get(byte) } }; self.st.start(anchored, start) } #[inline] fn universal_start_state(&self, mode: Anchored) -> Option { match mode { Anchored::No => self.st.universal_start_unanchored, Anchored::Yes => self.st.universal_start_anchored, Anchored::Pattern(_) => None, } } #[inline] fn accelerator(&self, id: StateID) -> &[u8] { self.tt.state(id).accelerator() } #[inline] fn get_prefilter(&self) -> 
Option<&Prefilter> { self.pre.as_ref() } } /// The transition table portion of a sparse DFA. /// /// The transition table is the core part of the DFA in that it describes how /// to move from one state to another based on the input sequence observed. /// /// Unlike a typical dense table based DFA, states in a sparse transition /// table have variable size. That is, states with more transitions use more /// space than states with fewer transitions. This means that finding the next /// transition takes more work than with a dense DFA, but also typically uses /// much less space. #[derive(Clone)] struct Transitions { /// The raw encoding of each state in this DFA. /// /// Each state has the following information: /// /// * A set of transitions to subsequent states. Transitions to the dead /// state are omitted. /// * If the state can be accelerated, then any additional accelerator /// information. /// * If the state is a match state, then the state contains all pattern /// IDs that match when in that state. /// /// To decode a state, use Transitions::state. /// /// In practice, T is either Vec or &[u8]. sparse: T, /// A set of equivalence classes, where a single equivalence class /// represents a set of bytes that never discriminate between a match /// and a non-match in the DFA. Each equivalence class corresponds to a /// single character in this DFA's alphabet, where the maximum number of /// characters is 257 (each possible value of a byte plus the special /// EOI transition). Consequently, the number of equivalence classes /// corresponds to the number of transitions for each DFA state. Note /// though that the *space* used by each DFA state in the transition table /// may be larger. The total space used by each DFA state is known as the /// stride and is documented above. /// /// The only time the number of equivalence classes is fewer than 257 is /// if the DFA's kind uses byte classes which is the default. 
Equivalence /// classes should generally only be disabled when debugging, so that /// the transitions themselves aren't obscured. Disabling them has no /// other benefit, since the equivalence class map is always used while /// searching. In the vast majority of cases, the number of equivalence /// classes is substantially smaller than 257, particularly when large /// Unicode classes aren't used. /// /// N.B. Equivalence classes aren't particularly useful in a sparse DFA /// in the current implementation, since equivalence classes generally tend /// to correspond to continuous ranges of bytes that map to the same /// transition. So in a sparse DFA, equivalence classes don't really lead /// to a space savings. In the future, it would be good to try and remove /// them from sparse DFAs entirely, but requires a bit of work since sparse /// DFAs are built from dense DFAs, which are in turn built on top of /// equivalence classes. classes: ByteClasses, /// The total number of states in this DFA. Note that a DFA always has at /// least one state---the dead state---even the empty DFA. In particular, /// the dead state always has ID 0 and is correspondingly always the first /// state. The dead state is never a match state. state_len: usize, /// The total number of unique patterns represented by these match states. 
pattern_len: usize, } impl<'a> Transitions<&'a [u8]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> { let slice_start = slice.as_ptr().as_usize(); let (state_len, nr) = wire::try_read_u32_as_usize(&slice, "state length")?; slice = &slice[nr..]; let (pattern_len, nr) = wire::try_read_u32_as_usize(&slice, "pattern length")?; slice = &slice[nr..]; let (classes, nr) = ByteClasses::from_bytes(&slice)?; slice = &slice[nr..]; let (len, nr) = wire::try_read_u32_as_usize(&slice, "sparse transitions length")?; slice = &slice[nr..]; wire::check_slice_len(slice, len, "sparse states byte length")?; let sparse = &slice[..len]; slice = &slice[len..]; let trans = Transitions { sparse, classes, state_len, pattern_len }; Ok((trans, slice.as_ptr().as_usize() - slice_start)) } } impl> Transitions { /// Writes a serialized form of this transition table to the buffer given. /// If the buffer is too small, then an error is returned. To determine /// how big the buffer must be, use `write_to_len`. fn write_to( &self, mut dst: &mut [u8], ) -> Result { let nwrite = self.write_to_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small( "sparse transition table", )); } dst = &mut dst[..nwrite]; // write state length E::write_u32(u32::try_from(self.state_len).unwrap(), dst); dst = &mut dst[size_of::()..]; // write pattern length E::write_u32(u32::try_from(self.pattern_len).unwrap(), dst); dst = &mut dst[size_of::()..]; // write byte class map let n = self.classes.write_to(dst)?; dst = &mut dst[n..]; // write number of bytes in sparse transitions E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst); dst = &mut dst[size_of::()..]; // write actual transitions let mut id = DEAD; while id.as_usize() < self.sparse().len() { let state = self.state(id); let n = state.write_to::(&mut dst)?; dst = &mut dst[n..]; // The next ID is the offset immediately following `state`. 
id = StateID::new(id.as_usize() + state.write_to_len()).unwrap(); } Ok(nwrite) } /// Returns the number of bytes the serialized form of this transition /// table will use. fn write_to_len(&self) -> usize { size_of::() // state length + size_of::() // pattern length + self.classes.write_to_len() + size_of::() // sparse transitions length + self.sparse().len() } /// Validates that every state ID in this transition table is valid. /// /// That is, every state ID can be used to correctly index a state in this /// table. fn validate(&self, sp: &Special) -> Result { let mut verified = Seen::new(); // We need to make sure that we decode the correct number of states. // Otherwise, an empty set of transitions would validate even if the // recorded state length is non-empty. let mut len = 0; // We can't use the self.states() iterator because it assumes the state // encodings are valid. It could panic if they aren't. let mut id = DEAD; while id.as_usize() < self.sparse().len() { // Before we even decode the state, we check that the ID itself // is well formed. That is, if it's a special state then it must // actually be a quit, dead, accel, match or start state. if sp.is_special_state(id) { let is_actually_special = sp.is_dead_state(id) || sp.is_quit_state(id) || sp.is_match_state(id) || sp.is_start_state(id) || sp.is_accel_state(id); if !is_actually_special { // This is kind of a cryptic error message... return Err(DeserializeError::generic( "found sparse state tagged as special but \ wasn't actually special", )); } } let state = self.try_state(sp, id)?; verified.insert(id); // The next ID should be the offset immediately following `state`. id = StateID::new(wire::add( id.as_usize(), state.write_to_len(), "next state ID offset", )?) 
.map_err(|err| { DeserializeError::state_id_error(err, "next state ID offset") })?; len += 1; } // Now that we've checked that all top-level states are correct and // importantly, collected a set of valid state IDs, we have all the // information we need to check that all transitions are correct too. // // Note that we can't use `valid_ids` to iterate because it will // be empty in no-std no-alloc contexts. (And yes, that means our // verification isn't quite as good.) We can use `self.states()` // though at least, since we know that all states can at least be // decoded and traversed correctly. for state in self.states() { // Check that all transitions in this state are correct. for i in 0..state.ntrans { let to = state.next_at(i); // For no-alloc, we just check that the state can decode. It is // technically possible that the state ID could still point to // a non-existent state even if it decodes (fuzzing proved this // to be true), but it shouldn't result in any memory unsafety // or panics in non-debug mode. #[cfg(not(feature = "alloc"))] { let _ = self.try_state(sp, to)?; } #[cfg(feature = "alloc")] { if !verified.contains(&to) { return Err(DeserializeError::generic( "found transition that points to a \ non-existent state", )); } } } } if len != self.state_len { return Err(DeserializeError::generic( "mismatching sparse state length", )); } Ok(verified) } /// Converts these transitions to a borrowed value. fn as_ref(&self) -> Transitions<&'_ [u8]> { Transitions { sparse: self.sparse(), classes: self.classes.clone(), state_len: self.state_len, pattern_len: self.pattern_len, } } /// Converts these transitions to an owned value. #[cfg(feature = "alloc")] fn to_owned(&self) -> Transitions> { Transitions { sparse: self.sparse().to_vec(), classes: self.classes.clone(), state_len: self.state_len, pattern_len: self.pattern_len, } } /// Return a convenient representation of the given state. /// /// This panics if the state is invalid. 
///
    /// This is marked as inline to help dramatically boost sparse searching,
    /// which decodes each state it enters to follow the next transition. Other
    /// functions involved are also inlined, which should hopefully eliminate
    /// a lot of the extraneous decoding that is never needed just to follow
    /// the next transition.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn state(&self, id: StateID) -> State<'_> {
        // Encoded layout, in order:
        //   u16 header (high bit = match flag, low 15 bits = #transitions)
        //   `ntrans` inclusive byte-range pairs (2 bytes each)
        //   `ntrans` state IDs
        //   if match: u32 pattern count, then that many 4-byte pattern IDs
        //   u8 accelerator length, then that many accelerator bytes
        let mut state = &self.sparse()[id.as_usize()..];
        let mut ntrans = wire::read_u16(&state).as_usize();
        let is_match = (1 << 15) & ntrans != 0;
        ntrans &= !(1 << 15);
        state = &state[2..];

        let (input_ranges, state) = state.split_at(ntrans * 2);
        let (next, state) = state.split_at(ntrans * StateID::SIZE);
        let (pattern_ids, state) = if is_match {
            let npats = wire::read_u32(&state).as_usize();
            state[4..].split_at(npats * 4)
        } else {
            (&[][..], state)
        };

        let accel_len = usize::from(state[0]);
        let accel = &state[1..accel_len + 1];
        State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
    }

    /// Like `state`, but will return an error if the state encoding is
    /// invalid. This is useful for verifying states after deserialization,
    /// which is required for a safe deserialization API.
    ///
    /// Note that this only verifies that this state is decodable and that
    /// all of its data is consistent. It does not verify that its state ID
    /// transitions point to valid states themselves, nor does it verify that
    /// every pattern ID is valid.
    fn try_state(
        &self,
        sp: &Special,
        id: StateID,
    ) -> Result<State<'_>, DeserializeError> {
        // `id == len` is let through here; the empty slice it produces is
        // rejected by the header read just below.
        if id.as_usize() > self.sparse().len() {
            return Err(DeserializeError::generic(
                "invalid caller provided sparse state ID",
            ));
        }
        let mut state = &self.sparse()[id.as_usize()..];
        // Encoding format starts with a u16 that stores the total number of
        // transitions in this state.
        let (mut ntrans, _) =
            wire::try_read_u16_as_usize(state, "state transition length")?;
        let is_match = ((1 << 15) & ntrans) != 0;
        ntrans &= !(1 << 15);
        state = &state[2..];
        if ntrans > 257 || ntrans == 0 {
            return Err(DeserializeError::generic(
                "invalid transition length",
            ));
        }
        if is_match && !sp.is_match_state(id) {
            return Err(DeserializeError::generic(
                "state marked as match but not in match ID range",
            ));
        } else if !is_match && sp.is_match_state(id) {
            return Err(DeserializeError::generic(
                "state in match ID range but not marked as match state",
            ));
        }

        // Each transition has two pieces: an inclusive range of bytes on which
        // it is defined, and the state ID that those bytes transition to. The
        // pairs come first, followed by a corresponding sequence of state IDs.
        let input_ranges_len = ntrans.checked_mul(2).unwrap();
        wire::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
        let (input_ranges, state) = state.split_at(input_ranges_len);
        // Every range should be of the form A-B, where A<=B.
        for pair in input_ranges.chunks(2) {
            let (start, end) = (pair[0], pair[1]);
            if start > end {
                return Err(DeserializeError::generic("invalid input range"));
            }
        }

        // And now extract the corresponding sequence of state IDs. We leave
        // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
        // not have any alignment requirements.
        let next_len = ntrans
            .checked_mul(self.id_len())
            .expect("state size * #trans should always fit in a usize");
        wire::check_slice_len(state, next_len, "sparse trans state IDs")?;
        let (next, state) = state.split_at(next_len);
        // We can at least verify that every state ID is in bounds.
        for idbytes in next.chunks(self.id_len()) {
            let (id, _) =
                wire::read_state_id(idbytes, "sparse state ID in try_state")?;
            wire::check_slice_len(
                self.sparse(),
                id.as_usize(),
                "invalid sparse state ID",
            )?;
        }

        // If this is a match state, then read the pattern IDs for this state.
        // Pattern IDs is a u32-length prefixed sequence of native endian
        // encoded 32-bit integers.
        let (pattern_ids, state) = if is_match {
            let (npats, nr) =
                wire::try_read_u32_as_usize(state, "pattern ID length")?;
            let state = &state[nr..];
            if npats == 0 {
                return Err(DeserializeError::generic(
                    "state marked as a match, but pattern length is zero",
                ));
            }

            let pattern_ids_len =
                wire::mul(npats, 4, "sparse pattern ID byte length")?;
            wire::check_slice_len(
                state,
                pattern_ids_len,
                "sparse pattern IDs",
            )?;
            let (pattern_ids, state) = state.split_at(pattern_ids_len);
            for patbytes in pattern_ids.chunks(PatternID::SIZE) {
                wire::read_pattern_id(
                    patbytes,
                    "sparse pattern ID in try_state",
                )?;
            }
            (pattern_ids, state)
        } else {
            (&[][..], state)
        };
        // The next three checks overlap with ones made above; they are kept
        // as cheap defense in depth for the deserialization path.
        if is_match && pattern_ids.is_empty() {
            return Err(DeserializeError::generic(
                "state marked as a match, but has no pattern IDs",
            ));
        }
        if sp.is_match_state(id) && pattern_ids.is_empty() {
            return Err(DeserializeError::generic(
                "state marked special as a match, but has no pattern IDs",
            ));
        }
        if sp.is_match_state(id) != is_match {
            return Err(DeserializeError::generic(
                "whether state is a match or not is inconsistent",
            ));
        }

        // Now read this state's accelerator info. The first byte is the length
        // of the accelerator, which is typically 0 (for no acceleration) but
        // is no bigger than 3. The length indicates the number of bytes that
        // follow, where each byte corresponds to a transition out of this
        // state.
        if state.is_empty() {
            return Err(DeserializeError::generic("no accelerator length"));
        }
        let (accel_len, state) = (usize::from(state[0]), &state[1..]);

        if accel_len > 3 {
            return Err(DeserializeError::generic(
                "sparse invalid accelerator length",
            ));
        } else if accel_len == 0 && sp.is_accel_state(id) {
            return Err(DeserializeError::generic(
                "got no accelerators in state, but in accelerator ID range",
            ));
        } else if accel_len > 0 && !sp.is_accel_state(id) {
            // The state carries accelerator bytes even though its ID lies
            // outside the accelerated range.
            return Err(DeserializeError::generic(
                "state has accelerators, but is not in accelerator ID range",
            ));
        }

        wire::check_slice_len(
            state,
            accel_len,
            "sparse corrupt accelerator length",
        )?;
        let accel = &state[..accel_len];

        let state = State {
            id,
            is_match,
            ntrans,
            input_ranges,
            next,
            pattern_ids,
            accel,
        };
        // The last transition is the special EOI transition (ntrans >= 1 was
        // checked above, so the subtraction cannot underflow).
        if sp.is_quit_state(state.next_at(state.ntrans - 1)) {
            return Err(DeserializeError::generic(
                "state with EOI transition to quit state is illegal",
            ));
        }
        Ok(state)
    }

    /// Return an iterator over all of the states in this DFA.
    ///
    /// The iterator yields each decoded state in encoding order, starting
    /// at the dead state. Every yielded state carries its own ID.
    fn states(&self) -> StateIter<'_, T> {
        StateIter { trans: self, id: DEAD.as_usize() }
    }

    /// Returns the sparse transitions as raw bytes.
    fn sparse(&self) -> &[u8] {
        self.sparse.as_ref()
    }

    /// Returns the number of bytes represented by a single state ID.
    fn id_len(&self) -> usize {
        StateID::SIZE
    }

    /// Return the memory usage, in bytes, of these transitions.
    ///
    /// This does not include the size of a `Transitions` value itself.
    fn memory_usage(&self) -> usize {
        self.sparse().len()
    }
}

#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> Transitions<T> {
    /// Return a convenient mutable representation of the given state.
    /// This panics if the state is invalid.
fn state_mut(&mut self, id: StateID) -> StateMut<'_> { let mut state = &mut self.sparse_mut()[id.as_usize()..]; let mut ntrans = wire::read_u16(&state).as_usize(); let is_match = (1 << 15) & ntrans != 0; ntrans &= !(1 << 15); state = &mut state[2..]; let (input_ranges, state) = state.split_at_mut(ntrans * 2); let (next, state) = state.split_at_mut(ntrans * StateID::SIZE); let (pattern_ids, state) = if is_match { let npats = wire::read_u32(&state).as_usize(); state[4..].split_at_mut(npats * 4) } else { (&mut [][..], state) }; let accel_len = usize::from(state[0]); let accel = &mut state[1..accel_len + 1]; StateMut { id, is_match, ntrans, input_ranges, next, pattern_ids, accel, } } /// Returns the sparse transitions as raw mutable bytes. fn sparse_mut(&mut self) -> &mut [u8] { self.sparse.as_mut() } } /// The set of all possible starting states in a DFA. /// /// See the eponymous type in the `dense` module for more details. This type /// is very similar to `dense::StartTable`, except that its underlying /// representation is `&[u8]` instead of `&[S]`. (The latter would require /// sparse DFAs to be aligned, which is explicitly something we do not require /// because we don't really need it.) #[derive(Clone)] struct StartTable { /// The initial start state IDs as a contiguous table of native endian /// encoded integers, represented by `S`. /// /// In practice, T is either Vec or &[u8] and has no alignment /// requirements. /// /// The first `2 * stride` (currently always 8) entries always correspond /// to the starts states for the entire DFA, with the first 4 entries being /// for unanchored searches and the second 4 entries being for anchored /// searches. To keep things simple, we always use 8 entries even if the /// `StartKind` is not both. /// /// After that, there are `stride * patterns` state IDs, where `patterns` /// may be zero in the case of a DFA with no patterns or in the case where /// the DFA was built without enabling starting states for each pattern. 
table: T, /// The starting state configuration supported. When 'both', both /// unanchored and anchored searches work. When 'unanchored', anchored /// searches panic. When 'anchored', unanchored searches panic. kind: StartKind, /// The start state configuration for every possible byte. start_map: StartByteMap, /// The number of starting state IDs per pattern. stride: usize, /// The total number of patterns for which starting states are encoded. /// This is `None` for DFAs that were built without start states for each /// pattern. Thus, one cannot use this field to say how many patterns /// are in the DFA in all cases. It is specific to how many patterns are /// represented in this start table. pattern_len: Option, /// The universal starting state for unanchored searches. This is only /// present when the DFA supports unanchored searches and when all starting /// state IDs for an unanchored search are equivalent. universal_start_unanchored: Option, /// The universal starting state for anchored searches. This is only /// present when the DFA supports anchored searches and when all starting /// state IDs for an anchored search are equivalent. universal_start_anchored: Option, } #[cfg(feature = "dfa-build")] impl StartTable> { fn new>( dfa: &dense::DFA, pattern_len: Option, ) -> StartTable> { let stride = Start::len(); // This is OK since the only way we're here is if a dense DFA could be // constructed successfully, which uses the same space. 
let len = stride .checked_mul(pattern_len.unwrap_or(0)) .unwrap() .checked_add(stride.checked_mul(2).unwrap()) .unwrap() .checked_mul(StateID::SIZE) .unwrap(); StartTable { table: vec![0; len], kind: dfa.start_kind(), start_map: dfa.start_map().clone(), stride, pattern_len, universal_start_unanchored: dfa .universal_start_state(Anchored::No), universal_start_anchored: dfa.universal_start_state(Anchored::Yes), } } fn from_dense_dfa>( dfa: &dense::DFA, remap: &[StateID], ) -> Result>, BuildError> { // Unless the DFA has start states compiled for each pattern, then // as far as the starting state table is concerned, there are zero // patterns to account for. It will instead only store starting states // for the entire DFA. let start_pattern_len = if dfa.starts_for_each_pattern() { Some(dfa.pattern_len()) } else { None }; let mut sl = StartTable::new(dfa, start_pattern_len); for (old_start_id, anchored, sty) in dfa.starts() { let new_start_id = remap[dfa.to_index(old_start_id)]; sl.set_start(anchored, sty, new_start_id); } Ok(sl) } } impl<'a> StartTable<&'a [u8]> { unsafe fn from_bytes_unchecked( mut slice: &'a [u8], ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> { let slice_start = slice.as_ptr().as_usize(); let (kind, nr) = StartKind::from_bytes(slice)?; slice = &slice[nr..]; let (start_map, nr) = StartByteMap::from_bytes(slice)?; slice = &slice[nr..]; let (stride, nr) = wire::try_read_u32_as_usize(slice, "sparse start table stride")?; slice = &slice[nr..]; if stride != Start::len() { return Err(DeserializeError::generic( "invalid sparse starting table stride", )); } let (maybe_pattern_len, nr) = wire::try_read_u32_as_usize(slice, "sparse start table patterns")?; slice = &slice[nr..]; let pattern_len = if maybe_pattern_len.as_u32() == u32::MAX { None } else { Some(maybe_pattern_len) }; if pattern_len.map_or(false, |len| len > PatternID::LIMIT) { return Err(DeserializeError::generic( "sparse invalid number of patterns", )); } let (universal_unanchored, 
nr) = wire::try_read_u32(slice, "universal unanchored start")?; slice = &slice[nr..]; let universal_start_unanchored = if universal_unanchored == u32::MAX { None } else { Some(StateID::try_from(universal_unanchored).map_err(|e| { DeserializeError::state_id_error( e, "universal unanchored start", ) })?) }; let (universal_anchored, nr) = wire::try_read_u32(slice, "universal anchored start")?; slice = &slice[nr..]; let universal_start_anchored = if universal_anchored == u32::MAX { None } else { Some(StateID::try_from(universal_anchored).map_err(|e| { DeserializeError::state_id_error(e, "universal anchored start") })?) }; let pattern_table_size = wire::mul( stride, pattern_len.unwrap_or(0), "sparse invalid pattern length", )?; // Our start states always start with a single stride of start states // for the entire automaton which permit it to match any pattern. What // follows it are an optional set of start states for each pattern. let start_state_len = wire::add( wire::mul(2, stride, "start state stride too big")?, pattern_table_size, "sparse invalid 'any' pattern starts size", )?; let table_bytes_len = wire::mul( start_state_len, StateID::SIZE, "sparse pattern table bytes length", )?; wire::check_slice_len( slice, table_bytes_len, "sparse start ID table", )?; let table = &slice[..table_bytes_len]; slice = &slice[table_bytes_len..]; let sl = StartTable { table, kind, start_map, stride, pattern_len, universal_start_unanchored, universal_start_anchored, }; Ok((sl, slice.as_ptr().as_usize() - slice_start)) } } impl> StartTable { fn write_to( &self, mut dst: &mut [u8], ) -> Result { let nwrite = self.write_to_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small( "sparse starting table ids", )); } dst = &mut dst[..nwrite]; // write start kind let nw = self.kind.write_to::(dst)?; dst = &mut dst[nw..]; // write start byte map let nw = self.start_map.write_to(dst)?; dst = &mut dst[nw..]; // write stride E::write_u32(u32::try_from(self.stride).unwrap(), 
dst); dst = &mut dst[size_of::()..]; // write pattern length E::write_u32( u32::try_from(self.pattern_len.unwrap_or(0xFFFF_FFFF)).unwrap(), dst, ); dst = &mut dst[size_of::()..]; // write universal start unanchored state id, u32::MAX if absent E::write_u32( self.universal_start_unanchored .map_or(u32::MAX, |sid| sid.as_u32()), dst, ); dst = &mut dst[size_of::()..]; // write universal start anchored state id, u32::MAX if absent E::write_u32( self.universal_start_anchored.map_or(u32::MAX, |sid| sid.as_u32()), dst, ); dst = &mut dst[size_of::()..]; // write start IDs for (sid, _, _) in self.iter() { E::write_u32(sid.as_u32(), dst); dst = &mut dst[StateID::SIZE..]; } Ok(nwrite) } /// Returns the number of bytes the serialized form of this transition /// table will use. fn write_to_len(&self) -> usize { self.kind.write_to_len() + self.start_map.write_to_len() + size_of::() // stride + size_of::() // # patterns + size_of::() // universal unanchored start + size_of::() // universal anchored start + self.table().len() } /// Validates that every starting state ID in this table is valid. /// /// That is, every starting state ID can be used to correctly decode a /// state in the DFA's sparse transitions. fn validate( &self, sp: &Special, seen: &Seen, ) -> Result<(), DeserializeError> { for (id, _, _) in self.iter() { if !seen.contains(&id) { return Err(DeserializeError::generic( "found invalid start state ID", )); } if sp.is_match_state(id) { return Err(DeserializeError::generic( "start states cannot be match states", )); } } Ok(()) } /// Converts this start list to a borrowed value. fn as_ref(&self) -> StartTable<&'_ [u8]> { StartTable { table: self.table(), kind: self.kind, start_map: self.start_map.clone(), stride: self.stride, pattern_len: self.pattern_len, universal_start_unanchored: self.universal_start_unanchored, universal_start_anchored: self.universal_start_anchored, } } /// Converts this start list to an owned value. 
#[cfg(feature = "alloc")] fn to_owned(&self) -> StartTable> { StartTable { table: self.table().to_vec(), kind: self.kind, start_map: self.start_map.clone(), stride: self.stride, pattern_len: self.pattern_len, universal_start_unanchored: self.universal_start_unanchored, universal_start_anchored: self.universal_start_anchored, } } /// Return the start state for the given index and pattern ID. If the /// pattern ID is None, then the corresponding start state for the entire /// DFA is returned. If the pattern ID is not None, then the corresponding /// starting state for the given pattern is returned. If this start table /// does not have individual starting states for each pattern, then this /// panics. fn start( &self, anchored: Anchored, start: Start, ) -> Result { let start_index = start.as_usize(); let index = match anchored { Anchored::No => { if !self.kind.has_unanchored() { return Err(StartError::unsupported_anchored(anchored)); } start_index } Anchored::Yes => { if !self.kind.has_anchored() { return Err(StartError::unsupported_anchored(anchored)); } self.stride + start_index } Anchored::Pattern(pid) => { let len = match self.pattern_len { None => { return Err(StartError::unsupported_anchored(anchored)) } Some(len) => len, }; if pid.as_usize() >= len { return Ok(DEAD); } (2 * self.stride) + (self.stride * pid.as_usize()) + start_index } }; let start = index * StateID::SIZE; // This OK since we're allowed to assume that the start table contains // valid StateIDs. Ok(wire::read_state_id_unchecked(&self.table()[start..]).0) } /// Return an iterator over all start IDs in this table. fn iter(&self) -> StartStateIter<'_, T> { StartStateIter { st: self, i: 0 } } /// Returns the total number of start state IDs in this table. fn len(&self) -> usize { self.table().len() / StateID::SIZE } /// Returns the table as a raw slice of bytes. fn table(&self) -> &[u8] { self.table.as_ref() } /// Return the memory usage, in bytes, of this start list. 
///
    /// This does not include the size of a `StartTable` value itself.
    fn memory_usage(&self) -> usize {
        self.table().len()
    }
}

#[cfg(feature = "dfa-build")]
impl<T: AsMut<[u8]>> StartTable<T> {
    /// Set the start state for the given index and pattern.
    ///
    /// If the pattern ID or state ID are not valid, then this will panic.
    fn set_start(&mut self, anchored: Anchored, start: Start, id: StateID) {
        let start_index = start.as_usize();
        let index = match anchored {
            Anchored::No => start_index,
            Anchored::Yes => self.stride + start_index,
            Anchored::Pattern(pid) => {
                let pid = pid.as_usize();
                let len = self
                    .pattern_len
                    .expect("start states for each pattern enabled");
                assert!(pid < len, "invalid pattern ID {:?}", pid);
                self.stride
                    .checked_mul(pid)
                    .unwrap()
                    .checked_add(self.stride.checked_mul(2).unwrap())
                    .unwrap()
                    .checked_add(start_index)
                    .unwrap()
            }
        };
        let start = index * StateID::SIZE;
        let end = start + StateID::SIZE;
        // NOTE(review): native endianness is assumed here because the start
        // table is read back with `StateID::from_ne_bytes_unchecked`.
        wire::write_state_id::<wire::NE>(
            id,
            &mut self.table.as_mut()[start..end],
        );
    }
}

/// An iterator over all start state IDs in a sparse DFA.
struct StartStateIter<'a, T> {
    st: &'a StartTable<T>,
    i: usize,
}

impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
    type Item = (StateID, Anchored, Start);

    fn next(&mut self) -> Option<(StateID, Anchored, Start)> {
        let i = self.i;
        if i >= self.st.len() {
            return None;
        }
        self.i += 1;

        // This unwrap is okay since the stride of any DFA must always match
        // the number of start state types.
        let start_type = Start::from_usize(i % self.st.stride).unwrap();
        // The first stride is unanchored, the second anchored, and every
        // stride after that belongs to one pattern.
        let anchored = if i < self.st.stride {
            Anchored::No
        } else if i < (2 * self.st.stride) {
            Anchored::Yes
        } else {
            let pid = (i - (2 * self.st.stride)) / self.st.stride;
            Anchored::Pattern(PatternID::new(pid).unwrap())
        };
        let start = i * StateID::SIZE;
        let end = start + StateID::SIZE;
        let bytes = self.st.table()[start..end].try_into().unwrap();
        // This is OK since we're allowed to assume that any IDs in this start
        // table are correct and valid for this DFA.
        let id = StateID::from_ne_bytes_unchecked(bytes);
        Some((id, anchored, start_type))
    }
}

impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("StartStateIter").field("i", &self.i).finish()
    }
}

/// An iterator over all states in a sparse DFA.
///
/// This iterator yields each decoded state in encoding order. The state's
/// identifier is available from the yielded state itself.
struct StateIter<'a, T> {
    trans: &'a Transitions<T>,
    id: usize,
}

impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
    type Item = State<'a>;

    fn next(&mut self) -> Option<State<'a>> {
        if self.id >= self.trans.sparse().len() {
            return None;
        }
        let state = self.trans.state(StateID::new_unchecked(self.id));
        // The next state begins immediately after this state's encoding.
        self.id = self.id + state.write_to_len();
        Some(state)
    }
}

impl<'a, T> fmt::Debug for StateIter<'a, T> {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        f.debug_struct("StateIter").field("id", &self.id).finish()
    }
}

/// A representation of a sparse DFA state that can be cheaply materialized
/// from a state identifier.
#[derive(Clone)]
struct State<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
    /// bytes in this slice.
    next: &'a [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a [u8],
}

impl<'a> State<'a> {
    /// Searches for the next transition given an input byte. If no such
    /// transition could be found, then a dead state is returned.
    ///
    /// This is marked as inline to help dramatically boost sparse searching,
    /// which decodes each state it enters to follow the next transition.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn next(&self, input: u8) -> StateID {
        // This straight linear search was observed to be much better than
        // binary search on ASCII haystacks, likely because a binary search
        // visits the ASCII case last but a linear search sees it first. A
        // binary search does do a little better on non-ASCII haystacks, but
        // not by much. There might be a better trade off lurking here.
        //
        // The final transition is the special EOI transition, so it is
        // excluded from the byte search.
        for i in 0..(self.ntrans - 1) {
            let (start, end) = self.range(i);
            if start <= input && input <= end {
                return self.next_at(i);
            }
            // We could bail early with an extra branch: if input < start,
            // then we know we'll never find a matching transition.
            // Interestingly, this extra branch seems to not help performance,
            // or will even hurt it. It's likely very dependent on the DFA
            // itself and what is being searched.
        }
        DEAD
    }

    /// Returns the next state ID for the special EOI transition.
    fn next_eoi(&self) -> StateID {
        self.next_at(self.ntrans - 1)
    }

    /// Returns the identifier for this state.
    fn id(&self) -> StateID {
        self.id
    }

    /// Returns the inclusive input byte range for the ith transition in this
    /// state.
    fn range(&self, i: usize) -> (u8, u8) {
        (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
    }

    /// Returns the next state for the ith transition in this state.
fn next_at(&self, i: usize) -> StateID {
        let start = i * StateID::SIZE;
        let end = start + StateID::SIZE;
        let bytes = self.next[start..end].try_into().unwrap();
        StateID::from_ne_bytes_unchecked(bytes)
    }

    /// Returns the pattern ID for the given match index. If the match index
    /// is invalid, then this panics.
    fn pattern_id(&self, match_index: usize) -> PatternID {
        let start = match_index * PatternID::SIZE;
        wire::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
    }

    /// Returns the total number of pattern IDs for this state. This is always
    /// zero when `is_match` is false.
    fn pattern_len(&self) -> usize {
        assert_eq!(0, self.pattern_ids.len() % 4);
        self.pattern_ids.len() / 4
    }

    /// Return an accelerator for this state.
    fn accelerator(&self) -> &'a [u8] {
        self.accel
    }

    /// Write the raw representation of this state to the given buffer using
    /// the given endianness.
    ///
    /// On success, returns the number of bytes written, which is always
    /// `self.write_to_len()`. If `dst` is shorter than that, a "buffer too
    /// small" error is returned and nothing is written.
    fn write_to<E: Endian>(
        &self,
        mut dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small(
                "sparse state transitions",
            ));
        }

        // Header: transition count, with the high bit flagging a match state.
        let ntrans =
            if self.is_match { self.ntrans | (1 << 15) } else { self.ntrans };
        E::write_u16(u16::try_from(ntrans).unwrap(), dst);
        dst = &mut dst[size_of::<u16>()..];

        // Byte ranges are single bytes, so they are endian independent.
        dst[..self.input_ranges.len()].copy_from_slice(self.input_ranges);
        dst = &mut dst[self.input_ranges.len()..];

        for i in 0..self.ntrans {
            E::write_u32(self.next_at(i).as_u32(), dst);
            dst = &mut dst[StateID::SIZE..];
        }

        if self.is_match {
            E::write_u32(u32::try_from(self.pattern_len()).unwrap(), dst);
            dst = &mut dst[size_of::<u32>()..];
            for i in 0..self.pattern_len() {
                let pid = self.pattern_id(i);
                E::write_u32(pid.as_u32(), dst);
                dst = &mut dst[PatternID::SIZE..];
            }
        }

        // Accelerator: one length byte followed by the accelerator bytes.
        dst[0] = u8::try_from(self.accel.len()).unwrap();
        dst[1..][..self.accel.len()].copy_from_slice(self.accel);

        Ok(nwrite)
    }

    /// Return the total number of bytes that this state consumes in its
    /// encoded form.
fn write_to_len(&self) -> usize {
        // 2-byte header, then per transition a 2-byte range and a state ID,
        // then one accelerator length byte plus the accelerator bytes.
        let mut len = 2
            + (self.ntrans * 2)
            + (self.ntrans * StateID::SIZE)
            + (1 + self.accel.len());
        if self.is_match {
            // u32 pattern count followed by the pattern IDs themselves.
            len += size_of::<u32>() + self.pattern_ids.len();
        }
        len
    }
}

impl<'a> fmt::Debug for State<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut printed = false;
        // Transitions to the dead state are omitted from the output. The
        // final transition is EOI and is printed separately below.
        for i in 0..(self.ntrans - 1) {
            let next = self.next_at(i);
            if next == DEAD {
                continue;
            }

            if printed {
                write!(f, ", ")?;
            }
            let (start, end) = self.range(i);
            if start == end {
                write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize())?;
            } else {
                write!(
                    f,
                    "{:?}-{:?} => {:?}",
                    DebugByte(start),
                    DebugByte(end),
                    next.as_usize(),
                )?;
            }
            printed = true;
        }
        let eoi = self.next_at(self.ntrans - 1);
        if eoi != DEAD {
            if printed {
                write!(f, ", ")?;
            }
            write!(f, "EOI => {:?}", eoi.as_usize())?;
        }
        Ok(())
    }
}

/// A representation of a mutable sparse DFA state that can be cheaply
/// materialized from a state identifier.
#[cfg(feature = "dfa-build")]
struct StateMut<'a> {
    /// The identifier of this state.
    id: StateID,
    /// Whether this is a match state or not.
    is_match: bool,
    /// The number of transitions in this state.
    ntrans: usize,
    /// Pairs of input ranges, where there is one pair for each transition.
    /// Each pair specifies an inclusive start and end byte range for the
    /// corresponding transition.
    input_ranges: &'a mut [u8],
    /// Transitions to the next state. This slice contains native endian
    /// encoded state identifiers. Thus, there are `ntrans * StateID::SIZE`
    /// bytes in this slice.
    next: &'a mut [u8],
    /// If this is a match state, then this contains the pattern IDs that match
    /// when the DFA is in this state.
    ///
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
    pattern_ids: &'a [u8],
    /// An accelerator for this state, if present. If this state has no
    /// accelerator, then this is an empty slice. When non-empty, this slice
    /// has length at most 3 and corresponds to the exhaustive set of bytes
    /// that must be seen in order to transition out of this state.
    accel: &'a mut [u8],
}

#[cfg(feature = "dfa-build")]
impl<'a> StateMut<'a> {
    /// Sets the ith transition to the given state.
    fn set_next_at(&mut self, i: usize, next: StateID) {
        let start = i * StateID::SIZE;
        let end = start + StateID::SIZE;
        // NOTE(review): native endianness is assumed here because
        // `State::next_at` decodes these bytes with
        // `StateID::from_ne_bytes_unchecked`.
        wire::write_state_id::<wire::NE>(next, &mut self.next[start..end]);
    }
}

#[cfg(feature = "dfa-build")]
impl<'a> fmt::Debug for StateMut<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Reuse the `State` formatter by building an immutable view over
        // the same data.
        let state = State {
            id: self.id,
            is_match: self.is_match,
            ntrans: self.ntrans,
            input_ranges: self.input_ranges,
            next: self.next,
            pattern_ids: self.pattern_ids,
            accel: self.accel,
        };
        fmt::Debug::fmt(&state, f)
    }
}

// In order to validate everything, we not only need to make sure we
// can decode every state, but that every transition in every state
// points to a valid state. There are many duplicative transitions, so
// we record state IDs that we've verified so that we don't redo the
// decoding work.
//
// Except, when in no_std mode, we don't have dynamic memory allocation
// available to us, so we skip this optimization. It's not clear
// whether doing something more clever is worth it just yet. If you're
// profiling this code and need it to run faster, please file an issue.
//
// OK, so we also use this to record the set of valid state IDs, since
// it is possible for a transition to point to an invalid state ID that
// still (somehow) deserializes to a valid state. So we need to make
// sure our transitions are limited to actually correct state IDs.
// The problem is, I'm not sure how to do this verification step in
// no-std no-alloc mode. I think we'd *have* to store the set of valid
// state IDs in the DFA itself. For now, we don't do this verification
// in no-std no-alloc mode. The worst thing that can happen is an
// incorrect result.
But no panics or memory safety problems should // result. Because we still do validate that the state itself is // "valid" in the sense that everything it points to actually exists. // // ---AG #[derive(Debug)] struct Seen { #[cfg(feature = "alloc")] set: alloc::collections::BTreeSet, #[cfg(not(feature = "alloc"))] set: core::marker::PhantomData, } #[cfg(feature = "alloc")] impl Seen { fn new() -> Seen { Seen { set: alloc::collections::BTreeSet::new() } } fn insert(&mut self, id: StateID) { self.set.insert(id); } fn contains(&self, id: &StateID) -> bool { self.set.contains(id) } } #[cfg(not(feature = "alloc"))] impl Seen { fn new() -> Seen { Seen { set: core::marker::PhantomData } } fn insert(&mut self, _id: StateID) {} fn contains(&self, _id: &StateID) -> bool { true } } /* /// A binary search routine specialized specifically to a sparse DFA state's /// transitions. Specifically, the transitions are defined as a set of pairs /// of input bytes that delineate an inclusive range of bytes. If the input /// byte is in the range, then the corresponding transition is a match. /// /// This binary search accepts a slice of these pairs and returns the position /// of the matching pair (the ith transition), or None if no matching pair /// could be found. /// /// Note that this routine is not currently used since it was observed to /// either decrease performance when searching ASCII, or did not provide enough /// of a boost on non-ASCII haystacks to be worth it. However, we leave it here /// for posterity in case we can find a way to use it. /// /// In theory, we could use the standard library's search routine if we could /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently /// guaranteed to be safe and is thus UB (since I don't think the in-memory /// representation of `(u8, u8)` has been nailed down). One could define a /// repr(C) type, but the casting doesn't seem justified. 
#[cfg_attr(feature = "perf-inline", inline(always))]
fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
    debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
    debug_assert!(ranges.len() <= 512, "ranges should be short");

    let (mut left, mut right) = (0, ranges.len() / 2);
    while left < right {
        let mid = (left + right) / 2;
        let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
        if needle < b1 {
            right = mid;
        } else if needle > b2 {
            left = mid + 1;
        } else {
            return Some(mid);
        }
    }
    None
}
*/

#[cfg(all(test, feature = "syntax", feature = "dfa-build"))]
mod tests {
    use crate::{
        dfa::{dense::DFA, Automaton},
        nfa::thompson,
        Input, MatchError,
    };

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    //
    // NOTE(review): this forward-search test compiles its NFA with
    // `reverse(true)`, exactly like the reverse test below. The pattern reads
    // the same in both directions, so the expectations plausibly hold either
    // way, but confirm whether a forward NFA was intended here.
    #[test]
    fn heuristic_unicode_forward() {
        let dfa = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        // The search range starts in the middle of 'β'; 0xB2 at offset 1 is
        // the byte named in the expected quit error.
        let input = Input::new("β123").range(2..);
        let expected = MatchError::quit(0xB2, 1);
        let got = dfa.try_search_fwd(&input);
        assert_eq!(Err(expected), got);

        // The search range ends right before 'β'; 0xCE at offset 3 is the
        // byte named in the expected quit error.
        let input = Input::new("123β").range(..3);
        let expected = MatchError::quit(0xCE, 3);
        let got = dfa.try_search_fwd(&input);
        assert_eq!(Err(expected), got);
    }

    // See the analogous test in src/hybrid/dfa.rs and src/dfa/dense.rs.
    #[test]
    fn heuristic_unicode_reverse() {
        let dfa = DFA::builder()
            .configure(DFA::config().unicode_word_boundary(true))
            .thompson(thompson::Config::new().reverse(true))
            .build(r"\b[0-9]+\b")
            .unwrap()
            .to_sparse()
            .unwrap();

        let input = Input::new("β123").range(2..);
        let expected = MatchError::quit(0xB2, 1);
        let got = dfa.try_search_rev(&input);
        assert_eq!(Err(expected), got);

        let input = Input::new("123β").range(..3);
        let expected = MatchError::quit(0xCE, 3);
        let got = dfa.try_search_rev(&input);
        assert_eq!(Err(expected), got);
    }
}
regex-automata-0.4.9/src/dfa/special.rs000064400000000000000000000505621046102023000161000ustar 00000000000000use crate::{
    dfa::DEAD,
    util::{
        primitives::StateID,
        wire::{self, DeserializeError, Endian, SerializeError},
    },
};

// Returns early from the enclosing function with a generic
// `DeserializeError` carrying the given message.
macro_rules! err {
    ($msg:expr) => {
        return Err(DeserializeError::generic($msg));
    };
}

// Special represents the identifiers in a DFA that correspond to "special"
// states. If a state is one or more of the following, then it is considered
// special:
//
// * dead - A non-matching state where all outgoing transitions lead back to
//   itself. There is only one of these, regardless of whether minimization
//   has run. The dead state always has an ID of 0. i.e., It is always the
//   first state in a DFA.
// * quit - A state that is entered whenever a byte is seen that should cause
//   a DFA to give up and stop searching. This results in a MatchError::quit
//   error being returned at search time. The default configuration for a DFA
//   has no quit bytes, which means this state is unreachable by default,
//   although it is always present for reasons of implementation simplicity.
//   This state is only reachable when the caller configures the DFA to quit
//   on certain bytes. There is always exactly one of these states and it
//   is always the second state.
//   (Its actual ID depends on the size of the
//   alphabet in dense DFAs, since state IDs are premultiplied in order to
//   allow them to be used directly as indices into the transition table.)
// * match - An accepting state, i.e., indicative of a match. There may be
//   zero or more of these states.
// * accelerated - A state where all of its outgoing transitions, except a
//   few, loop back to itself. These states are candidates for acceleration
//   via memchr during search. There may be zero or more of these states.
// * start - A non-matching state that indicates where the automaton should
//   start during a search. There is always at least one starting state and
//   all are guaranteed to be non-match states. (A start state cannot be a
//   match state because the DFAs in this crate delay all matches by one byte.
//   So every search that finds a match must move through one transition to
//   some other match state, even when searching an empty string.)
//
// These are not mutually exclusive categories. Namely, the following
// overlappings can occur:
//
// * {dead, start} - If a DFA can never lead to a match and it is minimized,
//   then it will typically compile to something where all starting IDs point
//   to the DFA's dead state.
// * {match, accelerated} - It is possible for a match state to have the
//   majority of its transitions loop back to itself, which means it's
//   possible for a match state to be accelerated.
// * {start, accelerated} - Similarly, it is possible for a start state to be
//   accelerated. Note that it is possible for an accelerated state to be
//   neither a match nor a start state. Also note that just because both match
//   and start states overlap with accelerated states does not mean that
//   match and start states overlap with each other. In fact, they are
//   guaranteed not to overlap.
//
// As a special mention, every DFA always has a dead and a quit state, even
// though from the perspective of the DFA, they are equivalent.
// (Indeed,
// minimization special cases them to ensure they don't get merged.) The
// purpose of keeping them distinct is to use the quit state as a sentinel to
// distinguish between whether a search finished successfully without finding
// anything or whether it gave up before finishing.
//
// So the main problem we want to solve here is the *fast* detection of whether
// a state is special or not. And we also want to do this while storing as
// little extra data as possible. AND we want to be able to quickly determine
// which categories a state falls into above if it is special.
//
// We achieve this by essentially shuffling all special states to the beginning
// of a DFA. That is, all special states appear before every other non-special
// state. By representing special states this way, we can determine whether a
// state is special or not by a single comparison, where special.max is the
// identifier of the last special state in the DFA:
//
//     if current_state <= special.max:
//         ... do something with special state
//
// The only thing left to do is to determine what kind of special state
// it is. Because what we do next depends on that. Since special states
// are typically rare, we can afford to do a bit more extra work, but we'd
// still like this to be as fast as possible. The trick we employ here is to
// continue shuffling states even within the special state range. Such that
// one contiguous region corresponds to match states, another for start states
// and then an overlapping range for accelerated states. At a high level, our
// special state detection might look like this (for leftmost searching, where
// we continue searching even after seeing a match):
//
//     byte = input[offset]
//     current_state = next_state(current_state, byte)
//     offset += 1
//     if current_state <= special.max:
//         if current_state == 0:
//             # We can never leave a dead state, so this always marks the
//             # end of our search.
//             return last_match
//         if current_state == special.quit_id:
//             # A quit state means we give up. If the DFA has no quit state,
//             # then special.quit_id == 0 == dead, which is handled by the
//             # conditional above.
//             return Err(MatchError::quit { byte, offset: offset - 1 })
//         if special.min_match <= current_state <= special.max_match:
//             last_match = Some(offset)
//             if special.min_accel <= current_state <= special.max_accel:
//                 offset = accelerate(input, offset)
//                 last_match = Some(offset)
//         elif special.min_start <= current_state <= special.max_start:
//             offset = prefilter.find(input, offset)
//             if special.min_accel <= current_state <= special.max_accel:
//                 offset = accelerate(input, offset)
//         elif special.min_accel <= current_state <= special.max_accel:
//             offset = accelerate(input, offset)
//
// There are some small details left out of the logic above. For example,
// in order to accelerate a state, we need to know which bytes to search for.
// This in turn implies some extra data we need to store in the DFA. To keep
// things compact, we would ideally only store
//
//     N = special.max_accel - special.min_accel + 1
//
// items. But state IDs are premultiplied, which means they are not contiguous.
// So in order to take a state ID and index an array of accelerated structures,
// we need to do:
//
//     i = (state_id - special.min_accel) / stride
//
// (N.B. 'stride' is always a power of 2, so the above can be implemented via
// '(state_id - special.min_accel) >> stride2', where 'stride2' is x in
// 2^x=stride.)
//
// Moreover, some of these specialty categories may be empty. For example,
// DFAs are not required to have any match states or any accelerated states.
// In that case, the lower and upper bounds are both set to 0 (the dead state
// ID) and the first `current_state == 0` check subsumes cases where the
// ranges are empty.
//
// Loop unrolling, if applicable, has also been left out of the logic above.
// // Graphically, the ranges look like this, where asterisks indicate ranges // that can be empty. Each 'x' is a state. // // quit // dead| // || // xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx // | | | | start | | // | |-------------| |-------| | // | match* | | | | // | | | | | // | |----------| | | // | accel* | | // | | | // | | | // |----------------------------|------------------------ // special non-special* #[derive(Clone, Copy, Debug)] pub(crate) struct Special { /// The identifier of the last special state in a DFA. A state is special /// if and only if its identifier is less than or equal to `max`. pub(crate) max: StateID, /// The identifier of the quit state in a DFA. (There is no analogous field /// for the dead state since the dead state's ID is always zero, regardless /// of state ID size.) pub(crate) quit_id: StateID, /// The identifier of the first match state. pub(crate) min_match: StateID, /// The identifier of the last match state. pub(crate) max_match: StateID, /// The identifier of the first accelerated state. pub(crate) min_accel: StateID, /// The identifier of the last accelerated state. pub(crate) max_accel: StateID, /// The identifier of the first start state. pub(crate) min_start: StateID, /// The identifier of the last start state. pub(crate) max_start: StateID, } impl Special { /// Creates a new set of special ranges for a DFA. All ranges are initially /// set to only contain the dead state. This is interpreted as an empty /// range. #[cfg(feature = "dfa-build")] pub(crate) fn new() -> Special { Special { max: DEAD, quit_id: DEAD, min_match: DEAD, max_match: DEAD, min_accel: DEAD, max_accel: DEAD, min_start: DEAD, max_start: DEAD, } } /// Remaps all of the special state identifiers using the function given. 
#[cfg(feature = "dfa-build")] pub(crate) fn remap(&self, map: impl Fn(StateID) -> StateID) -> Special { Special { max: map(self.max), quit_id: map(self.quit_id), min_match: map(self.min_match), max_match: map(self.max_match), min_accel: map(self.min_accel), max_accel: map(self.max_accel), min_start: map(self.min_start), max_start: map(self.max_start), } } /// Deserialize the given bytes into special state ranges. If the slice /// given is not big enough, then this returns an error. Similarly, if /// any of the expected invariants around special state ranges aren't /// upheld, an error is returned. Note that this does not guarantee that /// the information returned is correct. /// /// Upon success, this returns the number of bytes read in addition to the /// special state IDs themselves. pub(crate) fn from_bytes( mut slice: &[u8], ) -> Result<(Special, usize), DeserializeError> { wire::check_slice_len(slice, 8 * StateID::SIZE, "special states")?; let mut nread = 0; let mut read_id = |what| -> Result { let (id, nr) = wire::try_read_state_id(slice, what)?; nread += nr; slice = &slice[StateID::SIZE..]; Ok(id) }; let max = read_id("special max id")?; let quit_id = read_id("special quit id")?; let min_match = read_id("special min match id")?; let max_match = read_id("special max match id")?; let min_accel = read_id("special min accel id")?; let max_accel = read_id("special max accel id")?; let min_start = read_id("special min start id")?; let max_start = read_id("special max start id")?; let special = Special { max, quit_id, min_match, max_match, min_accel, max_accel, min_start, max_start, }; special.validate()?; assert_eq!(nread, special.write_to_len()); Ok((special, nread)) } /// Validate that the information describing special states satisfies /// all known invariants. pub(crate) fn validate(&self) -> Result<(), DeserializeError> { // Check that both ends of the range are DEAD or neither are. 
if self.min_match == DEAD && self.max_match != DEAD { err!("min_match is DEAD, but max_match is not"); } if self.min_match != DEAD && self.max_match == DEAD { err!("max_match is DEAD, but min_match is not"); } if self.min_accel == DEAD && self.max_accel != DEAD { err!("min_accel is DEAD, but max_accel is not"); } if self.min_accel != DEAD && self.max_accel == DEAD { err!("max_accel is DEAD, but min_accel is not"); } if self.min_start == DEAD && self.max_start != DEAD { err!("min_start is DEAD, but max_start is not"); } if self.min_start != DEAD && self.max_start == DEAD { err!("max_start is DEAD, but min_start is not"); } // Check that ranges are well formed. if self.min_match > self.max_match { err!("min_match should not be greater than max_match"); } if self.min_accel > self.max_accel { err!("min_accel should not be greater than max_accel"); } if self.min_start > self.max_start { err!("min_start should not be greater than max_start"); } // Check that ranges are ordered with respect to one another. if self.matches() && self.quit_id >= self.min_match { err!("quit_id should not be greater than min_match"); } if self.accels() && self.quit_id >= self.min_accel { err!("quit_id should not be greater than min_accel"); } if self.starts() && self.quit_id >= self.min_start { err!("quit_id should not be greater than min_start"); } if self.matches() && self.accels() && self.min_accel < self.min_match { err!("min_match should not be greater than min_accel"); } if self.matches() && self.starts() && self.min_start < self.min_match { err!("min_match should not be greater than min_start"); } if self.accels() && self.starts() && self.min_start < self.min_accel { err!("min_accel should not be greater than min_start"); } // Check that max is at least as big as everything else. 
if self.max < self.quit_id { err!("quit_id should not be greater than max"); } if self.max < self.max_match { err!("max_match should not be greater than max"); } if self.max < self.max_accel { err!("max_accel should not be greater than max"); } if self.max < self.max_start { err!("max_start should not be greater than max"); } Ok(()) } /// Validate that the special state information is compatible with the /// given state len. pub(crate) fn validate_state_len( &self, len: usize, stride2: usize, ) -> Result<(), DeserializeError> { // We assume that 'validate' has already passed, so we know that 'max' // is truly the max. So all we need to check is that the max state ID // is less than the state ID len. The max legal value here is len-1, // which occurs when there are no non-special states. if (self.max.as_usize() >> stride2) >= len { err!("max should not be greater than or equal to state length"); } Ok(()) } /// Write the IDs and ranges for special states to the given byte buffer. /// The buffer given must have enough room to store all data, otherwise /// this will return an error. The number of bytes written is returned /// on success. The number of bytes written is guaranteed to be a multiple /// of 8. 
pub(crate) fn write_to( &self, dst: &mut [u8], ) -> Result { use crate::util::wire::write_state_id as write; if dst.len() < self.write_to_len() { return Err(SerializeError::buffer_too_small("special state ids")); } let mut nwrite = 0; nwrite += write::(self.max, &mut dst[nwrite..]); nwrite += write::(self.quit_id, &mut dst[nwrite..]); nwrite += write::(self.min_match, &mut dst[nwrite..]); nwrite += write::(self.max_match, &mut dst[nwrite..]); nwrite += write::(self.min_accel, &mut dst[nwrite..]); nwrite += write::(self.max_accel, &mut dst[nwrite..]); nwrite += write::(self.min_start, &mut dst[nwrite..]); nwrite += write::(self.max_start, &mut dst[nwrite..]); assert_eq!( self.write_to_len(), nwrite, "expected to write certain number of bytes", ); assert_eq!( nwrite % 8, 0, "expected to write multiple of 8 bytes for special states", ); Ok(nwrite) } /// Returns the total number of bytes written by `write_to`. pub(crate) fn write_to_len(&self) -> usize { 8 * StateID::SIZE } /// Sets the maximum special state ID based on the current values. This /// should be used once all possible state IDs are set. #[cfg(feature = "dfa-build")] pub(crate) fn set_max(&mut self) { use core::cmp::max; self.max = max( self.quit_id, max(self.max_match, max(self.max_accel, self.max_start)), ); } /// Sets the maximum special state ID such that starting states are not /// considered "special." This also marks the min/max starting states as /// DEAD such that 'is_start_state' always returns false, even if the state /// is actually a starting state. /// /// This is useful when there is no prefilter set. It will avoid /// ping-ponging between the hot path in the DFA search code and the start /// state handling code, which is typically only useful for executing a /// prefilter. 
#[cfg(feature = "dfa-build")]
    pub(crate) fn set_no_special_start_states(&mut self) {
        // Recompute `max` from everything except the start range, then mark
        // the start range as empty.
        let upper = core::cmp::max(self.max_match, self.max_accel);
        self.max = core::cmp::max(self.quit_id, upper);
        self.min_start = DEAD;
        self.max_start = DEAD;
    }

    /// Reports whether `id` lies in the special range, i.e., whether it is
    /// no greater than `max`.
    #[inline]
    pub(crate) fn is_special_state(&self, id: StateID) -> bool {
        self.max >= id
    }

    /// Reports whether `id` is the dead state.
    #[inline]
    pub(crate) fn is_dead_state(&self, id: StateID) -> bool {
        DEAD == id
    }

    /// Reports whether `id` is the quit state. The dead state is never
    /// reported as a quit state.
    #[inline]
    pub(crate) fn is_quit_state(&self, id: StateID) -> bool {
        id != DEAD && id == self.quit_id
    }

    /// Reports whether `id` falls within the match state range. The dead
    /// state is never reported as a match state.
    #[inline]
    pub(crate) fn is_match_state(&self, id: StateID) -> bool {
        id != DEAD && (self.min_match..=self.max_match).contains(&id)
    }

    /// Reports whether `id` falls within the accel state range. The dead
    /// state is never reported as an accel state.
    #[inline]
    pub(crate) fn is_accel_state(&self, id: StateID) -> bool {
        id != DEAD && (self.min_accel..=self.max_accel).contains(&id)
    }

    /// Reports whether `id` falls within the start state range. The dead
    /// state is never reported as a start state.
    #[inline]
    pub(crate) fn is_start_state(&self, id: StateID) -> bool {
        id != DEAD && (self.min_start..=self.max_start).contains(&id)
    }

    /// Computes how many match states a dense table based DFA has, given the
    /// stride separating consecutive state IDs.
    #[inline]
    pub(crate) fn match_len(&self, stride: usize) -> usize {
        if !self.matches() {
            return 0;
        }
        let first = self.min_match.as_usize();
        let last = self.max_match.as_usize();
        (last - first + stride) / stride
    }

    /// Reports whether the match range is non-empty.
    #[inline]
    pub(crate) fn matches(&self) -> bool {
        !(self.min_match == DEAD)
    }

    /// Returns the total number of accel states.
#[cfg(feature = "dfa-build")]
    pub(crate) fn accel_len(&self, stride: usize) -> usize {
        if self.accels() {
            (self.max_accel.as_usize() - self.min_accel.as_usize() + stride)
                / stride
        } else {
            0
        }
    }

    /// Returns true if and only if there is at least one accel state.
    #[inline]
    pub(crate) fn accels(&self) -> bool {
        self.min_accel != DEAD
    }

    /// Returns true if and only if there is at least one start state.
    #[inline]
    pub(crate) fn starts(&self) -> bool {
        self.min_start != DEAD
    }
}
regex-automata-0.4.9/src/dfa/start.rs000064400000000000000000000052101046102023000156030ustar 00000000000000use core::mem::size_of;

use crate::util::wire::{self, DeserializeError, Endian, SerializeError};

/// The kind of anchored starting configurations to support in a DFA.
///
/// Fully compiled DFAs need to be explicitly configured as to which anchored
/// starting configurations to support. The reason for not just supporting
/// everything unconditionally is that it can use more resources (such as
/// memory and build time). The downside of this is that if you try to execute
/// a search using an [`Anchored`](crate::Anchored) mode that is not supported
/// by the DFA, then the search will return an error.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum StartKind {
    /// Support both anchored and unanchored searches.
    Both,
    // NOTE(review): the variant docs below say an unsupported search "will
    // panic", while the type-level docs say it "will return an error".
    // Confirm which applies to which search API and reconcile the wording.
    /// Support only unanchored searches. Requesting an anchored search will
    /// panic.
    ///
    /// Note that even if an unanchored search is requested, the pattern itself
    /// may still be anchored. For example, `^abc` will only match `abc` at the
    /// start of a haystack. This will remain true, even if the regex engine
    /// only supported unanchored searches.
    Unanchored,
    /// Support only anchored searches. Requesting an unanchored search will
    /// panic.
    Anchored,
}

impl StartKind {
    /// Deserializes a start kind from the front of `slice`.
    ///
    /// The kind is encoded as a `u32`: 0 is `Both`, 1 is `Unanchored` and 2 is
    /// `Anchored`. Any other value, or a slice that is too short, results in
    /// an error. On success, the number of bytes read is returned alongside
    /// the kind.
    pub(crate) fn from_bytes(
        slice: &[u8],
    ) -> Result<(StartKind, usize), DeserializeError> {
        wire::check_slice_len(slice, size_of::<u32>(), "start kind bytes")?;
        let (n, nr) = wire::try_read_u32(slice, "start kind integer")?;
        match n {
            0 => Ok((StartKind::Both, nr)),
            1 => Ok((StartKind::Unanchored, nr)),
            2 => Ok((StartKind::Anchored, nr)),
            _ => Err(DeserializeError::generic("unrecognized start kind")),
        }
    }

    /// Serializes this start kind as a `u32` into `dst` using the byte order
    /// `E`, with the same numbering that `from_bytes` decodes.
    ///
    /// Returns the number of bytes written, or an error if `dst` is too small.
    pub(crate) fn write_to<E: Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small("start kind"));
        }
        let n = match *self {
            StartKind::Both => 0,
            StartKind::Unanchored => 1,
            StartKind::Anchored => 2,
        };
        E::write_u32(n, dst);
        Ok(nwrite)
    }

    /// Returns the number of bytes written by `write_to`.
    pub(crate) fn write_to_len(&self) -> usize {
        size_of::<u32>()
    }

    /// Returns true if this kind includes unanchored searches.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn has_unanchored(&self) -> bool {
        matches!(*self, StartKind::Both | StartKind::Unanchored)
    }

    /// Returns true if this kind includes anchored searches.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn has_anchored(&self) -> bool {
        matches!(*self, StartKind::Both | StartKind::Anchored)
    }
}
regex-automata-0.4.9/src/hybrid/dfa.rs000064400000000000000000005621531046102023000157450ustar 00000000000000/*!
Types and routines specific to lazy DFAs.

This module is the home of [`hybrid::dfa::DFA`](DFA).

This module also contains a [`hybrid::dfa::Builder`](Builder) and a
[`hybrid::dfa::Config`](Config) for configuring and building a lazy DFA.
*/

use core::{iter, mem::size_of};

use alloc::vec::Vec;

use crate::{
    hybrid::{
        error::{BuildError, CacheError, StartError},
        id::{LazyStateID, LazyStateIDError},
        search,
    },
    nfa::thompson,
    util::{
        alphabet::{self, ByteClasses, ByteSet},
        determinize::{self, State, StateBuilderEmpty, StateBuilderNFA},
        empty,
        prefilter::Prefilter,
        primitives::{PatternID, StateID as NFAStateID},
        search::{
            Anchored, HalfMatch, Input, MatchError, MatchKind, PatternSet,
        },
        sparse_set::SparseSets,
        start::{self, Start, StartByteMap},
    },
};

/// The minimum number of states that a lazy DFA's cache size must support.
///
/// This is checked at time of construction to ensure that at least some small
/// number of states can fit in the given capacity allotment. If we can't fit
/// at least this number of states, then the thinking is that it's pretty
/// senseless to use the lazy DFA. More to the point, parts of the code do
/// assume that the cache can fit at least some small number of states.
///
/// The value is the sentinel states plus two more.
const MIN_STATES: usize = SENTINEL_STATES + 2;

/// The number of "sentinel" states that get added to every lazy DFA.
///
/// These are special states indicating status conditions of a search: unknown,
/// dead and quit. These states in particular also use zero NFA states, so
/// their memory usage is quite small. This is relevant for computing the
/// minimum memory needed for a lazy DFA cache.
const SENTINEL_STATES: usize = 3;

/// A hybrid NFA/DFA (also called a "lazy DFA") for regex searching.
///
/// A lazy DFA is a DFA that builds itself at search time. It otherwise has
/// very similar characteristics as a [`dense::DFA`](crate::dfa::dense::DFA).
/// Indeed, both support precisely the same regex features with precisely the
/// same semantics.
///
/// Whereas a `dense::DFA` must be completely built to handle any input before
/// it may be used for search, a lazy DFA starts off effectively empty.
/// During
/// a search, a lazy DFA will build itself depending on whether it has already
/// computed the next transition or not. If it has, then it looks a lot like
/// a `dense::DFA` internally: it does a very fast table based access to find
/// the next transition. Otherwise, if the state hasn't been computed, then it
/// does determinization _for that specific transition_ to compute the next DFA
/// state.
///
/// The main selling point of a lazy DFA is that, in practice, it has
/// the performance profile of a `dense::DFA` without the weakness of it
/// taking worst case exponential time to build. Indeed, for each byte of
/// input, the lazy DFA will construct at most one new DFA state. Thus, a
/// lazy DFA achieves worst case `O(mn)` time for regex search (where `m ~
/// pattern.len()` and `n ~ haystack.len()`).
///
/// The main downsides of a lazy DFA are:
///
/// 1. It requires mutable "cache" space during search. This is where the
/// transition table, among other things, is stored.
/// 2. In pathological cases (e.g., if the cache is too small), it will run
/// out of room and either require a bigger cache capacity or will repeatedly
/// clear the cache and thus repeatedly regenerate DFA states. Overall, this
/// will tend to be slower than a typical NFA simulation.
///
/// # Capabilities
///
/// Like a `dense::DFA`, a single lazy DFA fundamentally supports the following
/// operations:
///
/// 1. Detection of a match.
/// 2. Location of the end of a match.
/// 3. In the case of a lazy DFA with multiple patterns, which pattern matched
/// is reported as well.
///
/// A notable absence from the above list of capabilities is the location of
/// the *start* of a match. In order to provide both the start and end of
/// a match, *two* lazy DFAs are required. This functionality is provided by a
/// [`Regex`](crate::hybrid::regex::Regex).
///
/// # Example
///
/// This shows how to build a lazy DFA with the default configuration and
/// execute a search.
Notice how, in contrast to a `dense::DFA`, we must create /// a cache and pass it to our search routine. /// /// ``` /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; /// let mut cache = dfa.create_cache(); /// /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd( /// &mut cache, &Input::new("foo12345"))?, /// ); /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct DFA { config: Config, nfa: thompson::NFA, stride2: usize, start_map: StartByteMap, classes: ByteClasses, quitset: ByteSet, cache_capacity: usize, } impl DFA { /// Parse the given regular expression using a default configuration and /// return the corresponding lazy DFA. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+bar")?; /// let mut cache = dfa.create_cache(); /// /// let expected = HalfMatch::must(0, 11); /// assert_eq!( /// Some(expected), /// dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?, /// ); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result { DFA::builder().build(pattern) } /// Parse the given regular expressions using a default configuration and /// return the corresponding lazy multi-DFA. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. 
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+"])?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = HalfMatch::must(1, 3);
    /// assert_eq!(
    ///     Some(expected),
    ///     dfa.try_search_fwd(&mut cache, &Input::new("foo12345bar"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(patterns: &[P]) -> Result<DFA, BuildError> {
        DFA::builder().build_many(patterns)
    }

    /// Create a new lazy DFA that matches every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::always_match()?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let expected = HalfMatch::must(0, 0);
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
    ///     &mut cache, &Input::new(""))?,
    /// );
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
    ///     &mut cache, &Input::new("foo"))?,
    /// );
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<DFA, BuildError> {
        let nfa = thompson::NFA::always_match();
        Builder::new().build_from_nfa(nfa)
    }

    /// Create a new lazy DFA that never matches any input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, Input};
    ///
    /// let dfa = DFA::never_match()?;
    /// let mut cache = dfa.create_cache();
    ///
    /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new(""))?);
    /// assert_eq!(None, dfa.try_search_fwd(&mut cache, &Input::new("foo"))?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<DFA, BuildError> {
        let nfa = thompson::NFA::never_match();
        Builder::new().build_from_nfa(nfa)
    }

    /// Return a default configuration for a `DFA`.
    ///
    /// This is a convenience routine to avoid needing to import the [`Config`]
    /// type when customizing the construction of a lazy DFA.
    ///
    /// # Example
    ///
    /// This example shows how to build a lazy DFA that heuristically supports
    /// Unicode word boundaries.
///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, MatchError, Input};
    ///
    /// let re = DFA::builder()
    ///     .configure(DFA::config().unicode_word_boundary(true))
    ///     .build(r"\b\w+\b")?;
    /// let mut cache = re.create_cache();
    ///
    /// // Since our haystack is all ASCII, the DFA search sees that and knows
    /// // it is legal to interpret Unicode word boundaries as ASCII word
    /// // boundaries.
    /// let input = Input::new("!!foo!!");
    /// let expected = HalfMatch::must(0, 5);
    /// assert_eq!(Some(expected), re.try_search_fwd(&mut cache, &input)?);
    ///
    /// // But if our haystack contains non-ASCII, then the search will fail
    /// // with an error.
    /// let input = Input::new("!!βββ!!");
    /// let expected = MatchError::quit(b'\xCE', 2);
    /// assert_eq!(Err(expected), re.try_search_fwd(&mut cache, &input));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn config() -> Config {
        Config::new()
    }

    /// Return a builder for configuring the construction of a `DFA`.
    ///
    /// This is a convenience routine to avoid needing to import the
    /// [`Builder`] type in common cases.
    ///
    /// # Example
    ///
    /// This example shows how to use the builder to disable UTF-8 mode
    /// everywhere for lazy DFAs.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::dfa::DFA, util::syntax, HalfMatch, Input};
    ///
    /// let re = DFA::builder()
    ///     .syntax(syntax::Config::new().utf8(false))
    ///     .build(r"foo(?-u:[^b])ar.*")?;
    /// let mut cache = re.create_cache();
    ///
    /// let input = Input::new(b"\xFEfoo\xFFarzz\xE2\x98\xFF\n");
    /// let expected = Some(HalfMatch::must(0, 9));
    /// let got = re.try_search_fwd(&mut cache, &input)?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn builder() -> Builder {
        Builder::new()
    }

    /// Create a new cache for this lazy DFA.
    ///
    /// The cache returned should only be used for searches for this
    /// lazy DFA.
/// If you want to reuse the cache for another DFA, then
    /// you must call [`Cache::reset`] with that DFA (or, equivalently,
    /// [`DFA::reset_cache`]).
    pub fn create_cache(&self) -> Cache {
        Cache::new(self)
    }

    /// Reset the given cache such that it can be used for searching with
    /// this lazy DFA (and only this DFA).
    ///
    /// A cache reset permits reusing memory already allocated in this cache
    /// with a different lazy DFA.
    ///
    /// Resetting a cache sets its "clear count" to 0. This is relevant if the
    /// lazy DFA has been configured to "give up" after it has cleared the
    /// cache a certain number of times.
    ///
    /// Any lazy state ID generated by the cache prior to resetting it is
    /// invalid after the reset.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different DFA.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa1 = DFA::new(r"\w")?;
    /// let dfa2 = DFA::new(r"\W")?;
    ///
    /// let mut cache = dfa1.create_cache();
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 2)),
    ///     dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
    /// );
    ///
    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the DFA we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
    /// // allowed.
    /// dfa2.reset_cache(&mut cache);
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 3)),
    ///     dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset_cache(&self, cache: &mut Cache) {
        Lazy::new(self, cache).reset_cache()
    }

    /// Returns the total number of patterns compiled into this lazy DFA.
    ///
    /// In the case of a DFA that contains no patterns, this returns `0`.
///
    /// # Example
    ///
    /// This example shows the pattern length for a DFA that never matches:
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::never_match()?;
    /// assert_eq!(dfa.pattern_len(), 0);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And another example for a DFA that matches at every position:
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::always_match()?;
    /// assert_eq!(dfa.pattern_len(), 1);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And finally, a DFA that was constructed from multiple patterns:
    ///
    /// ```
    /// use regex_automata::hybrid::dfa::DFA;
    ///
    /// let dfa = DFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
    /// assert_eq!(dfa.pattern_len(), 3);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn pattern_len(&self) -> usize {
        self.nfa.pattern_len()
    }

    /// Returns the equivalence classes that make up the alphabet for this DFA.
    ///
    /// Unless [`Config::byte_classes`] was disabled, it is possible that
    /// multiple distinct bytes are grouped into the same equivalence class
    /// if it is impossible for them to discriminate between a match and a
    /// non-match. This has the effect of reducing the overall alphabet size
    /// and in turn potentially substantially reducing the size of the DFA's
    /// transition table.
    ///
    /// The downside of using equivalence classes like this is that every state
    /// transition will automatically use this map to convert an arbitrary
    /// byte to its corresponding equivalence class. In practice this has a
    /// negligible impact on performance.
    pub fn byte_classes(&self) -> &ByteClasses {
        &self.classes
    }

    /// Returns this lazy DFA's configuration.
    pub fn get_config(&self) -> &Config {
        &self.config
    }

    /// Returns a reference to the underlying NFA.
    pub fn get_nfa(&self) -> &thompson::NFA {
        &self.nfa
    }

    /// Returns the stride, as a base-2 exponent, required for these
    /// equivalence classes.
    ///
    /// The stride is always the smallest power of 2 that is greater than or
    /// equal to the alphabet length.
/// This is done so that converting between
    /// state IDs and indices can be done with shifts alone, which is much
    /// faster than integer division.
    fn stride2(&self) -> usize {
        self.stride2
    }

    /// Returns the total stride for every state in this lazy DFA. This
    /// corresponds to the total number of transitions used by each state in
    /// this DFA's transition table.
    fn stride(&self) -> usize {
        1 << self.stride2()
    }

    /// Returns the memory usage, in bytes, of this lazy DFA.
    ///
    /// This does **not** include the stack size used up by this lazy DFA. To
    /// compute that, use `std::mem::size_of::<DFA>()`. This also does not
    /// include the size of the `Cache` used.
    ///
    /// This also does not include any heap memory used by the NFA inside of
    /// this hybrid NFA/DFA. This is because the NFA's ownership is shared, and
    /// thus not owned by this hybrid NFA/DFA. More practically, several regex
    /// engines in this crate embed an NFA, and reporting the NFA's memory
    /// usage in all of them would likely result in reporting higher heap
    /// memory than is actually used.
    pub fn memory_usage(&self) -> usize {
        // The only thing that uses heap memory in a DFA is the NFA. But the
        // NFA has shared ownership, so reporting its memory as part of the
        // hybrid DFA is likely to lead to double-counting the NFA memory
        // somehow. In particular, this DFA does not really own an NFA, so
        // including it in the DFA's memory usage doesn't seem semantically
        // correct.
        0
    }
}

impl DFA {
    /// Executes a forward search and returns the end position of the leftmost
    /// match that is found. If no match exists, then `None` is returned.
    ///
    /// In particular, this method continues searching even after it enters
    /// a match state. The search only terminates once it has reached the
    /// end of the input or when it has entered a dead or quit state. Upon
    /// termination, the position of the last byte seen while still in a match
    /// state is returned.
///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in a number of circumstances:
    ///
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
    ///   For example, setting quit bytes or enabling heuristic support for
    ///   Unicode word boundaries. The default configuration does not enable any
    ///   option that could result in the lazy DFA quitting.
    /// * The configuration of the lazy DFA may also permit it to "give up"
    ///   on a search if it makes ineffective use of its transition table
    ///   cache. The default configuration does not enable this, although it
    ///   is typically a good idea to.
    /// * When the provided `Input` configuration is not supported. For
    ///   example, by providing an unsupported anchor mode.
    ///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    ///
    /// # Example
    ///
    /// This example shows how to run a basic search.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa = DFA::new("foo[0-9]+")?;
    /// let mut cache = dfa.create_cache();
    /// let expected = HalfMatch::must(0, 8);
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
    ///     &mut cache, &Input::new("foo12345"))?,
    /// );
    ///
    /// // Even though a match is found after reading the first byte (`a`),
    /// // the leftmost first match semantics demand that we find the earliest
    /// // match that prefers earlier parts of the pattern over later parts.
    /// let dfa = DFA::new("abc|a")?;
    /// let mut cache = dfa.create_cache();
    /// let expected = HalfMatch::must(0, 3);
    /// assert_eq!(Some(expected), dfa.try_search_fwd(
    ///     &mut cache, &Input::new("abc"))?,
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: specific pattern search
    ///
    /// This example shows how to build a lazy multi-DFA that permits searching
    /// for specific patterns.
///
    /// ```
    /// use regex_automata::{
    ///     hybrid::dfa::DFA,
    ///     Anchored, HalfMatch, PatternID, Input,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().starts_for_each_pattern(true))
    ///     .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = "foo123";
    ///
    /// // Since we are using the default leftmost-first match and both
    /// // patterns match at the same starting position, only the first pattern
    /// // will be returned in this case when doing a search for any of the
    /// // patterns.
    /// let expected = Some(HalfMatch::must(0, 6));
    /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?;
    /// assert_eq!(expected, got);
    ///
    /// // But if we want to check whether some other pattern matches, then we
    /// // can provide its pattern ID.
    /// let expected = Some(HalfMatch::must(1, 6));
    /// let input = Input::new(haystack)
    ///     .anchored(Anchored::Pattern(PatternID::must(1)));
    /// let got = dfa.try_search_fwd(&mut cache, &input)?;
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: specifying the bounds of a search
    ///
    /// This example shows how providing the bounds of a search can produce
    /// different results than simply sub-slicing the haystack.
    ///
    /// ```
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// // N.B. We disable Unicode here so that we use a simple ASCII word
    /// // boundary. Alternatively, we could enable heuristic support for
    /// // Unicode word boundaries since our haystack is pure ASCII.
    /// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = "foo123bar";
    ///
    /// // Since we sub-slice the haystack, the search doesn't know about the
    /// // larger context and assumes that `123` is surrounded by word
    /// // boundaries. And of course, the match position is reported relative
    /// // to the sub-slice as well, which means we get `3` instead of `6`.
/// let expected = Some(HalfMatch::must(0, 3)); /// let got = dfa.try_search_fwd( /// &mut cache, /// &Input::new(&haystack[3..6]), /// )?; /// assert_eq!(expected, got); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. (And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let expected = None; /// let got = dfa.try_search_fwd( /// &mut cache, /// &Input::new(haystack).range(3..6), /// )?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_search_fwd( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); let hm = match search::find_fwd(self, cache, input)? { None => return Ok(None), Some(hm) if !utf8empty => return Ok(Some(hm)), Some(hm) => hm, }; // We get to this point when we know our DFA can match the empty string // AND when UTF-8 mode is enabled. In this case, we skip any matches // whose offset splits a codepoint. Such a match is necessarily a // zero-width match, because UTF-8 mode requires the underlying NFA // to be built such that all non-empty matches span valid UTF-8. // Therefore, any match that ends in the middle of a codepoint cannot // be part of a span of valid UTF-8 and thus must be an empty match. // In such cases, we skip it, so as not to report matches that split a // codepoint. // // Note that this is not a checked assumption. Callers *can* provide an // NFA with UTF-8 mode enabled but produces non-empty matches that span // invalid UTF-8. But doing so is documented to result in unspecified // behavior. 
empty::skip_splits_fwd(input, hm, hm.offset(), |input| { let got = search::find_fwd(self, cache, input)?; Ok(got.map(|hm| (hm, hm.offset()))) }) } /// Executes a reverse search and returns the start of the position of the /// leftmost match that is found. If no match exists, then `None` is /// returned. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example /// /// This routine is principally useful when used in /// conjunction with the /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) /// configuration. In general, it's unlikely to be correct to use both /// `try_search_fwd` and `try_search_rev` with the same DFA since any /// particular DFA will only support searching in one direction with /// respect to the pattern. 
///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson,
    ///     hybrid::dfa::DFA,
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .thompson(thompson::Config::new().reverse(true))
    ///     .build("foo[0-9]+")?;
    /// let mut cache = dfa.create_cache();
    /// let expected = HalfMatch::must(0, 0);
    /// assert_eq!(
    ///     Some(expected),
    ///     dfa.try_search_rev(&mut cache, &Input::new("foo12345"))?,
    /// );
    ///
    /// // Even though a match is found after reading the last byte (`c`),
    /// // the leftmost first match semantics demand that we find the earliest
    /// // match that prefers earlier parts of the pattern over latter parts.
    /// let dfa = DFA::builder()
    ///     .thompson(thompson::Config::new().reverse(true))
    ///     .build("abc|c")?;
    /// let mut cache = dfa.create_cache();
    /// let expected = HalfMatch::must(0, 0);
    /// assert_eq!(Some(expected), dfa.try_search_rev(
    ///     &mut cache, &Input::new("abc"))?,
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: UTF-8 mode
    ///
    /// This example demonstrates that UTF-8 mode applies to reverse
    /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
    /// matches reported must correspond to valid UTF-8 spans. This includes
    /// prohibiting zero-width matches that split a codepoint.
    ///
    /// UTF-8 mode is enabled by default. Notice below how the only zero-width
    /// matches reported are those at UTF-8 boundaries:
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::dfa::DFA,
    ///     nfa::thompson,
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .thompson(thompson::Config::new().reverse(true))
    ///     .build(r"")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// // Run the reverse DFA to collect all matches.
    /// let mut input = Input::new("☃");
    /// let mut matches = vec![];
    /// loop {
    ///     match dfa.try_search_rev(&mut cache, &input)? {
    ///         None => break,
    ///         Some(hm) => {
    ///             matches.push(hm);
    ///             if hm.offset() == 0 || input.end() == 0 {
    ///                 break;
    ///             } else if hm.offset() < input.end() {
    ///                 input.set_end(hm.offset());
    ///             } else {
    ///                 // This is only necessary to handle zero-width
    ///                 // matches, which of course occur in this example.
    ///                 // Without this, the search would never advance
    ///                 // backwards beyond the initial match.
    ///                 input.set_end(input.end() - 1);
    ///             }
    ///         }
    ///     }
    /// }
    ///
    /// // No matches split a codepoint.
    /// let expected = vec![
    ///     HalfMatch::must(0, 3),
    ///     HalfMatch::must(0, 0),
    /// ];
    /// assert_eq!(expected, matches);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Now let's look at the same example, but with UTF-8 mode on the
    /// underlying NFA disabled:
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::dfa::DFA,
    ///     nfa::thompson,
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .thompson(thompson::Config::new().reverse(true).utf8(false))
    ///     .build(r"")?;
    /// let mut cache = dfa.create_cache();
    ///
    /// // Run the reverse DFA to collect all matches.
    /// let mut input = Input::new("☃");
    /// let mut matches = vec![];
    /// loop {
    ///     match dfa.try_search_rev(&mut cache, &input)? {
    ///         None => break,
    ///         Some(hm) => {
    ///             matches.push(hm);
    ///             if hm.offset() == 0 || input.end() == 0 {
    ///                 break;
    ///             } else if hm.offset() < input.end() {
    ///                 input.set_end(hm.offset());
    ///             } else {
    ///                 // This is only necessary to handle zero-width
    ///                 // matches, which of course occur in this example.
    ///                 // Without this, the search would never advance
    ///                 // backwards beyond the initial match.
    ///                 input.set_end(input.end() - 1);
    ///             }
    ///         }
    ///     }
    /// }
    ///
    /// // Now *all* positions match, even within a codepoint,
    /// // because we lifted the requirement that matches
    /// // correspond to valid UTF-8 spans.
/// let expected = vec![
    ///     HalfMatch::must(0, 3),
    ///     HalfMatch::must(0, 2),
    ///     HalfMatch::must(0, 1),
    ///     HalfMatch::must(0, 0),
    /// ];
    /// assert_eq!(expected, matches);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn try_search_rev(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Result<Option<HalfMatch>, MatchError> {
        // Only when the NFA can match the empty string *and* UTF-8 mode is
        // enabled do zero-width matches need to be checked for splitting a
        // codepoint.
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
        let hm = match search::find_rev(self, cache, input)? {
            None => return Ok(None),
            Some(hm) if !utf8empty => return Ok(Some(hm)),
            Some(hm) => hm,
        };
        // Re-run the reverse search as needed until the reported offset no
        // longer splits a codepoint.
        empty::skip_splits_rev(input, hm, hm.offset(), |input| {
            let got = search::find_rev(self, cache, input)?;
            Ok(got.map(|hm| (hm, hm.offset())))
        })
    }

    /// Executes an overlapping forward search and records the end position of
    /// matches in `state` as they are found. If no match exists, then
    /// `state.get_match()` returns `None` after the search.
    ///
    /// This routine is principally only useful when searching for multiple
    /// patterns on inputs where multiple patterns may match the same regions
    /// of text. In particular, callers must preserve the automaton's search
    /// state from prior calls so that the implementation knows where the last
    /// match occurred.
    ///
    /// When using this routine to implement an iterator of overlapping
    /// matches, the `start` of the search should remain invariant throughout
    /// iteration. The `OverlappingState` given to the search will keep track
    /// of the current position of the search. (This is because multiple
    /// matches may be reported at the same position, so only the search
    /// implementation itself knows when to advance the position.)
    ///
    /// If for some reason you want the search to forget about its previous
    /// state and restart the search at a particular position, then setting the
    /// state to [`OverlappingState::start`] will accomplish that.
    ///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in a number of circumstances:
    ///
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
///   For example, setting quit bytes or enabling heuristic support for
    ///   Unicode word boundaries. The default configuration does not enable any
    ///   option that could result in the lazy DFA quitting.
    /// * The configuration of the lazy DFA may also permit it to "give up"
    ///   on a search if it makes ineffective use of its transition table
    ///   cache. The default configuration does not enable this, although it
    ///   is typically a good idea to.
    /// * When the provided `Input` configuration is not supported. For
    ///   example, by providing an unsupported anchor mode.
    ///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    ///
    /// # Example
    ///
    /// This example shows how to run a basic overlapping search. Notice
    /// that we build the automaton with a `MatchKind::All` configuration.
    /// Overlapping searches are unlikely to work as one would expect when
    /// using the default `MatchKind::LeftmostFirst` match semantics, since
    /// leftmost-first matching is fundamentally incompatible with overlapping
    /// searches. Namely, overlapping searches need to report matches as they
    /// are seen, whereas leftmost-first searches will continue searching even
    /// after a match has been observed in order to find the conventional end
    /// position of the match. More concretely, leftmost-first searches use
    /// dead states to terminate a search after a specific match can no longer
    /// be extended. Overlapping searches instead do the opposite by continuing
    /// the search to find totally new matches (potentially of other patterns).
///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     hybrid::dfa::{DFA, OverlappingState},
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .build_many(&[r"\w+$", r"\S+$"])?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let haystack = "@foo";
    /// let mut state = OverlappingState::start();
    ///
    /// let expected = Some(HalfMatch::must(1, 4));
    /// dfa.try_search_overlapping_fwd(
    ///     &mut cache, &Input::new(haystack), &mut state,
    /// )?;
    /// assert_eq!(expected, state.get_match());
    ///
    /// // The first pattern also matches at the same position, so re-running
    /// // the search will yield another match. Notice also that the first
    /// // pattern is returned after the second. This is because the second
    /// // pattern begins its match before the first, is therefore an earlier
    /// // match and is thus reported first.
    /// let expected = Some(HalfMatch::must(0, 4));
    /// dfa.try_search_overlapping_fwd(
    ///     &mut cache, &Input::new(haystack), &mut state,
    /// )?;
    /// assert_eq!(expected, state.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn try_search_overlapping_fwd(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        state: &mut OverlappingState,
    ) -> Result<(), MatchError> {
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
        search::find_overlapping_fwd(self, cache, input, state)?;
        // The match (if any) is reported through `state`, not the return
        // value. The extra skipping step only runs when a match was found,
        // the NFA can match the empty string and UTF-8 mode is enabled.
        match state.get_match() {
            None => Ok(()),
            Some(_) if !utf8empty => Ok(()),
            Some(_) => skip_empty_utf8_splits_overlapping(
                input,
                state,
                |input, state| {
                    search::find_overlapping_fwd(self, cache, input, state)
                },
            ),
        }
    }

    /// Executes a reverse overlapping search and records the start position
    /// of each match in `state` as it is found. If no match exists, then
    /// `state.get_match()` returns `None` after the search.
///
    /// When using this routine to implement an iterator of overlapping
    /// matches, the `start` of the search should remain invariant throughout
    /// iteration. The `OverlappingState` given to the search will keep track
    /// of the current position of the search. (This is because multiple
    /// matches may be reported at the same position, so only the search
    /// implementation itself knows when to advance the position.)
    ///
    /// If for some reason you want the search to forget about its previous
    /// state and restart the search at a particular position, then setting the
    /// state to [`OverlappingState::start`] will accomplish that.
    ///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in a number of circumstances:
    ///
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
    ///   For example, setting quit bytes or enabling heuristic support for
    ///   Unicode word boundaries. The default configuration does not enable any
    ///   option that could result in the lazy DFA quitting.
    /// * The configuration of the lazy DFA may also permit it to "give up"
    ///   on a search if it makes ineffective use of its transition table
    ///   cache. The default configuration does not enable this, although it
    ///   is typically a good idea to.
    /// * When the provided `Input` configuration is not supported. For
    ///   example, by providing an unsupported anchor mode.
    ///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    ///
    /// # Example: UTF-8 mode
    ///
    /// This example demonstrates that UTF-8 mode applies to reverse
    /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all
    /// matches reported must correspond to valid UTF-8 spans. This includes
    /// prohibiting zero-width matches that split a codepoint.
    ///
    /// UTF-8 mode is enabled by default.
/// Notice below how the only zero-width
    /// matches reported are those at UTF-8 boundaries:
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::dfa::{DFA, OverlappingState},
    ///     nfa::thompson,
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .thompson(thompson::Config::new().reverse(true))
    ///     .build_many(&[r"", r"☃"])?;
    /// let mut cache = dfa.create_cache();
    ///
    /// // Run the reverse DFA to collect all matches.
    /// let input = Input::new("☃");
    /// let mut state = OverlappingState::start();
    /// let mut matches = vec![];
    /// loop {
    ///     dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
    ///     match state.get_match() {
    ///         None => break,
    ///         Some(hm) => matches.push(hm),
    ///     }
    /// }
    ///
    /// // No matches split a codepoint.
    /// let expected = vec![
    ///     HalfMatch::must(0, 3),
    ///     HalfMatch::must(1, 0),
    ///     HalfMatch::must(0, 0),
    /// ];
    /// assert_eq!(expected, matches);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Now let's look at the same example, but with UTF-8 mode on the
    /// underlying NFA disabled:
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::dfa::{DFA, OverlappingState},
    ///     nfa::thompson,
    ///     HalfMatch, Input, MatchKind,
    /// };
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .thompson(thompson::Config::new().reverse(true).utf8(false))
    ///     .build_many(&[r"", r"☃"])?;
    /// let mut cache = dfa.create_cache();
    ///
    /// // Run the reverse DFA to collect all matches.
    /// let input = Input::new("☃");
    /// let mut state = OverlappingState::start();
    /// let mut matches = vec![];
    /// loop {
    ///     dfa.try_search_overlapping_rev(&mut cache, &input, &mut state)?;
    ///     match state.get_match() {
    ///         None => break,
    ///         Some(hm) => matches.push(hm),
    ///     }
    /// }
    ///
    /// // Now *all* positions match, even within a codepoint,
    /// // because we lifted the requirement that matches
    /// // correspond to valid UTF-8 spans.
/// let expected = vec![
    ///     HalfMatch::must(0, 3),
    ///     HalfMatch::must(0, 2),
    ///     HalfMatch::must(0, 1),
    ///     HalfMatch::must(1, 0),
    ///     HalfMatch::must(0, 0),
    /// ];
    /// assert_eq!(expected, matches);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn try_search_overlapping_rev(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        state: &mut OverlappingState,
    ) -> Result<(), MatchError> {
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
        search::find_overlapping_rev(self, cache, input, state)?;
        // The match (if any) is reported through `state`, not the return
        // value. The extra skipping step only runs when a match was found,
        // the NFA can match the empty string and UTF-8 mode is enabled.
        match state.get_match() {
            None => Ok(()),
            Some(_) if !utf8empty => Ok(()),
            Some(_) => skip_empty_utf8_splits_overlapping(
                input,
                state,
                |input, state| {
                    search::find_overlapping_rev(self, cache, input, state)
                },
            ),
        }
    }

    /// Writes the set of patterns that match anywhere in the given search
    /// configuration to `patset`. If multiple patterns match at the same
    /// position and the underlying DFA supports overlapping matches, then all
    /// matching patterns are written to the given set.
    ///
    /// Unless all of the patterns in this DFA are anchored, then generally
    /// speaking, this will visit every byte in the haystack.
    ///
    /// This search routine *does not* clear the pattern set. This gives some
    /// flexibility to the caller (e.g., running multiple searches with the
    /// same pattern set), but does make the API bug-prone if you're reusing
    /// the same pattern set for multiple searches but intended them to be
    /// independent.
    ///
    /// If a pattern ID matched but the given `PatternSet` does not have
    /// sufficient capacity to store it, then it is not inserted and silently
    /// dropped.
    ///
    /// # Errors
    ///
    /// This routine errors if the search could not complete. This can occur
    /// in a number of circumstances:
    ///
    /// * The configuration of the lazy DFA may permit it to "quit" the search.
    ///   For example, setting quit bytes or enabling heuristic support for
    ///   Unicode word boundaries.
///   The default configuration does not enable any
    ///   option that could result in the lazy DFA quitting.
    /// * The configuration of the lazy DFA may also permit it to "give up"
    ///   on a search if it makes ineffective use of its transition table
    ///   cache. The default configuration does not enable this, although it
    ///   is typically a good idea to.
    /// * When the provided `Input` configuration is not supported. For
    ///   example, by providing an unsupported anchor mode.
    ///
    /// When a search returns an error, callers cannot know whether a match
    /// exists or not.
    ///
    /// # Example
    ///
    /// This example shows how to find all matching patterns in a haystack,
    /// even when some patterns match at the same position as other patterns.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     hybrid::dfa::DFA,
    ///     Input, MatchKind, PatternSet,
    /// };
    ///
    /// let patterns = &[
    ///     r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
    /// ];
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .build_many(patterns)?;
    /// let mut cache = dfa.create_cache();
    ///
    /// let input = Input::new("foobar");
    /// let mut patset = PatternSet::new(dfa.pattern_len());
    /// dfa.try_which_overlapping_matches(&mut cache, &input, &mut patset)?;
    /// let expected = vec![0, 2, 3, 4, 6];
    /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn try_which_overlapping_matches(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) -> Result<(), MatchError> {
        let mut state = OverlappingState::start();
        // Each iteration advances the overlapping search by one match and
        // stops as soon as the search reports no further match.
        while let Some(m) = {
            self.try_search_overlapping_fwd(cache, input, &mut state)?;
            state.get_match()
        } {
            // A failed insertion is deliberately ignored: a pattern that does
            // not fit in `patset` is silently dropped, as documented.
            let _ = patset.try_insert(m.pattern());
            // There's nothing left to find, so we can stop. Or the caller
            // asked us to.
            if patset.is_full() || input.get_earliest() {
                break;
            }
        }
        Ok(())
    }
}

impl DFA {
    /// Transitions from the current state to the next state, given the next
    /// byte of input.
    ///
    /// The given cache is used to either reuse pre-computed state
    /// transitions, or to store this newly computed transition for future
    /// reuse. Thus, this routine guarantees that it will never return a state
    /// ID that has an "unknown" tag.
    ///
    /// # State identifier validity
    ///
    /// The only valid value for `current` is the lazy state ID returned
    /// by the most recent call to `next_state`, `next_state_untagged`,
    /// `next_state_untagged_unchecked`, `start_state_forward` or
    /// `start_state_reverse` for the given `cache`. Any state ID returned from
    /// prior calls to these routines (with the same `cache`) is considered
    /// invalid (even if it gives an appearance of working). State IDs returned
    /// from _any_ prior call for different `cache` values are also always
    /// invalid.
    ///
    /// The returned ID is always a valid ID when `current` refers to a valid
    /// ID. Moreover, this routine is defined for all possible values of
    /// `input`.
    ///
    /// These validity rules are not checked, even in debug mode. Callers are
    /// required to uphold these rules themselves.
    ///
    /// Violating these state ID validity rules will not sacrifice memory
    /// safety, but _may_ produce an incorrect result or a panic.
    ///
    /// # Panics
    ///
    /// If the given ID does not refer to a valid state, then this routine
    /// may panic but it also may not panic and instead return an invalid or
    /// incorrect ID.
    ///
    /// # Example
    ///
    /// This shows a simplistic example for walking a lazy DFA for a given
    /// haystack by using the `next_state` method.
/// /// ``` /// use regex_automata::{hybrid::dfa::DFA, Input}; /// /// let dfa = DFA::new(r"[a-z]+r")?; /// let mut cache = dfa.create_cache(); /// let haystack = "bar".as_bytes(); /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// let mut sid = dfa.start_state_forward( /// &mut cache, &Input::new(haystack), /// )?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// sid = dfa.next_state(&mut cache, sid, b)?; /// } /// // Matches are always delayed by 1 byte, so we must explicitly walk the /// // special "EOI" transition at the end of the search. /// sid = dfa.next_eoi_state(&mut cache, sid)?; /// assert!(sid.is_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn next_state( &self, cache: &mut Cache, current: LazyStateID, input: u8, ) -> Result { let class = usize::from(self.classes.get(input)); let offset = current.as_usize_untagged() + class; let sid = cache.trans[offset]; if !sid.is_unknown() { return Ok(sid); } let unit = alphabet::Unit::u8(input); Lazy::new(self, cache).cache_next_state(current, unit) } /// Transitions from the current state to the next state, given the next /// byte of input and a state ID that is not tagged. /// /// The only reason to use this routine is performance. In particular, the /// `next_state` method needs to do some additional checks, among them is /// to account for identifiers to states that are not yet computed. In /// such a case, the transition is computed on the fly. However, if it is /// known that the `current` state ID is untagged, then these checks can be /// omitted. /// /// Since this routine does not compute states on the fly, it does not /// modify the cache and thus cannot return an error. Consequently, `cache` /// does not need to be mutable and it is possible for this routine to /// return a state ID corresponding to the special "unknown" state. 
/// In this case, it is the caller's responsibility to use the prior state
    /// ID and `input` with `next_state` in order to force the computation of
    /// the unknown transition. Otherwise, trying to use the "unknown" state
    /// ID will just result in transitioning back to itself, and thus never
    /// terminating. (This is technically a special exemption to the state ID
    /// validity rules, but is permissible since this routine is guaranteed to
    /// never mutate the given `cache`, and thus the identifier is guaranteed
    /// to remain valid.)
    ///
    /// See [`LazyStateID`] for more details on what it means for a state ID
    /// to be tagged. Also, see
    /// [`next_state_untagged_unchecked`](DFA::next_state_untagged_unchecked)
    /// for this same idea, but with bounds checks forcefully elided.
    ///
    /// # State identifier validity
    ///
    /// The only valid value for `current` is an **untagged** lazy
    /// state ID returned by the most recent call to `next_state`,
    /// `next_state_untagged`, `next_state_untagged_unchecked`,
    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
    /// Any state ID returned from prior calls to these routines (with the
    /// same `cache`) is considered invalid (even if it gives an appearance
    /// of working). State IDs returned from _any_ prior call for different
    /// `cache` values are also always invalid.
    ///
    /// The returned ID is always a valid ID when `current` refers to a valid
    /// ID, although it may be tagged. Moreover, this routine is defined for
    /// all possible values of `input`.
    ///
    /// Not all validity rules are checked, even in debug mode. Callers are
    /// required to uphold these rules themselves.
    ///
    /// Violating these state ID validity rules will not sacrifice memory
    /// safety, but _may_ produce an incorrect result or a panic.
    ///
    /// # Panics
    ///
    /// If the given ID does not refer to a valid state, then this routine
    /// may panic but it also may not panic and instead return an invalid or
    /// incorrect ID.
/// /// # Example /// /// This shows a simplistic example for walking a lazy DFA for a given /// haystack by using the `next_state_untagged` method where possible. /// /// ``` /// use regex_automata::{hybrid::dfa::DFA, Input}; /// /// let dfa = DFA::new(r"[a-z]+r")?; /// let mut cache = dfa.create_cache(); /// let haystack = "bar".as_bytes(); /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// let mut sid = dfa.start_state_forward( /// &mut cache, &Input::new(haystack), /// )?; /// // Walk all the bytes in the haystack. /// let mut at = 0; /// while at < haystack.len() { /// if sid.is_tagged() { /// sid = dfa.next_state(&mut cache, sid, haystack[at])?; /// } else { /// let mut prev_sid = sid; /// // We attempt to chew through as much as we can while moving /// // through untagged state IDs. Thus, the transition function /// // does less work on average per byte. (Unrolling this loop /// // may help even more.) /// while at < haystack.len() { /// prev_sid = sid; /// sid = dfa.next_state_untagged( /// &mut cache, sid, haystack[at], /// ); /// at += 1; /// if sid.is_tagged() { /// break; /// } /// } /// // We must ensure that we never proceed to the next iteration /// // with an unknown state ID. If we don't account for this /// // case, then search isn't guaranteed to terminate since all /// // transitions on unknown states loop back to itself. /// if sid.is_unknown() { /// sid = dfa.next_state( /// &mut cache, prev_sid, haystack[at - 1], /// )?; /// } /// } /// } /// // Matches are always delayed by 1 byte, so we must explicitly walk the /// // special "EOI" transition at the end of the search. 
/// sid = dfa.next_eoi_state(&mut cache, sid)?; /// assert!(sid.is_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn next_state_untagged( &self, cache: &Cache, current: LazyStateID, input: u8, ) -> LazyStateID { debug_assert!(!current.is_tagged()); let class = usize::from(self.classes.get(input)); let offset = current.as_usize_unchecked() + class; cache.trans[offset] } /// Transitions from the current state to the next state, eliding bounds /// checks, given the next byte of input and a state ID that is not tagged. /// /// The only reason to use this routine is performance. In particular, the /// `next_state` method needs to do some additional checks, among them is /// to account for identifiers to states that are not yet computed. In /// such a case, the transition is computed on the fly. However, if it is /// known that the `current` state ID is untagged, then these checks can be /// omitted. /// /// Since this routine does not compute states on the fly, it does not /// modify the cache and thus cannot return an error. Consequently, `cache` /// does not need to be mutable and it is possible for this routine to /// return a state ID corresponding to the special "unknown" state. In /// this case, it is the caller's responsibility to use the prior state /// ID and `input` with `next_state` in order to force the computation of /// the unknown transition. Otherwise, trying to use the "unknown" state /// ID will just result in transitioning back to itself, and thus never /// terminating. (This is technically a special exemption to the state ID /// validity rules, but is permissible since this routine is guarateed to /// never mutate the given `cache`, and thus the identifier is guaranteed /// to remain valid.) /// /// See [`LazyStateID`] for more details on what it means for a state ID /// to be tagged. Also, see /// [`next_state_untagged`](DFA::next_state_untagged) /// for this same idea, but with memory safety guaranteed by retaining /// bounds checks. 
///
    /// # State identifier validity
    ///
    /// The only valid value for `current` is an **untagged** lazy
    /// state ID returned by the most recent call to `next_state`,
    /// `next_state_untagged`, `next_state_untagged_unchecked`,
    /// `start_state_forward` or `start_state_reverse` for the given `cache`.
    /// Any state ID returned from prior calls to these routines (with the
    /// same `cache`) is considered invalid (even if it gives an appearance
    /// of working). State IDs returned from _any_ prior call for different
    /// `cache` values are also always invalid.
    ///
    /// The returned ID is always a valid ID when `current` refers to a valid
    /// ID, although it may be tagged. Moreover, this routine is defined for
    /// all possible values of `input`.
    ///
    /// Not all validity rules are checked, even in debug mode. Callers are
    /// required to uphold these rules themselves.
    ///
    /// Unlike `next_state_untagged`, this routine elides bounds checks.
    /// Violating these state ID validity rules may therefore sacrifice memory
    /// safety, in addition to producing an incorrect result. See the `Safety`
    /// section below.
    ///
    /// # Safety
    ///
    /// Callers of this method must guarantee that `current` refers to a valid
    /// state ID according to the rules described above. If `current` is not a
    /// valid state ID for this automaton, then calling this routine may result
    /// in undefined behavior.
    ///
    /// If `current` is valid, then the ID returned is valid for all possible
    /// values of `input`.
    #[inline]
    pub unsafe fn next_state_untagged_unchecked(
        &self,
        cache: &Cache,
        current: LazyStateID,
        input: u8,
    ) -> LazyStateID {
        // Only checked in debug builds; release builds trust the caller.
        debug_assert!(!current.is_tagged());
        let class = usize::from(self.classes.get(input));
        let offset = current.as_usize_unchecked() + class;
        // SAFETY: No bounds check is performed on `offset`. This relies
        // entirely on the caller upholding the contract documented in the
        // `Safety` section above.
        *cache.trans.get_unchecked(offset)
    }

    /// Transitions from the current state to the next state for the special
    /// EOI symbol.
    ///
    /// The given cache is used to either reuse pre-computed state
    /// transitions, or to store this newly computed transition for future
    /// reuse.
Thus, this routine guarantees that it will never return a state /// ID that has an "unknown" tag. /// /// This routine must be called at the end of every search in a correct /// implementation of search. Namely, lazy DFAs in this crate delay matches /// by one byte in order to support look-around operators. Thus, after /// reaching the end of a haystack, a search implementation must follow one /// last EOI transition. /// /// It is best to think of EOI as an additional symbol in the alphabet of a /// DFA that is distinct from every other symbol. That is, the alphabet of /// lazy DFAs in this crate has a logical size of 257 instead of 256, where /// 256 corresponds to every possible inhabitant of `u8`. (In practice, the /// physical alphabet size may be smaller because of alphabet compression /// via equivalence classes, but EOI is always represented somehow in the /// alphabet.) /// /// # State identifier validity /// /// The only valid value for `current` is the lazy state ID returned /// by the most recent call to `next_state`, `next_state_untagged`, /// `next_state_untagged_unchecked`, `start_state_forward` or /// `state_state_reverse` for the given `cache`. Any state ID returned from /// prior calls to these routines (with the same `cache`) is considered /// invalid (even if it gives an appearance of working). State IDs returned /// from _any_ prior call for different `cache` values are also always /// invalid. /// /// The returned ID is always a valid ID when `current` refers to a valid /// ID. /// /// These validity rules are not checked, even in debug mode. Callers are /// required to uphold these rules themselves. /// /// Violating these state ID validity rules will not sacrifice memory /// safety, but _may_ produce an incorrect result or a panic. /// /// # Panics /// /// If the given ID does not refer to a valid state, then this routine /// may panic but it also may not panic and instead return an invalid or /// incorrect ID. 
/// /// # Example /// /// This shows a simplistic example for walking a DFA for a given haystack, /// and then finishing the search with the final EOI transition. /// /// ``` /// use regex_automata::{hybrid::dfa::DFA, Input}; /// /// let dfa = DFA::new(r"[a-z]+r")?; /// let mut cache = dfa.create_cache(); /// let haystack = "bar".as_bytes(); /// /// // The start state is determined by inspecting the position and the /// // initial bytes of the haystack. /// let mut sid = dfa.start_state_forward( /// &mut cache, &Input::new(haystack), /// )?; /// // Walk all the bytes in the haystack. /// for &b in haystack { /// sid = dfa.next_state(&mut cache, sid, b)?; /// } /// // Matches are always delayed by 1 byte, so we must explicitly walk /// // the special "EOI" transition at the end of the search. Without this /// // final transition, the assert below will fail since the DFA will not /// // have entered a match state yet! /// sid = dfa.next_eoi_state(&mut cache, sid)?; /// assert!(sid.is_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn next_eoi_state( &self, cache: &mut Cache, current: LazyStateID, ) -> Result { let eoi = self.classes.eoi().as_usize(); let offset = current.as_usize_untagged() + eoi; let sid = cache.trans[offset]; if !sid.is_unknown() { return Ok(sid); } let unit = self.classes.eoi(); Lazy::new(self, cache).cache_next_state(current, unit) } /// Return the ID of the start state for this lazy DFA for the given /// starting configuration. /// /// Unlike typical DFA implementations, the start state for DFAs in this /// crate is dependent on a few different factors: /// /// * The [`Anchored`] mode of the search. Unanchored, anchored and /// anchored searches for a specific [`PatternID`] all use different start /// states. /// * Whether a "look-behind" byte exists. For example, the `^` anchor /// matches if and only if there is no look-behind byte. /// * The specific value of that look-behind byte. 
For example, a `(?m:^)` /// assertion only matches when there is either no look-behind byte, or /// when the look-behind byte is a line terminator. /// /// The [starting configuration](start::Config) provides the above /// information. /// /// This routine can be used for either forward or reverse searches. /// Although, as a convenience, if you have an [`Input`], then it /// may be more succinct to use [`DFA::start_state_forward`] or /// [`DFA::start_state_reverse`]. Note, for example, that the convenience /// routines return a [`MatchError`] on failure where as this routine /// returns a [`StartError`]. /// /// # Errors /// /// This may return a [`StartError`] if the search needs to give up when /// determining the start state (for example, if it sees a "quit" byte /// or if the cache has become inefficient). This can also return an /// error if the given configuration contains an unsupported [`Anchored`] /// configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state( &self, cache: &mut Cache, config: &start::Config, ) -> Result { let lazy = LazyRef::new(self, cache); let anchored = config.get_anchored(); let start = match config.get_look_behind() { None => Start::Text, Some(byte) => { if !self.quitset.is_empty() && self.quitset.contains(byte) { return Err(StartError::quit(byte)); } self.start_map.get(byte) } }; let start_id = lazy.get_cached_start_id(anchored, start)?; if !start_id.is_unknown() { return Ok(start_id); } Lazy::new(self, cache).cache_start_group(anchored, start) } /// Return the ID of the start state for this lazy DFA when executing a /// forward search. /// /// This is a convenience routine for calling [`DFA::start_state`] that /// converts the given [`Input`] to a [start configuration](start::Config). /// Additionally, if an error occurs, it is converted from a [`StartError`] /// to a [`MatchError`] using the offset information in the given /// [`Input`]. 
/// /// # Errors /// /// This may return a [`MatchError`] if the search needs to give up when /// determining the start state (for example, if it sees a "quit" byte or /// if the cache has become inefficient). This can also return an error if /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_forward( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { let config = start::Config::from_input_forward(input); self.start_state(cache, &config).map_err(|err| match err { StartError::Cache { .. } => MatchError::gave_up(input.start()), StartError::Quit { byte } => { let offset = input .start() .checked_sub(1) .expect("no quit in start without look-behind"); MatchError::quit(byte, offset) } StartError::UnsupportedAnchored { mode } => { MatchError::unsupported_anchored(mode) } }) } /// Return the ID of the start state for this lazy DFA when executing a /// reverse search. /// /// This is a convenience routine for calling [`DFA::start_state`] that /// converts the given [`Input`] to a [start configuration](start::Config). /// Additionally, if an error occurs, it is converted from a [`StartError`] /// to a [`MatchError`] using the offset information in the given /// [`Input`]. /// /// # Errors /// /// This may return a [`MatchError`] if the search needs to give up when /// determining the start state (for example, if it sees a "quit" byte or /// if the cache has become inefficient). This can also return an error if /// the given `Input` contains an unsupported [`Anchored`] configuration. #[cfg_attr(feature = "perf-inline", inline(always))] pub fn start_state_reverse( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result { let config = start::Config::from_input_reverse(input); self.start_state(cache, &config).map_err(|err| match err { StartError::Cache { .. 
} => MatchError::gave_up(input.end()), StartError::Quit { byte } => { let offset = input.end(); MatchError::quit(byte, offset) } StartError::UnsupportedAnchored { mode } => { MatchError::unsupported_anchored(mode) } }) } /// Returns the total number of patterns that match in this state. /// /// If the lazy DFA was compiled with one pattern, then this must /// necessarily always return `1` for all match states. /// /// A lazy DFA guarantees that [`DFA::match_pattern`] can be called with /// indices up to (but not including) the length returned by this routine /// without panicking. /// /// # Panics /// /// If the given state is not a match state, then this may either panic /// or return an incorrect result. /// /// # Example /// /// This example shows a simple instance of implementing overlapping /// matches. In particular, it shows not only how to determine how many /// patterns have matched in a particular state, but also how to access /// which specific patterns have matched. /// /// Notice that we must use [`MatchKind::All`] when building the DFA. If we /// used [`MatchKind::LeftmostFirst`] instead, then the DFA would not be /// constructed in a way that supports overlapping matches. (It would only /// report a single pattern that matches at any particular point in time.) /// /// Another thing to take note of is the patterns used and the order in /// which the pattern IDs are reported. In the example below, pattern `3` /// is yielded first. Why? Because it corresponds to the match that /// appears first. Namely, the `@` symbol is part of `\S+` but not part /// of any of the other patterns. Since the `\S+` pattern has a match that /// starts to the left of any other pattern, its ID is returned before any /// other. 
///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::dfa::DFA, Input, MatchKind};
    ///
    /// let dfa = DFA::builder()
    ///     .configure(DFA::config().match_kind(MatchKind::All))
    ///     .build_many(&[
    ///         r"\w+", r"[a-z]+", r"[A-Z]+", r"\S+",
    ///     ])?;
    /// let mut cache = dfa.create_cache();
    /// let haystack = "@bar".as_bytes();
    ///
    /// // The start state is determined by inspecting the position and the
    /// // initial bytes of the haystack.
    /// let mut sid = dfa.start_state_forward(
    ///     &mut cache, &Input::new(haystack),
    /// )?;
    /// // Walk all the bytes in the haystack.
    /// for &b in haystack {
    ///     sid = dfa.next_state(&mut cache, sid, b)?;
    /// }
    /// sid = dfa.next_eoi_state(&mut cache, sid)?;
    ///
    /// assert!(sid.is_match());
    /// assert_eq!(dfa.match_len(&mut cache, sid), 3);
    /// // The following calls are guaranteed to not panic since `match_len`
    /// // returned `3` above.
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 0).as_usize(), 3);
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 1).as_usize(), 0);
    /// assert_eq!(dfa.match_pattern(&mut cache, sid, 2).as_usize(), 1);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn match_len(&self, cache: &Cache, id: LazyStateID) -> usize {
        // Checked in all build modes: a non-match ID panics here.
        assert!(id.is_match());
        LazyRef::new(self, cache).get_cached_state(id).match_len()
    }

    /// Returns the pattern ID corresponding to the given match index in the
    /// given state.
    ///
    /// See [`DFA::match_len`] for an example of how to use this method
    /// correctly. Note that if you know your lazy DFA is configured with a
    /// single pattern, then this routine is never necessary since it will
    /// always return a pattern ID of `0` for an index of `0` when `id`
    /// corresponds to a match state.
    ///
    /// Typically, this routine is used when implementing an overlapping
    /// search, as the example for `DFA::match_len` does.
/// /// # Panics /// /// If the state ID is not a match state or if the match index is out /// of bounds for the given state, then this routine may either panic /// or produce an incorrect result. If the state ID is correct and the /// match index is correct, then this routine always produces a valid /// `PatternID`. #[inline] pub fn match_pattern( &self, cache: &Cache, id: LazyStateID, match_index: usize, ) -> PatternID { // This is an optimization for the very common case of a DFA with a // single pattern. This conditional avoids a somewhat more costly path // that finds the pattern ID from the corresponding `State`, which // requires a bit of slicing/pointer-chasing. This optimization tends // to only matter when matches are frequent. if self.pattern_len() == 1 { return PatternID::ZERO; } LazyRef::new(self, cache) .get_cached_state(id) .match_pattern(match_index) } } /// A cache represents a partially computed DFA. /// /// A cache is the key component that differentiates a classical DFA and a /// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a /// complete transition table that can handle all possible inputs, a hybrid /// NFA/DFA starts with an empty transition table and builds only the parts /// required during search. The parts that are built are stored in a cache. For /// this reason, a cache is a required parameter for nearly every operation on /// a [`DFA`]. /// /// Caches can be created from their corresponding DFA via /// [`DFA::create_cache`]. A cache can only be used with either the DFA that /// created it, or the DFA that was most recently used to reset it with /// [`Cache::reset`]. Using a cache with any other DFA may result in panics /// or incorrect results. #[derive(Clone, Debug)] pub struct Cache { // N.B. If you're looking to understand how determinization works, it // is probably simpler to first grok src/dfa/determinize.rs, since that // doesn't have the "laziness" component. /// The transition table. 
/// /// Given a `current` LazyStateID and an `input` byte, the next state can /// be computed via `trans[untagged(current) + equiv_class(input)]`. Notice /// that no multiplication is used. That's because state identifiers are /// "premultiplied." /// /// Note that the next state may be the "unknown" state. In this case, the /// next state is not known and determinization for `current` on `input` /// must be performed. trans: Vec, /// The starting states for this DFA. /// /// These are computed lazily. Initially, these are all set to "unknown" /// lazy state IDs. /// /// When 'starts_for_each_pattern' is disabled (the default), then the size /// of this is constrained to the possible starting configurations based /// on the search parameters. (At time of writing, that's 4.) However, /// when starting states for each pattern is enabled, then there are N /// additional groups of starting states, where each group reflects the /// different possible configurations and N is the number of patterns. starts: Vec, /// A sequence of NFA/DFA powerset states that have been computed for this /// lazy DFA. This sequence is indexable by untagged LazyStateIDs. (Every /// tagged LazyStateID can be used to index this sequence by converting it /// to its untagged form.) states: Vec, /// A map from states to their corresponding IDs. This map may be accessed /// via the raw byte representation of a state, which means that a `State` /// does not need to be allocated to determine whether it already exists /// in this map. Indeed, the existence of such a state is what determines /// whether we allocate a new `State` or not. /// /// The higher level idea here is that we do just enough determinization /// for a state to check whether we've already computed it. If we have, /// then we can save a little (albeit not much) work. The real savings is /// in memory usage. If we never checked for trivially duplicate states, /// then our memory usage would explode to unreasonable levels. 
states_to_id: StateMap, /// Sparse sets used to track which NFA states have been visited during /// various traversals. sparses: SparseSets, /// Scratch space for traversing the NFA graph. (We use space on the heap /// instead of the call stack.) stack: Vec, /// Scratch space for building a NFA/DFA powerset state. This is used to /// help amortize allocation since not every powerset state generated is /// added to the cache. In particular, if it already exists in the cache, /// then there is no need to allocate a new `State` for it. scratch_state_builder: StateBuilderEmpty, /// A simple abstraction for handling the saving of at most a single state /// across a cache clearing. This is required for correctness. Namely, if /// adding a new state after clearing the cache fails, then the caller /// must retain the ability to continue using the state ID given. The /// state corresponding to the state ID is what we preserve across cache /// clearings. state_saver: StateSaver, /// The memory usage, in bytes, used by 'states' and 'states_to_id'. We /// track this as new states are added since states use a variable amount /// of heap. Tracking this as we add states makes it possible to compute /// the total amount of memory used by the determinizer in constant time. memory_usage_state: usize, /// The number of times the cache has been cleared. When a minimum cache /// clear count is set, then the cache will return an error instead of /// clearing the cache if the count has been exceeded. clear_count: usize, /// The total number of bytes searched since the last time this cache was /// cleared, not including the current search. /// /// This can be added to the length of the current search to get the true /// total number of bytes searched. /// /// This is generally only non-zero when the /// `Cache::search_{start,update,finish}` APIs are used to track search /// progress. bytes_searched: usize, /// The progress of the current search. 
/// /// This is only non-`None` when callers utlize the `Cache::search_start`, /// `Cache::search_update` and `Cache::search_finish` APIs. /// /// The purpose of recording search progress is to be able to make a /// determination about the efficiency of the cache. Namely, by keeping /// track of the progress: Option, } impl Cache { /// Create a new cache for the given lazy DFA. /// /// The cache returned should only be used for searches for the given DFA. /// If you want to reuse the cache for another DFA, then you must call /// [`Cache::reset`] with that DFA. pub fn new(dfa: &DFA) -> Cache { let mut cache = Cache { trans: alloc::vec![], starts: alloc::vec![], states: alloc::vec![], states_to_id: StateMap::new(), sparses: SparseSets::new(dfa.get_nfa().states().len()), stack: alloc::vec![], scratch_state_builder: StateBuilderEmpty::new(), state_saver: StateSaver::none(), memory_usage_state: 0, clear_count: 0, bytes_searched: 0, progress: None, }; debug!("pre-init lazy DFA cache size: {}", cache.memory_usage()); Lazy { dfa, cache: &mut cache }.init_cache(); debug!("post-init lazy DFA cache size: {}", cache.memory_usage()); cache } /// Reset this cache such that it can be used for searching with the given /// lazy DFA (and only that DFA). /// /// A cache reset permits reusing memory already allocated in this cache /// with a different lazy DFA. /// /// Resetting a cache sets its "clear count" to 0. This is relevant if the /// lazy DFA has been configured to "give up" after it has cleared the /// cache a certain number of times. /// /// Any lazy state ID generated by the cache prior to resetting it is /// invalid after the reset. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different DFA. 
///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    ///
    /// let dfa1 = DFA::new(r"\w")?;
    /// let dfa2 = DFA::new(r"\W")?;
    ///
    /// let mut cache = dfa1.create_cache();
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 2)),
    ///     dfa1.try_search_fwd(&mut cache, &Input::new("Δ"))?,
    /// );
    ///
    /// // Using 'cache' with dfa2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the DFA we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 'dfa1' is also not
    /// // allowed.
    /// cache.reset(&dfa2);
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 3)),
    ///     dfa2.try_search_fwd(&mut cache, &Input::new("☃"))?,
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset(&mut self, dfa: &DFA) {
        Lazy::new(dfa, self).reset_cache()
    }

    /// Initializes a new search starting at the given position.
    ///
    /// If a previous search was unfinished, then it is finished automatically
    /// and a new search is begun.
    ///
    /// Note that keeping track of search progress is _not necessary_
    /// for correct implementations of search using a lazy DFA. Keeping
    /// track of search progress is only necessary if you want the
    /// [`Config::minimum_bytes_per_state`] configuration knob to work.
    #[inline]
    pub fn search_start(&mut self, at: usize) {
        // If a previous search wasn't marked as finished, then finish it
        // now automatically.
        if let Some(p) = self.progress.take() {
            self.bytes_searched += p.len();
        }
        self.progress = Some(SearchProgress { start: at, at });
    }

    /// Updates the current search to indicate that it has searched up to the
    /// current position.
    ///
    /// No special care needs to be taken for reverse searches. Namely, the
    /// position given may be _less than_ the starting position of the search.
    ///
    /// # Panics
    ///
    /// This panics if no search has been started by [`Cache::search_start`].
#[inline] pub fn search_update(&mut self, at: usize) { let p = self.progress.as_mut().expect("no in-progress search to update"); p.at = at; } /// Indicates that a search has finished at the given position. /// /// # Panics /// /// This panics if no search has been started by [`Cache::search_start`]. #[inline] pub fn search_finish(&mut self, at: usize) { let mut p = self.progress.take().expect("no in-progress search to finish"); p.at = at; self.bytes_searched += p.len(); } /// Returns the total number of bytes that have been searched since this /// cache was last cleared. /// /// This is useful for determining the efficiency of the cache. For /// example, the lazy DFA uses this value in conjunction with the /// [`Config::minimum_bytes_per_state`] knob to help determine whether it /// should quit searching. /// /// This always returns `0` if search progress isn't being tracked. Note /// that the lazy DFA search routines in this crate always track search /// progress. pub fn search_total_len(&self) -> usize { self.bytes_searched + self.progress.as_ref().map_or(0, |p| p.len()) } /// Returns the total number of times this cache has been cleared since it /// was either created or last reset. /// /// This is useful for informational purposes or if you want to change /// search strategies based on the number of times the cache has been /// cleared. pub fn clear_count(&self) -> usize { self.clear_count } /// Returns the heap memory usage, in bytes, of this cache. /// /// This does **not** include the stack size used up by this cache. To /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { const ID_SIZE: usize = size_of::(); const STATE_SIZE: usize = size_of::(); // NOTE: If you make changes to the below, then // 'minimum_cache_capacity' should be updated correspondingly. self.trans.len() * ID_SIZE + self.starts.len() * ID_SIZE + self.states.len() * STATE_SIZE // Maps likely use more memory than this, but it's probably close. 
+ self.states_to_id.len() * (STATE_SIZE + ID_SIZE) + self.sparses.memory_usage() + self.stack.capacity() * ID_SIZE + self.scratch_state_builder.capacity() // Heap memory used by 'State' in both 'states' and 'states_to_id'. + self.memory_usage_state } } /// Keeps track of the progress of the current search. /// /// This is updated via the `Cache::search_{start,update,finish}` APIs to /// record how many bytes have been searched. This permits computing a /// heuristic that represents the efficiency of a cache, and thus helps inform /// whether the lazy DFA should give up or not. #[derive(Clone, Debug)] struct SearchProgress { start: usize, at: usize, } impl SearchProgress { /// Returns the length, in bytes, of this search so far. /// /// This automatically handles the case of a reverse search, where `at` /// is likely to be less than `start`. fn len(&self) -> usize { if self.start <= self.at { self.at - self.start } else { self.start - self.at } } } /// A map from states to state identifiers. When using std, we use a standard /// hashmap, since it's a bit faster for this use case. (Other maps, like /// one's based on FNV, have not yet been benchmarked.) /// /// The main purpose of this map is to reuse states where possible. This won't /// fully minimize the DFA, but it works well in a lot of cases. #[cfg(feature = "std")] type StateMap = std::collections::HashMap; #[cfg(not(feature = "std"))] type StateMap = alloc::collections::BTreeMap; /// A type that groups methods that require the base NFA/DFA and writable /// access to the cache. #[derive(Debug)] struct Lazy<'i, 'c> { dfa: &'i DFA, cache: &'c mut Cache, } impl<'i, 'c> Lazy<'i, 'c> { /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache. fn new(dfa: &'i DFA, cache: &'c mut Cache) -> Lazy<'i, 'c> { Lazy { dfa, cache } } /// Return an immutable view by downgrading a writable cache to a read-only /// cache. 
fn as_ref<'a>(&'a self) -> LazyRef<'i, 'a> { LazyRef::new(self.dfa, self.cache) } /// This is marked as 'inline(never)' to avoid bloating methods on 'DFA' /// like 'next_state' and 'next_eoi_state' that are called in critical /// areas. The idea is to let the optimizer focus on the other areas of /// those methods as the hot path. /// /// Here's an example that justifies 'inline(never)' /// /// ```ignore /// regex-cli find match hybrid \ /// --cache-capacity 100000000 \ /// -p '\pL{100}' /// all-codepoints-utf8-100x /// ``` /// /// Where 'all-codepoints-utf8-100x' is the UTF-8 encoding of every /// codepoint, in sequence, repeated 100 times. /// /// With 'inline(never)' hyperfine reports 1.1s per run. With /// 'inline(always)', hyperfine reports 1.23s. So that's a 10% improvement. #[cold] #[inline(never)] fn cache_next_state( &mut self, mut current: LazyStateID, unit: alphabet::Unit, ) -> Result { let stride2 = self.dfa.stride2(); let empty_builder = self.get_state_builder(); let builder = determinize::next( self.dfa.get_nfa(), self.dfa.get_config().get_match_kind(), &mut self.cache.sparses, &mut self.cache.stack, &self.cache.states[current.as_usize_untagged() >> stride2], unit, empty_builder, ); let save_state = !self.as_ref().state_builder_fits_in_cache(&builder); if save_state { self.save_state(current); } let next = self.add_builder_state(builder, |sid| sid)?; if save_state { current = self.saved_state_id(); } // This is the payoff. The next time 'next_state' is called with this // state and alphabet unit, it will find this transition and avoid // having to re-determinize this transition. self.set_transition(current, unit, next); Ok(next) } /// Compute and cache the starting state for the given pattern ID (if /// present) and the starting configuration. /// /// This panics if a pattern ID is given and the DFA isn't configured to /// build anchored start states for each pattern. /// /// This will never return an unknown lazy state ID. 
/// /// If caching this state would otherwise result in a cache that has been /// cleared too many times, then an error is returned. #[cold] #[inline(never)] fn cache_start_group( &mut self, anchored: Anchored, start: Start, ) -> Result { let nfa_start_id = match anchored { Anchored::No => self.dfa.get_nfa().start_unanchored(), Anchored::Yes => self.dfa.get_nfa().start_anchored(), Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { return Err(StartError::unsupported_anchored(anchored)); } match self.dfa.get_nfa().start_pattern(pid) { None => return Ok(self.as_ref().dead_id()), Some(sid) => sid, } } }; let id = self .cache_start_one(nfa_start_id, start) .map_err(StartError::cache)?; self.set_start_state(anchored, start, id); Ok(id) } /// Compute and cache the starting state for the given NFA state ID and the /// starting configuration. The NFA state ID might be one of the following: /// /// 1) An unanchored start state to match any pattern. /// 2) An anchored start state to match any pattern. /// 3) An anchored start state for a particular pattern. /// /// This will never return an unknown lazy state ID. /// /// If caching this state would otherwise result in a cache that has been /// cleared too many times, then an error is returned. 
fn cache_start_one( &mut self, nfa_start_id: NFAStateID, start: Start, ) -> Result { let mut builder_matches = self.get_state_builder().into_matches(); determinize::set_lookbehind_from_start( self.dfa.get_nfa(), &start, &mut builder_matches, ); self.cache.sparses.set1.clear(); determinize::epsilon_closure( self.dfa.get_nfa(), nfa_start_id, builder_matches.look_have(), &mut self.cache.stack, &mut self.cache.sparses.set1, ); let mut builder = builder_matches.into_nfa(); determinize::add_nfa_states( &self.dfa.get_nfa(), &self.cache.sparses.set1, &mut builder, ); let tag_starts = self.dfa.get_config().get_specialize_start_states(); self.add_builder_state(builder, |id| { if tag_starts { id.to_start() } else { id } }) } /// Either add the given builder state to this cache, or return an ID to an /// equivalent state already in this cache. /// /// In the case where no equivalent state exists, the idmap function given /// may be used to transform the identifier allocated. This is useful if /// the caller needs to tag the ID with additional information. /// /// This will never return an unknown lazy state ID. /// /// If caching this state would otherwise result in a cache that has been /// cleared too many times, then an error is returned. fn add_builder_state( &mut self, builder: StateBuilderNFA, idmap: impl Fn(LazyStateID) -> LazyStateID, ) -> Result { if let Some(&cached_id) = self.cache.states_to_id.get(builder.as_bytes()) { // Since we have a cached state, put the constructed state's // memory back into our scratch space, so that it can be reused. self.put_state_builder(builder); return Ok(cached_id); } let result = self.add_state(builder.to_state(), idmap); self.put_state_builder(builder); result } /// Allocate a new state ID and add the given state to this cache. /// /// The idmap function given may be used to transform the identifier /// allocated. This is useful if the caller needs to tag the ID with /// additional information. 
/// /// This will never return an unknown lazy state ID. /// /// If caching this state would otherwise result in a cache that has been /// cleared too many times, then an error is returned. fn add_state( &mut self, state: State, idmap: impl Fn(LazyStateID) -> LazyStateID, ) -> Result { if !self.as_ref().state_fits_in_cache(&state) { self.try_clear_cache()?; } // It's important for this to come second, since the above may clear // the cache. If we clear the cache after ID generation, then the ID // is likely bunk since it would have been generated based on a larger // transition table. let mut id = idmap(self.next_state_id()?); if state.is_match() { id = id.to_match(); } // Add room in the transition table. Since this is a fresh state, all // of its transitions are unknown. self.cache.trans.extend( iter::repeat(self.as_ref().unknown_id()).take(self.dfa.stride()), ); // When we add a sentinel state, we never want to set any quit // transitions. Technically, this is harmless, since sentinel states // have all of their transitions set to loop back to themselves. But // when creating sentinel states before the quit sentinel state, // this will try to call 'set_transition' on a state ID that doesn't // actually exist yet, which isn't allowed. So we just skip doing so // entirely. if !self.dfa.quitset.is_empty() && !self.as_ref().is_sentinel(id) { let quit_id = self.as_ref().quit_id(); for b in self.dfa.quitset.iter() { self.set_transition(id, alphabet::Unit::u8(b), quit_id); } } self.cache.memory_usage_state += state.memory_usage(); self.cache.states.push(state.clone()); self.cache.states_to_id.insert(state, id); Ok(id) } /// Allocate a new state ID. /// /// This will never return an unknown lazy state ID. /// /// If caching this state would otherwise result in a cache that has been /// cleared too many times, then an error is returned. 
fn next_state_id(&mut self) -> Result { let sid = match LazyStateID::new(self.cache.trans.len()) { Ok(sid) => sid, Err(_) => { self.try_clear_cache()?; // This has to pass since we check that ID capacity at // construction time can fit at least MIN_STATES states. LazyStateID::new(self.cache.trans.len()).unwrap() } }; Ok(sid) } /// Attempt to clear the cache used by this lazy DFA. /// /// If clearing the cache exceeds the minimum number of required cache /// clearings, then this will return a cache error. In this case, /// callers should bubble this up as the cache can't be used until it is /// reset. Implementations of search should convert this error into a /// [`MatchError::gave_up`]. /// /// If 'self.state_saver' is set to save a state, then this state is /// persisted through cache clearing. Otherwise, the cache is returned to /// its state after initialization with two exceptions: its clear count /// is incremented and some of its memory likely has additional capacity. /// That is, clearing a cache does _not_ release memory. /// /// Otherwise, any lazy state ID generated by the cache prior to resetting /// it is invalid after the reset. fn try_clear_cache(&mut self) -> Result<(), CacheError> { let c = self.dfa.get_config(); if let Some(min_count) = c.get_minimum_cache_clear_count() { if self.cache.clear_count >= min_count { if let Some(min_bytes_per) = c.get_minimum_bytes_per_state() { let len = self.cache.search_total_len(); let min_bytes = min_bytes_per.saturating_mul(self.cache.states.len()); // If we've searched 0 bytes then probably something has // gone wrong and the lazy DFA search implementation isn't // correctly updating the search progress state. 
if len == 0 { trace!( "number of bytes searched is 0, but \ a minimum bytes per state searched ({}) is \ enabled, maybe Cache::search_update \ is not being used?", min_bytes_per, ); } if len < min_bytes { trace!( "lazy DFA cache has been cleared {} times, \ which exceeds the limit of {}, \ AND its bytes searched per state is less \ than the configured minimum of {}, \ therefore lazy DFA is giving up \ (bytes searched since cache clear = {}, \ number of states = {})", self.cache.clear_count, min_count, min_bytes_per, len, self.cache.states.len(), ); return Err(CacheError::bad_efficiency()); } else { trace!( "lazy DFA cache has been cleared {} times, \ which exceeds the limit of {}, \ AND its bytes searched per state is greater \ than the configured minimum of {}, \ therefore lazy DFA is continuing! \ (bytes searched since cache clear = {}, \ number of states = {})", self.cache.clear_count, min_count, min_bytes_per, len, self.cache.states.len(), ); } } else { trace!( "lazy DFA cache has been cleared {} times, \ which exceeds the limit of {}, \ since there is no configured bytes per state \ minimum, lazy DFA is giving up", self.cache.clear_count, min_count, ); return Err(CacheError::too_many_cache_clears()); } } } self.clear_cache(); Ok(()) } /// Clears _and_ resets the cache. Resetting the cache means that no /// states are persisted and the clear count is reset to 0. No heap memory /// is released. /// /// Note that the caller may reset a cache with a different DFA than what /// it was created from. In which case, the cache can now be used with the /// new DFA (and not the old DFA). fn reset_cache(&mut self) { self.cache.state_saver = StateSaver::none(); self.clear_cache(); // If a new DFA is used, it might have a different number of NFA // states, so we need to make sure our sparse sets have the appropriate // size. 
/// Clear the cache used by this lazy DFA.
///
/// If the cache's state saver is set to save a state, then this state is
/// persisted through cache clearing. Otherwise, the cache is returned to
/// its state after initialization with two exceptions: its clear count
/// is incremented and some of its memory likely has additional capacity.
/// That is, clearing a cache does _not_ release memory.
///
/// Otherwise, any lazy state ID generated by the cache prior to resetting
/// it is invalid after the reset.
///
/// This panics if the state to save is one of the sentinel states, or if
/// re-adding the saved state after clearing fails.
fn clear_cache(&mut self) {
    self.cache.trans.clear();
    self.cache.starts.clear();
    self.cache.states.clear();
    self.cache.states_to_id.clear();
    self.cache.memory_usage_state = 0;
    self.cache.clear_count += 1;
    self.cache.bytes_searched = 0;
    // Restart the measurement of search progress from the current
    // position.
    if let Some(ref mut progress) = self.cache.progress {
        progress.start = progress.at;
    }
    trace!(
        "lazy DFA cache has been cleared (count: {})",
        self.cache.clear_count
    );
    self.init_cache();
    // If the state we want to save is one of the sentinel
    // (unknown/dead/quit) states, then 'init_cache' adds those back, and
    // their identifier values remain invariant. So there would be no need
    // to add it again. (And indeed, doing so would be incorrect!) With
    // that said, this should never occur since the sentinel states are all
    // loop states back to themselves. So we should never be in a position
    // where we're attempting to save a sentinel state since we never
    // compute transitions out of a sentinel state.
    if let Some((old_id, state)) = self.cache.state_saver.take_to_save() {
        assert!(
            !self.as_ref().is_sentinel(old_id),
            "cannot save sentinel state"
        );
        let new_id = self
            .add_state(state, |id| {
                if old_id.is_start() {
                    // We don't need to consult the
                    // 'specialize_start_states' config knob here, because
                    // if it's disabled, old_id.is_start() will never
                    // return true.
                    id.to_start()
                } else {
                    id
                }
            })
            // The expect here is OK because lazy DFA creation ensures that
            // we have room in the cache to add MIN_STATES states. Since
            // 'init_cache' above adds 3, this adds a 4th.
            .expect("adding one state after cache clear must work");
        self.cache.state_saver = StateSaver::Saved(new_id);
    }
}

/// Initialize this cache from emptiness to a place where it can be used
/// for search.
///
/// This is called both at cache creation time and after the cache has been
/// cleared.
///
/// Primarily, this adds the three sentinel states and allocates some
/// initial memory.
fn init_cache(&mut self) {
    // Why multiply by 2 here? Because we make room for both the unanchored
    // and anchored start states. Unanchored is first and then anchored.
    let mut starts_len = Start::len().checked_mul(2).unwrap();
    // ... but if we also want start states for every pattern, we make room
    // for that too.
    if self.dfa.get_config().get_starts_for_each_pattern() {
        starts_len += Start::len() * self.dfa.pattern_len();
    }
    // Every start state begins life as "unknown," i.e., not yet computed.
    self.cache
        .starts
        .extend(iter::repeat(self.as_ref().unknown_id()).take(starts_len));
    // This is the set of NFA states that corresponds to each of our three
    // sentinel states: the empty set.
    let dead = State::dead();
    // This sets up some states that we use as sentinels that are present
    // in every DFA. While it would be technically possible to implement
    // this DFA without explicitly putting these states in the transition
    // table, this is convenient to do to make `next_state` correct for all
    // valid state IDs without needing explicit conditionals to special
    // case these sentinel states.
    //
    // All three of these states are "dead" states. That is, all of
    // them transition only to themselves. So once you enter one of
    // these states, it's impossible to leave them. Thus, any correct
    // search routine must explicitly check for these state types. (Sans
    // `unknown`, since that is only used internally to represent missing
    // states.)
    let unk_id =
        self.add_state(dead.clone(), |id| id.to_unknown()).unwrap();
    let dead_id = self.add_state(dead.clone(), |id| id.to_dead()).unwrap();
    let quit_id = self.add_state(dead.clone(), |id| id.to_quit()).unwrap();
    // The sentinel IDs must land exactly where the read-only view of the
    // cache expects to find them.
    assert_eq!(unk_id, self.as_ref().unknown_id());
    assert_eq!(dead_id, self.as_ref().dead_id());
    assert_eq!(quit_id, self.as_ref().quit_id());
    // The idea here is that if you start in an unknown/dead/quit state and
    // try to transition on them, then you should end up where you started.
    self.set_all_transitions(unk_id, unk_id);
    self.set_all_transitions(dead_id, dead_id);
    self.set_all_transitions(quit_id, quit_id);
    // All of these states are technically equivalent from the FSM
    // perspective, so putting all three of them in the cache isn't
    // possible. (They are distinct merely because we use their
    // identifiers as sentinels to mean something, as indicated by the
    // names.) Moreover, we wouldn't want to do that. Unknown and quit
    // states are special in that they are artificial constructions of
    // this implementation. But dead states are a natural part of
    // determinization. When you reach a point in the NFA where you cannot
    // go anywhere else, a dead state will naturally arise and we MUST
    // reuse the canonical dead state that we've created here. Why? Because
    // it is the state ID that tells the search routine whether a state is
    // dead or not, and thus, whether to stop the search. Having a bunch of
    // distinct dead states would be quite wasteful!
    self.cache.states_to_id.insert(dead, dead_id);
}
In order to discover the /// new state ID, one must call 'saved_state_id' after a cache clearing. fn save_state(&mut self, id: LazyStateID) { let state = self.as_ref().get_cached_state(id).clone(); self.cache.state_saver = StateSaver::ToSave { id, state }; } /// Returns the updated lazy state ID for a state that was persisted /// through a cache clearing. /// /// It is only correct to call this routine when both a state has been /// saved and the cache has just been cleared. Otherwise, this panics. fn saved_state_id(&mut self) -> LazyStateID { self.cache .state_saver .take_saved() .expect("state saver does not have saved state ID") } /// Set all transitions on the state 'from' to 'to'. fn set_all_transitions(&mut self, from: LazyStateID, to: LazyStateID) { for unit in self.dfa.classes.representatives(..) { self.set_transition(from, unit, to); } } /// Set the transition on 'from' for 'unit' to 'to'. /// /// This panics if either 'from' or 'to' is invalid. /// /// All unit values are OK. fn set_transition( &mut self, from: LazyStateID, unit: alphabet::Unit, to: LazyStateID, ) { assert!(self.as_ref().is_valid(from), "invalid 'from' id: {:?}", from); assert!(self.as_ref().is_valid(to), "invalid 'to' id: {:?}", to); let offset = from.as_usize_untagged() + self.dfa.classes.get_by_unit(unit); self.cache.trans[offset] = to; } /// Set the start ID for the given pattern ID (if given) and starting /// configuration to the ID given. /// /// This panics if 'id' is not valid or if a pattern ID is given and /// 'starts_for_each_pattern' is not enabled. 
fn set_start_state( &mut self, anchored: Anchored, start: Start, id: LazyStateID, ) { assert!(self.as_ref().is_valid(id)); let start_index = start.as_usize(); let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { assert!( self.dfa.get_config().get_starts_for_each_pattern(), "attempted to search for a specific pattern \ without enabling starts_for_each_pattern", ); let pid = pid.as_usize(); (2 * Start::len()) + (Start::len() * pid) + start_index } }; self.cache.starts[index] = id; } /// Returns a state builder from this DFA that might have existing /// capacity. This helps avoid allocs in cases where a state is built that /// turns out to already be cached. /// /// Callers must put the state builder back with 'put_state_builder', /// otherwise the allocation reuse won't work. fn get_state_builder(&mut self) -> StateBuilderEmpty { core::mem::replace( &mut self.cache.scratch_state_builder, StateBuilderEmpty::new(), ) } /// Puts the given state builder back into this DFA for reuse. /// /// Note that building a 'State' from a builder always creates a new alloc, /// so callers should always put the builder back. fn put_state_builder(&mut self, builder: StateBuilderNFA) { let _ = core::mem::replace( &mut self.cache.scratch_state_builder, builder.clear(), ); } } /// A type that groups methods that require the base NFA/DFA and read-only /// access to the cache. #[derive(Debug)] struct LazyRef<'i, 'c> { dfa: &'i DFA, cache: &'c Cache, } impl<'i, 'c> LazyRef<'i, 'c> { /// Creates a new 'Lazy' wrapper for a DFA and its corresponding cache. fn new(dfa: &'i DFA, cache: &'c Cache) -> LazyRef<'i, 'c> { LazyRef { dfa, cache } } /// Return the ID of the start state for the given configuration. /// /// If the start state has not yet been computed, then this returns an /// unknown lazy state ID. 
#[cfg_attr(feature = "perf-inline", inline(always))] fn get_cached_start_id( &self, anchored: Anchored, start: Start, ) -> Result { let start_index = start.as_usize(); let index = match anchored { Anchored::No => start_index, Anchored::Yes => Start::len() + start_index, Anchored::Pattern(pid) => { if !self.dfa.get_config().get_starts_for_each_pattern() { return Err(StartError::unsupported_anchored(anchored)); } if pid.as_usize() >= self.dfa.pattern_len() { return Ok(self.dead_id()); } (2 * Start::len()) + (Start::len() * pid.as_usize()) + start_index } }; Ok(self.cache.starts[index]) } /// Return the cached NFA/DFA powerset state for the given ID. /// /// This panics if the given ID does not address a valid state. fn get_cached_state(&self, sid: LazyStateID) -> &State { let index = sid.as_usize_untagged() >> self.dfa.stride2(); &self.cache.states[index] } /// Returns true if and only if the given ID corresponds to a "sentinel" /// state. /// /// A sentinel state is a state that signifies a special condition of /// search, and where every transition maps back to itself. See LazyStateID /// for more details. Note that start and match states are _not_ sentinels /// since they may otherwise be real states with non-trivial transitions. /// The purposes of sentinel states is purely to indicate something. Their /// transitions are not meant to be followed. fn is_sentinel(&self, id: LazyStateID) -> bool { id == self.unknown_id() || id == self.dead_id() || id == self.quit_id() } /// Returns the ID of the unknown state for this lazy DFA. fn unknown_id(&self) -> LazyStateID { // This unwrap is OK since 0 is always a valid state ID. LazyStateID::new(0).unwrap().to_unknown() } /// Returns the ID of the dead state for this lazy DFA. fn dead_id(&self) -> LazyStateID { // This unwrap is OK since the maximum value here is 1 * 512 = 512, // which is <= 2047 (the maximum state ID on 16-bit systems). 
Where // 512 is the worst case for our equivalence classes (every byte is a // distinct class). LazyStateID::new(1 << self.dfa.stride2()).unwrap().to_dead() } /// Returns the ID of the quit state for this lazy DFA. fn quit_id(&self) -> LazyStateID { // This unwrap is OK since the maximum value here is 2 * 512 = 1024, // which is <= 2047 (the maximum state ID on 16-bit systems). Where // 512 is the worst case for our equivalence classes (every byte is a // distinct class). LazyStateID::new(2 << self.dfa.stride2()).unwrap().to_quit() } /// Returns true if and only if the given ID is valid. /// /// An ID is valid if it is both a valid index into the transition table /// and is a multiple of the DFA's stride. fn is_valid(&self, id: LazyStateID) -> bool { let id = id.as_usize_untagged(); id < self.cache.trans.len() && id % self.dfa.stride() == 0 } /// Returns true if adding the state given would fit in this cache. fn state_fits_in_cache(&self, state: &State) -> bool { let needed = self.cache.memory_usage() + self.memory_usage_for_one_more_state(state.memory_usage()); trace!( "lazy DFA cache capacity check: {:?} ?<=? {:?}", needed, self.dfa.cache_capacity ); needed <= self.dfa.cache_capacity } /// Returns true if adding the state to be built by the given builder would /// fit in this cache. fn state_builder_fits_in_cache(&self, state: &StateBuilderNFA) -> bool { let needed = self.cache.memory_usage() + self.memory_usage_for_one_more_state(state.as_bytes().len()); needed <= self.dfa.cache_capacity } /// Returns the additional memory usage, in bytes, required to add one more /// state to this cache. The given size should be the heap size, in bytes, /// that would be used by the new state being added. 
fn memory_usage_for_one_more_state( &self, state_heap_size: usize, ) -> usize { const ID_SIZE: usize = size_of::(); const STATE_SIZE: usize = size_of::(); self.dfa.stride() * ID_SIZE // additional space needed in trans table + STATE_SIZE // space in cache.states + (STATE_SIZE + ID_SIZE) // space in cache.states_to_id + state_heap_size // heap memory used by state itself } } /// A simple type that encapsulates the saving of a state ID through a cache /// clearing. /// /// A state ID can be marked for saving with ToSave, while a state ID can be /// saved itself with Saved. #[derive(Clone, Debug)] enum StateSaver { /// An empty state saver. In this case, no states (other than the special /// sentinel states) are preserved after clearing the cache. None, /// An ID of a state (and the state itself) that should be preserved after /// the lazy DFA's cache has been cleared. After clearing, the updated ID /// is stored in 'Saved' since it may have changed. ToSave { id: LazyStateID, state: State }, /// An ID that of a state that has been persisted through a lazy DFA /// cache clearing. The ID recorded here corresponds to an ID that was /// once marked as ToSave. The IDs are likely not equivalent even though /// the states they point to are. Saved(LazyStateID), } impl StateSaver { /// Create an empty state saver. fn none() -> StateSaver { StateSaver::None } /// Replace this state saver with an empty saver, and if this saver is a /// request to save a state, return that request. fn take_to_save(&mut self) -> Option<(LazyStateID, State)> { match core::mem::replace(self, StateSaver::None) { StateSaver::None | StateSaver::Saved(_) => None, StateSaver::ToSave { id, state } => Some((id, state)), } } /// Replace this state saver with an empty saver, and if this saver is a /// saved state (or a request to save a state), return that state's ID. /// /// The idea here is that a request to save a state isn't necessarily /// honored because it might not be needed. 
e.g., Some higher level code /// might request a state to be saved on the off chance that the cache gets /// cleared when a new state is added at a lower level. But if that new /// state is never added, then the cache is never cleared and the state and /// its ID remain unchanged. fn take_saved(&mut self) -> Option { match core::mem::replace(self, StateSaver::None) { StateSaver::None => None, StateSaver::Saved(id) | StateSaver::ToSave { id, .. } => Some(id), } } } /// The configuration used for building a lazy DFA. /// /// As a convenience, [`DFA::config`] is an alias for [`Config::new`]. The /// advantage of the former is that it often lets you avoid importing the /// `Config` type directly. /// /// A lazy DFA configuration is a simple data object that is typically used /// with [`Builder::configure`]. /// /// The default configuration guarantees that a search will never return a /// "gave up" or "quit" error, although it is possible for a search to fail /// if [`Config::starts_for_each_pattern`] wasn't enabled (which it is not by /// default) and an [`Anchored::Pattern`] mode is requested via [`Input`]. #[derive(Clone, Debug, Default)] pub struct Config { // As with other configuration types in this crate, we put all our knobs // in options so that we can distinguish between "default" and "not set." // This makes it possible to easily combine multiple configurations // without default values overwriting explicitly specified values. See the // 'overwrite' method. // // For docs on the fields below, see the corresponding method setters. match_kind: Option, pre: Option>, starts_for_each_pattern: Option, byte_classes: Option, unicode_word_boundary: Option, quitset: Option, specialize_start_states: Option, cache_capacity: Option, skip_cache_capacity_check: Option, minimum_cache_clear_count: Option>, minimum_bytes_per_state: Option>, } impl Config { /// Return a new default lazy DFA builder configuration. 
pub fn new() -> Config { Config::default() } /// Set the desired match semantics. /// /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the /// match semantics of Perl-like regex engines. That is, when multiple /// patterns would match at the same leftmost position, the pattern that /// appears first in the concrete syntax is chosen. /// /// Currently, the only other kind of match semantics supported is /// [`MatchKind::All`]. This corresponds to classical DFA construction /// where all possible matches are added to the lazy DFA. /// /// Typically, `All` is used when one wants to execute an overlapping /// search and `LeftmostFirst` otherwise. In particular, it rarely makes /// sense to use `All` with the various "leftmost" find routines, since the /// leftmost routines depend on the `LeftmostFirst` automata construction /// strategy. Specifically, `LeftmostFirst` adds dead states to the /// lazy DFA as a way to terminate the search and report a match. /// `LeftmostFirst` also supports non-greedy matches using this strategy /// where as `All` does not. /// /// # Example: overlapping search /// /// This example shows the typical use of `MatchKind::All`, which is to /// report overlapping matches. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// hybrid::dfa::{DFA, OverlappingState}, /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() /// .configure(DFA::config().match_kind(MatchKind::All)) /// .build_many(&[r"\w+$", r"\S+$"])?; /// let mut cache = dfa.create_cache(); /// let haystack = "@foo"; /// let mut state = OverlappingState::start(); /// /// let expected = Some(HalfMatch::must(1, 4)); /// dfa.try_search_overlapping_fwd( /// &mut cache, &Input::new(haystack), &mut state, /// )?; /// assert_eq!(expected, state.get_match()); /// /// // The first pattern also matches at the same position, so re-running /// // the search will yield another match. 
Notice also that the first /// // pattern is returned after the second. This is because the second /// // pattern begins its match before the first, is therefore an earlier /// // match and is thus reported first. /// let expected = Some(HalfMatch::must(0, 4)); /// dfa.try_search_overlapping_fwd( /// &mut cache, &Input::new(haystack), &mut state, /// )?; /// assert_eq!(expected, state.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: reverse automaton to find start of match /// /// Another example for using `MatchKind::All` is for constructing a /// reverse automaton to find the start of a match. `All` semantics are /// used for this in order to find the longest possible match, which /// corresponds to the leftmost starting position. /// /// Note that if you need the starting position then /// [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) will handle this /// for you, so it's usually not necessary to do this yourself. /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// nfa::thompson::NFA, /// Anchored, HalfMatch, Input, MatchKind, /// }; /// /// let input = Input::new("123foobar456"); /// let pattern = r"[a-z]+r"; /// /// let dfa_fwd = DFA::new(pattern)?; /// let dfa_rev = DFA::builder() /// .thompson(NFA::config().reverse(true)) /// .configure(DFA::config().match_kind(MatchKind::All)) /// .build(pattern)?; /// let mut cache_fwd = dfa_fwd.create_cache(); /// let mut cache_rev = dfa_rev.create_cache(); /// /// let expected_fwd = HalfMatch::must(0, 9); /// let expected_rev = HalfMatch::must(0, 3); /// let got_fwd = dfa_fwd.try_search_fwd(&mut cache_fwd, &input)?.unwrap(); /// // Here we don't specify the pattern to search for since there's only /// // one pattern and we're doing a leftmost search. But if this were an /// // overlapping search, you'd need to specify the pattern that matched /// // in the forward direction. (Otherwise, you might wind up finding the /// // starting position of a match of some other pattern.) 
That in turn /// // requires building the reverse automaton with starts_for_each_pattern /// // enabled. /// let input = input /// .clone() /// .range(..got_fwd.offset()) /// .anchored(Anchored::Yes); /// let got_rev = dfa_rev.try_search_rev(&mut cache_rev, &input)?.unwrap(); /// assert_eq!(expected_fwd, got_fwd); /// assert_eq!(expected_rev, got_rev); /// /// # Ok::<(), Box>(()) /// ``` pub fn match_kind(mut self, kind: MatchKind) -> Config { self.match_kind = Some(kind); self } /// Set a prefilter to be used whenever a start state is entered. /// /// A [`Prefilter`] in this context is meant to accelerate searches by /// looking for literal prefixes that every match for the corresponding /// pattern (or patterns) must start with. Once a prefilter produces a /// match, the underlying search routine continues on to try and confirm /// the match. /// /// Be warned that setting a prefilter does not guarantee that the search /// will be faster. While it's usually a good bet, if the prefilter /// produces a lot of false positive candidates (i.e., positions matched /// by the prefilter but not by the regex), then the overall result can /// be slower than if you had just executed the regex engine without any /// prefilters. /// /// Note that unless [`Config::specialize_start_states`] has been /// explicitly set, then setting this will also enable (when `pre` is /// `Some`) or disable (when `pre` is `None`) start state specialization. /// This occurs because without start state specialization, a prefilter /// is likely to be less effective. And without a prefilter, start state /// specialization is usually pointless. /// /// By default no prefilter is set. 
///
/// # Example
///
/// ```
/// use regex_automata::{
///     hybrid::dfa::DFA,
///     util::prefilter::Prefilter,
///     Input, HalfMatch, MatchKind,
/// };
///
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]);
/// let re = DFA::builder()
///     .configure(DFA::config().prefilter(pre))
///     .build(r"(foo|bar)[a-z]+")?;
/// let mut cache = re.create_cache();
/// let input = Input::new("foo1 barfox bar");
/// assert_eq!(
///     Some(HalfMatch::must(0, 11)),
///     re.try_search_fwd(&mut cache, &input)?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Be warned though that an incorrect prefilter can lead to incorrect
/// results!
///
/// ```
/// use regex_automata::{
///     hybrid::dfa::DFA,
///     util::prefilter::Prefilter,
///     Input, HalfMatch, MatchKind,
/// };
///
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]);
/// let re = DFA::builder()
///     .configure(DFA::config().prefilter(pre))
///     .build(r"(foo|bar)[a-z]+")?;
/// let mut cache = re.create_cache();
/// let input = Input::new("foo1 barfox bar");
/// assert_eq!(
///     // No match reported even though there clearly is one!
///     None,
///     re.try_search_fwd(&mut cache, &input)?,
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
    self.pre = Some(pre);
    // Only derive a default for start state specialization when the caller
    // has not already chosen a value explicitly.
    if self.specialize_start_states.is_none() {
        self.specialize_start_states =
            Some(self.get_prefilter().is_some());
    }
    self
}

/// Whether to compile a separate start state for each pattern in the
/// lazy DFA.
///
/// When enabled, a separate **anchored** start state is added for each
/// pattern in the lazy DFA. When this start state is used, then the DFA
/// will only search for matches for the pattern specified, even if there
/// are other patterns in the DFA.
///
/// The main downside of this option is that it can potentially increase
/// the size of the DFA and/or increase the time it takes to build the
/// DFA at search time.
However, since this is configuration for a lazy /// DFA, these states aren't actually built unless they're used. Enabling /// this isn't necessarily free, however, as it may result in higher cache /// usage. /// /// There are a few reasons one might want to enable this (it's disabled /// by default): /// /// 1. When looking for the start of an overlapping match (using a reverse /// DFA), doing it correctly requires starting the reverse search using the /// starting state of the pattern that matched in the forward direction. /// Indeed, when building a [`Regex`](crate::hybrid::regex::Regex), it /// will automatically enable this option when building the reverse DFA /// internally. /// 2. When you want to use a DFA with multiple patterns to both search /// for matches of any pattern or to search for anchored matches of one /// particular pattern while using the same DFA. (Otherwise, you would need /// to compile a new DFA for each pattern.) /// /// By default this is disabled. /// /// # Example /// /// This example shows how to use this option to permit the same lazy DFA /// to run both general searches for any pattern and anchored searches for /// a specific pattern. /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// Anchored, HalfMatch, Input, PatternID, /// }; /// /// let dfa = DFA::builder() /// .configure(DFA::config().starts_for_each_pattern(true)) /// .build_many(&[r"[a-z0-9]{6}", r"[a-z][a-z0-9]{5}"])?; /// let mut cache = dfa.create_cache(); /// let haystack = "bar foo123"; /// /// // Here's a normal unanchored search that looks for any pattern. /// let expected = HalfMatch::must(0, 10); /// let input = Input::new(haystack); /// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?); /// // We can also do a normal anchored search for any pattern. Since it's /// // an anchored search, we position the start of the search where we /// // know the match will begin. 
/// let expected = HalfMatch::must(0, 10);
/// let input = Input::new(haystack).range(4..);
/// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
/// // Since we compiled anchored start states for each pattern, we can
/// // also look for matches of other patterns explicitly, even if a
/// // different pattern would have normally matched.
/// let expected = HalfMatch::must(1, 10);
/// let input = Input::new(haystack)
///     .range(4..)
///     .anchored(Anchored::Pattern(PatternID::must(1)));
/// assert_eq!(Some(expected), dfa.try_search_fwd(&mut cache, &input)?);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn starts_for_each_pattern(mut self, yes: bool) -> Config {
    self.starts_for_each_pattern = Some(yes);
    self
}

/// Whether to attempt to shrink the size of the lazy DFA's alphabet or
/// not.
///
/// This option is enabled by default and should never be disabled unless
/// one is debugging the lazy DFA.
///
/// When enabled, the lazy DFA will use a map from all possible bytes
/// to their corresponding equivalence class. Each equivalence class
/// represents a set of bytes that does not discriminate between a match
/// and a non-match in the DFA. For example, the pattern `[ab]+` has at
/// least two equivalence classes: a set containing `a` and `b` and a set
/// containing every byte except for `a` and `b`. `a` and `b` are in the
/// same equivalence class because they never discriminate between a
/// match and a non-match.
///
/// The advantage of this map is that the size of the transition table
/// can be reduced drastically from `#states * 256 * sizeof(LazyStateID)`
/// to `#states * k * sizeof(LazyStateID)` where `k` is the number of
/// equivalence classes (rounded up to the nearest power of 2). As a
/// result, total space usage can decrease substantially. Moreover, since a
/// smaller alphabet is used, DFA compilation during search becomes faster
/// as well since it will potentially be able to reuse a single transition
/// for multiple bytes.
/// /// **WARNING:** This is only useful for debugging lazy DFAs. Disabling /// this does not yield any speed advantages. Namely, even when this is /// disabled, a byte class map is still used while searching. The only /// difference is that every byte will be forced into its own distinct /// equivalence class. This is useful for debugging the actual generated /// transitions because it lets one see the transitions defined on actual /// bytes instead of the equivalence classes. pub fn byte_classes(mut self, yes: bool) -> Config { self.byte_classes = Some(yes); self } /// Heuristically enable Unicode word boundaries. /// /// When set, this will attempt to implement Unicode word boundaries as if /// they were ASCII word boundaries. This only works when the search input /// is ASCII only. If a non-ASCII byte is observed while searching, then a /// [`MatchError::quit`] error is returned. /// /// A possible alternative to enabling this option is to simply use an /// ASCII word boundary, e.g., via `(?-u:\b)`. The main reason to use this /// option is if you absolutely need Unicode support. This option lets one /// use a fast search implementation (a DFA) for some potentially very /// common cases, while providing the option to fall back to some other /// regex engine to handle the general case when an error is returned. /// /// If the pattern provided has no Unicode word boundary in it, then this /// option has no effect. (That is, quitting on a non-ASCII byte only /// occurs when this option is enabled _and_ a Unicode word boundary is /// present in the pattern.) /// /// This is almost equivalent to setting all non-ASCII bytes to be quit /// bytes. The only difference is that this will cause non-ASCII bytes to /// be quit bytes _only_ when a Unicode word boundary is present in the /// pattern. /// /// When enabling this option, callers _must_ be prepared to /// handle a [`MatchError`] error during search. 
When using a /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the /// `try_` suite of methods. Alternatively, if callers can guarantee that /// their input is ASCII only, then a [`MatchError::quit`] error will never /// be returned while searching. /// /// This is disabled by default. /// /// # Example /// /// This example shows how to heuristically enable Unicode word boundaries /// in a pattern. It also shows what happens when a search comes across a /// non-ASCII byte. /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// HalfMatch, Input, MatchError, /// }; /// /// let dfa = DFA::builder() /// .configure(DFA::config().unicode_word_boundary(true)) /// .build(r"\b[0-9]+\b")?; /// let mut cache = dfa.create_cache(); /// /// // The match occurs before the search ever observes the snowman /// // character, so no error occurs. /// let haystack = "foo 123 ☃"; /// let expected = Some(HalfMatch::must(0, 7)); /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; /// assert_eq!(expected, got); /// /// // Notice that this search fails, even though the snowman character /// // occurs after the ending match offset. This is because search /// // routines read one byte past the end of the search to account for /// // look-around, and indeed, this is required here to determine whether /// // the trailing \b matches. /// let haystack = "foo 123 ☃"; /// let expected = MatchError::quit(0xE2, 8); /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack)); /// assert_eq!(Err(expected), got); /// /// // Another example is executing a search where the span of the haystack /// // we specify is all ASCII, but there is non-ASCII just before it. This /// // correctly also reports an error. /// let input = Input::new("β123").range(2..); /// let expected = MatchError::quit(0xB2, 1); /// let got = dfa.try_search_fwd(&mut cache, &input); /// assert_eq!(Err(expected), got); /// /// // And similarly for the trailing word boundary. 
/// let input = Input::new("123β").range(..3); /// let expected = MatchError::quit(0xCE, 3); /// let got = dfa.try_search_fwd(&mut cache, &input); /// assert_eq!(Err(expected), got); /// /// # Ok::<(), Box>(()) /// ``` pub fn unicode_word_boundary(mut self, yes: bool) -> Config { // We have a separate option for this instead of just setting the // appropriate quit bytes here because we don't want to set quit bytes // for every regex. We only want to set them when the regex contains a // Unicode word boundary. self.unicode_word_boundary = Some(yes); self } /// Add a "quit" byte to the lazy DFA. /// /// When a quit byte is seen during search time, then search will return a /// [`MatchError::quit`] error indicating the offset at which the search /// stopped. /// /// A quit byte will always overrule any other aspects of a regex. For /// example, if the `x` byte is added as a quit byte and the regex `\w` is /// used, then observing `x` will cause the search to quit immediately /// despite the fact that `x` is in the `\w` class. /// /// This mechanism is primarily useful for heuristically enabling certain /// features like Unicode word boundaries in a DFA. Namely, if the input /// to search is ASCII, then a Unicode word boundary can be implemented /// via an ASCII word boundary with no change in semantics. Thus, a DFA /// can attempt to match a Unicode word boundary but give up as soon as it /// observes a non-ASCII byte. Indeed, if callers set all non-ASCII bytes /// to be quit bytes, then Unicode word boundaries will be permitted when /// building lazy DFAs. Of course, callers should enable /// [`Config::unicode_word_boundary`] if they want this behavior instead. /// (The advantage being that non-ASCII quit bytes will only be added if a /// Unicode word boundary is in the pattern.) /// /// When enabling this option, callers _must_ be prepared to /// handle a [`MatchError`] error during search. 
When using a /// [`Regex`](crate::hybrid::regex::Regex), this corresponds to using the /// `try_` suite of methods. /// /// By default, there are no quit bytes set. /// /// # Panics /// /// This panics if heuristic Unicode word boundaries are enabled and any /// non-ASCII byte is removed from the set of quit bytes. Namely, enabling /// Unicode word boundaries requires setting every non-ASCII byte to a quit /// byte. So if the caller attempts to undo any of that, then this will /// panic. /// /// # Example /// /// This example shows how to cause a search to terminate if it sees a /// `\n` byte. This could be useful if, for example, you wanted to prevent /// a user supplied pattern from matching across a line boundary. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; /// /// let dfa = DFA::builder() /// .configure(DFA::config().quit(b'\n', true)) /// .build(r"foo\p{any}+bar")?; /// let mut cache = dfa.create_cache(); /// /// let haystack = "foo\nbar"; /// // Normally this would produce a match, since \p{any} contains '\n'. /// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead. /// let expected = MatchError::quit(b'\n', 3); /// let got = dfa.try_search_fwd( /// &mut cache, /// &Input::new(haystack), /// ).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` pub fn quit(mut self, byte: u8, yes: bool) -> Config { if self.get_unicode_word_boundary() && !byte.is_ascii() && !yes { panic!( "cannot set non-ASCII byte to be non-quit when \ Unicode word boundaries are enabled" ); } if self.quitset.is_none() { self.quitset = Some(ByteSet::empty()); } if yes { self.quitset.as_mut().unwrap().add(byte); } else { self.quitset.as_mut().unwrap().remove(byte); } self } /// Enable specializing start states in the lazy DFA. 
/// /// When start states are specialized, an implementor of a search routine /// using a lazy DFA can tell when the search has entered a starting state. /// When start states aren't specialized, then it is impossible to know /// whether the search has entered a start state. /// /// Ideally, this option wouldn't need to exist and we could always /// specialize start states. The problem is that start states can be quite /// active. This in turn means that an efficient search routine is likely /// to ping-pong between a heavily optimized hot loop that handles most /// states and to a less optimized specialized handling of start states. /// This causes branches to get heavily mispredicted and overall can /// materially decrease throughput. Therefore, specializing start states /// should only be enabled when it is needed. /// /// Knowing whether a search is in a start state is typically useful when a /// prefilter is active for the search. A prefilter is typically only run /// when in a start state and a prefilter can greatly accelerate a search. /// Therefore, the possible cost of specializing start states is worth it /// in this case. Otherwise, if you have no prefilter, there is likely no /// reason to specialize start states. /// /// This is disabled by default, but note that it is automatically /// enabled (or disabled) if [`Config::prefilter`] is set. Namely, unless /// `specialize_start_states` has already been set, [`Config::prefilter`] /// will automatically enable or disable it based on whether a prefilter /// is present or not, respectively. This is done because a prefilter's /// effectiveness is rooted in being executed whenever the DFA is in a /// start state, and that's only possible to do when they are specialized. /// /// Note that it is plausibly reasonable to _disable_ this option /// explicitly while _enabling_ a prefilter. In that case, a prefilter /// will still be run at the beginning of a search, but never again. 
This /// in theory could strike a good balance if you're in a situation where a /// prefilter is likely to produce many false positive candidates. /// /// # Example /// /// This example shows how to enable start state specialization and then /// shows how to check whether a state is a start state or not. /// /// ``` /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; /// /// let dfa = DFA::builder() /// .configure(DFA::config().specialize_start_states(true)) /// .build(r"[a-z]+")?; /// let mut cache = dfa.create_cache(); /// /// let haystack = "123 foobar 4567".as_bytes(); /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; /// // The ID returned by 'start_state_forward' will always be tagged as /// // a start state when start state specialization is enabled. /// assert!(sid.is_tagged()); /// assert!(sid.is_start()); /// /// # Ok::<(), Box>(()) /// ``` /// /// Compare the above with the default lazy DFA configuration where /// start states are _not_ specialized. In this case, the start state /// is not tagged and `sid.is_start()` returns false. /// /// ``` /// use regex_automata::{hybrid::dfa::DFA, MatchError, Input}; /// /// let dfa = DFA::new(r"[a-z]+")?; /// let mut cache = dfa.create_cache(); /// /// let haystack = "123 foobar 4567".as_bytes(); /// let sid = dfa.start_state_forward(&mut cache, &Input::new(haystack))?; /// // Start states are not tagged in the default configuration! /// assert!(!sid.is_tagged()); /// assert!(!sid.is_start()); /// /// # Ok::<(), Box>(()) /// ``` pub fn specialize_start_states(mut self, yes: bool) -> Config { self.specialize_start_states = Some(yes); self } /// Sets the maximum amount of heap memory, in bytes, to allocate to the /// cache for use during a lazy DFA search. If the lazy DFA would otherwise /// use more heap memory, then, depending on other configuration knobs, /// either stop the search and return an error or clear the cache and /// continue the search. 
/// /// The default cache capacity is some "reasonable" number that will /// accommodate most regular expressions. You may find that if you need /// to build a large DFA then it may be necessary to increase the cache /// capacity. /// /// Note that while building a lazy DFA will do a "minimum" check to ensure /// the capacity is big enough, this is more or less about correctness. /// If the cache is bigger than the minimum but still "too small," then the /// lazy DFA could wind up spending a lot of time clearing the cache and /// recomputing transitions, thus negating the performance benefits of a /// lazy DFA. Thus, setting the cache capacity is mostly an experimental /// endeavor. For most common patterns, however, the default should be /// sufficient. /// /// For more details on how the lazy DFA's cache is used, see the /// documentation for [`Cache`]. /// /// # Example /// /// This example shows what happens if the configured cache capacity is /// too small. In such cases, one can override the cache capacity to make /// it bigger. Alternatively, one might want to use less memory by setting /// a smaller cache capacity. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let pattern = r"\p{L}{1000}"; /// /// // The default cache capacity is likely too small to deal with regexes /// // that are very large. Large repetitions of large Unicode character /// // classes are a common way to make very large regexes. /// let _ = DFA::new(pattern).unwrap_err(); /// // Bump up the capacity to something bigger. 
/// let dfa = DFA::builder()
///     .configure(DFA::config().cache_capacity(100 * (1<<20))) // 100 MB
///     .build(pattern)?;
/// let mut cache = dfa.create_cache();
///
/// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
/// let expected = Some(HalfMatch::must(0, 2000));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn cache_capacity(mut self, bytes: usize) -> Config {
    self.cache_capacity = Some(bytes);
    self
}

/// Configures construction of a lazy DFA to use the minimum cache capacity
/// if the configured capacity is otherwise too small for the provided NFA.
///
/// This is useful if you never want lazy DFA construction to fail because
/// of a capacity that is too small.
///
/// In general, this option is typically not a good idea. In particular,
/// while a minimum cache capacity does permit the lazy DFA to function
/// where it otherwise couldn't, it's plausible that it may not function
/// well if it's constantly running out of room. In that case, the speed
/// advantages of the lazy DFA may be negated. On the other hand, the
/// "minimum" cache capacity computed may not be completely accurate and
/// could actually be bigger than what is really necessary. Therefore, it
/// is plausible that using the minimum cache capacity could still result
/// in very good performance.
///
/// This is disabled by default.
///
/// # Example
///
/// This example shows what happens if the configured cache capacity is
/// too small. In such cases, one could override the capacity explicitly.
/// An alternative, demonstrated here, lets us force construction to use
/// the minimum cache capacity if the configured capacity is otherwise
/// too small.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
///
/// let pattern = r"\p{L}{1000}";
///
/// // The default cache capacity is likely too small to deal with regexes
/// // that are very large. Large repetitions of large Unicode character
/// // classes are a common way to make very large regexes.
/// let _ = DFA::new(pattern).unwrap_err();
/// // Configure construction such that it automatically selects the
/// // minimum cache capacity if it would otherwise be too small.
/// let dfa = DFA::builder()
///     .configure(DFA::config().skip_cache_capacity_check(true))
///     .build(pattern)?;
/// let mut cache = dfa.create_cache();
///
/// let haystack = "ͰͲͶͿΆΈΉΊΌΎΏΑΒΓΔΕΖΗΘΙ".repeat(50);
/// let expected = Some(HalfMatch::must(0, 2000));
/// let got = dfa.try_search_fwd(&mut cache, &Input::new(&haystack))?;
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn skip_cache_capacity_check(mut self, yes: bool) -> Config {
    self.skip_cache_capacity_check = Some(yes);
    self
}

/// Configure a lazy DFA search to quit after a certain number of cache
/// clearings.
///
/// When a minimum is set, then a lazy DFA search will *possibly* "give
/// up" after the minimum number of cache clearings has occurred. This is
/// typically useful in scenarios where callers want to detect whether the
/// lazy DFA search is "efficient" or not. If the cache is cleared too many
/// times, this is a good indicator that it is not efficient, and thus, the
/// caller may wish to use some other regex engine.
///
/// Note that the number of times a cache is cleared is a property of
/// the cache itself. Thus, if a cache is used in a subsequent search
/// with a similarly configured lazy DFA, then it could cause the
/// search to "give up" if the cache needed to be cleared, depending
/// on its internal count and configured minimum.
The cache clear /// count can only be reset to `0` via [`DFA::reset_cache`] (or /// [`Regex::reset_cache`](crate::hybrid::regex::Regex::reset_cache) if /// you're using the `Regex` API). /// /// By default, no minimum is configured. Thus, a lazy DFA search will /// never give up due to cache clearings. If you do set this option, you /// might consider also setting [`Config::minimum_bytes_per_state`] in /// order for the lazy DFA to take efficiency into account before giving /// up. /// /// # Example /// /// This example uses a somewhat pathological configuration to demonstrate /// the _possible_ behavior of cache clearing and how it might result /// in a search that returns an error. /// /// It is important to note that the precise mechanics of how and when /// a cache gets cleared is an implementation detail. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{hybrid::dfa::DFA, Input, MatchError, MatchErrorKind}; /// /// // This is a carefully chosen regex. The idea is to pick one /// // that requires some decent number of states (hence the bounded /// // repetition). But we specifically choose to create a class with an /// // ASCII letter and a non-ASCII letter so that we can check that no new /// // states are created once the cache is full. Namely, if we fill up the /// // cache on a haystack of 'a's, then in order to match one 'β', a new /// // state will need to be created since a 'β' is encoded with multiple /// // bytes. Since there's no room for this state, the search should quit /// // at the very first position. /// let pattern = r"[aβ]{100}"; /// let dfa = DFA::builder() /// .configure( /// // Configure it so that we have the minimum cache capacity /// // possible. And that if any clearings occur, the search quits. 
///         DFA::config()
///             .skip_cache_capacity_check(true)
///             .cache_capacity(0)
///             .minimum_cache_clear_count(Some(0)),
///     )
///     .build(pattern)?;
/// let mut cache = dfa.create_cache();
///
/// // Our search will give up before reaching the end!
/// let haystack = "a".repeat(101).into_bytes();
/// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
/// assert!(matches!(
///     *result.unwrap_err().kind(),
///     MatchErrorKind::GaveUp { .. },
/// ));
///
/// // Now that we know the cache is full, if we search a haystack that we
/// // know will require creating at least one new state, it should not
/// // be able to make much progress.
/// let haystack = "β".repeat(101).into_bytes();
/// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
/// assert!(matches!(
///     *result.unwrap_err().kind(),
///     MatchErrorKind::GaveUp { .. },
/// ));
///
/// // If we reset the cache, then we should be able to create more states
/// // and make more progress with searching for betas.
/// cache.reset(&dfa);
/// let haystack = "β".repeat(101).into_bytes();
/// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
/// assert!(matches!(
///     *result.unwrap_err().kind(),
///     MatchErrorKind::GaveUp { .. },
/// ));
///
/// // ... switching back to ASCII still makes progress since it just needs
/// // to set transitions on existing states!
/// let haystack = "a".repeat(101).into_bytes();
/// let result = dfa.try_search_fwd(&mut cache, &Input::new(&haystack));
/// assert!(matches!(
///     *result.unwrap_err().kind(),
///     MatchErrorKind::GaveUp { .. },
/// ));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn minimum_cache_clear_count(mut self, min: Option<usize>) -> Config {
    // Stored as `Some(min)` so that an explicit `None` from the caller is
    // distinguishable from "never configured".
    self.minimum_cache_clear_count = Some(min);
    self
}

/// Configure a lazy DFA search to quit only when its efficiency drops
/// below the given minimum.
///
/// The efficiency of the cache is determined by the number of DFA states
/// compiled per byte of haystack searched.
For example, if the efficiency /// is 2, then it means the lazy DFA is creating a new DFA state after /// searching approximately 2 bytes in a haystack. Generally speaking, 2 /// is quite bad and it's likely that even a slower regex engine like the /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) would be faster. /// /// This has no effect if [`Config::minimum_cache_clear_count`] is not set. /// Namely, this option only kicks in when the cache has been cleared more /// than the minimum number. If no minimum is set, then the cache is simply /// cleared whenever it fills up and it is impossible for the lazy DFA to /// quit due to ineffective use of the cache. /// /// In general, if one is setting [`Config::minimum_cache_clear_count`], /// then one should probably also set this knob as well. The reason is /// that the absolute number of times the cache is cleared is generally /// not a great predictor of efficiency. For example, if a new DFA state /// is created for every 1,000 bytes searched, then it wouldn't be hard /// for the cache to get cleared more than `N` times and then cause the /// lazy DFA to quit. But a new DFA state every 1,000 bytes is likely quite /// good from a performance perspective, and it's likely that the lazy /// DFA should continue searching, even if it requires clearing the cache /// occasionally. /// /// Finally, note that if you're implementing your own lazy DFA search /// routine and also want this efficiency check to work correctly, then /// you'll need to use the following routines to record search progress: /// /// * Call [`Cache::search_start`] at the beginning of every search. /// * Call [`Cache::search_update`] whenever [`DFA::next_state`] is /// called. /// * Call [`Cache::search_finish`] before completing a search. (It is /// not strictly necessary to call this when an error is returned, as /// `Cache::search_start` will automatically finish the previous search /// for you. 
But calling it where possible before returning helps improve /// the accuracy of how many bytes have actually been searched.) pub fn minimum_bytes_per_state(mut self, min: Option) -> Config { self.minimum_bytes_per_state = Some(min); self } /// Returns the match semantics set in this configuration. pub fn get_match_kind(&self) -> MatchKind { self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } /// Returns the prefilter set in this configuration, if one at all. pub fn get_prefilter(&self) -> Option<&Prefilter> { self.pre.as_ref().unwrap_or(&None).as_ref() } /// Returns whether this configuration has enabled anchored starting states /// for every pattern in the DFA. pub fn get_starts_for_each_pattern(&self) -> bool { self.starts_for_each_pattern.unwrap_or(false) } /// Returns whether this configuration has enabled byte classes or not. /// This is typically a debugging oriented option, as disabling it confers /// no speed benefit. pub fn get_byte_classes(&self) -> bool { self.byte_classes.unwrap_or(true) } /// Returns whether this configuration has enabled heuristic Unicode word /// boundary support. When enabled, it is possible for a search to return /// an error. pub fn get_unicode_word_boundary(&self) -> bool { self.unicode_word_boundary.unwrap_or(false) } /// Returns whether this configuration will instruct the lazy DFA to enter /// a quit state whenever the given byte is seen during a search. When at /// least one byte has this enabled, it is possible for a search to return /// an error. pub fn get_quit(&self, byte: u8) -> bool { self.quitset.map_or(false, |q| q.contains(byte)) } /// Returns whether this configuration will instruct the lazy DFA to /// "specialize" start states. When enabled, the lazy DFA will tag start /// states so that search routines using the lazy DFA can detect when /// it's in a start state and do some kind of optimization (like run a /// prefilter). 
pub fn get_specialize_start_states(&self) -> bool {
    self.specialize_start_states.unwrap_or(false)
}

/// Returns the cache capacity set on this configuration.
pub fn get_cache_capacity(&self) -> usize {
    // Falls back to 2 MiB when no capacity has been configured.
    self.cache_capacity.unwrap_or(2 * (1 << 20))
}

/// Returns whether the cache capacity check should be skipped.
pub fn get_skip_cache_capacity_check(&self) -> bool {
    self.skip_cache_capacity_check.unwrap_or(false)
}

/// Returns, if set, the minimum number of times the cache must be cleared
/// before a lazy DFA search can give up. When no minimum is set, then a
/// search will never quit and will always clear the cache whenever it
/// fills up.
pub fn get_minimum_cache_clear_count(&self) -> Option<usize> {
    self.minimum_cache_clear_count.unwrap_or(None)
}

/// Returns, if set, the minimum number of bytes per state that need to be
/// processed in order for the lazy DFA to keep going. If the minimum falls
/// below this number (and the cache has been cleared a minimum number of
/// times), then the lazy DFA will return a "gave up" error.
pub fn get_minimum_bytes_per_state(&self) -> Option<usize> {
    self.minimum_bytes_per_state.unwrap_or(None)
}

/// Returns the minimum lazy DFA cache capacity required for the given NFA.
///
/// The cache capacity required for a particular NFA may change without
/// notice. Callers should not rely on it being stable.
///
/// This is useful for informational purposes, but can also be useful for
/// other reasons. For example, if one wants to check the minimum cache
/// capacity themselves or if one wants to set the capacity based on the
/// minimum.
///
/// This may return an error if this configuration does not support all of
/// the instructions used in the given NFA. For example, if the NFA has a
/// Unicode word boundary but this configuration does not enable heuristic
/// support for Unicode word boundaries.
pub fn get_minimum_cache_capacity( &self, nfa: &thompson::NFA, ) -> Result { let quitset = self.quit_set_from_nfa(nfa)?; let classes = self.byte_classes_from_nfa(nfa, &quitset); let starts = self.get_starts_for_each_pattern(); Ok(minimum_cache_capacity(nfa, &classes, starts)) } /// Returns the byte class map used during search from the given NFA. /// /// If byte classes are disabled on this configuration, then a map is /// returned that puts each byte in its own equivalent class. fn byte_classes_from_nfa( &self, nfa: &thompson::NFA, quit: &ByteSet, ) -> ByteClasses { if !self.get_byte_classes() { // The lazy DFA will always use the equivalence class map, but // enabling this option is useful for debugging. Namely, this will // cause all transitions to be defined over their actual bytes // instead of an opaque equivalence class identifier. The former is // much easier to grok as a human. ByteClasses::singletons() } else { let mut set = nfa.byte_class_set().clone(); // It is important to distinguish any "quit" bytes from all other // bytes. Otherwise, a non-quit byte may end up in the same class // as a quit byte, and thus cause the DFA stop when it shouldn't. // // Test case: // // regex-cli find match hybrid --unicode-word-boundary \ // -p '^#' -p '\b10\.55\.182\.100\b' -y @conn.json.1000x.log if !quit.is_empty() { set.add_set(&quit); } set.byte_classes() } } /// Return the quit set for this configuration and the given NFA. /// /// This may return an error if the NFA is incompatible with this /// configuration's quit set. For example, if the NFA has a Unicode word /// boundary and the quit set doesn't include non-ASCII bytes. 
fn quit_set_from_nfa( &self, nfa: &thompson::NFA, ) -> Result { let mut quit = self.quitset.unwrap_or(ByteSet::empty()); if nfa.look_set_any().contains_word_unicode() { if self.get_unicode_word_boundary() { for b in 0x80..=0xFF { quit.add(b); } } else { // If heuristic support for Unicode word boundaries wasn't // enabled, then we can still check if our quit set is correct. // If the caller set their quit bytes in a way that causes the // DFA to quit on at least all non-ASCII bytes, then that's all // we need for heuristic support to work. if !quit.contains_range(0x80, 0xFF) { return Err( BuildError::unsupported_dfa_word_boundary_unicode(), ); } } } Ok(quit) } /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. fn overwrite(&self, o: Config) -> Config { Config { match_kind: o.match_kind.or(self.match_kind), pre: o.pre.or_else(|| self.pre.clone()), starts_for_each_pattern: o .starts_for_each_pattern .or(self.starts_for_each_pattern), byte_classes: o.byte_classes.or(self.byte_classes), unicode_word_boundary: o .unicode_word_boundary .or(self.unicode_word_boundary), quitset: o.quitset.or(self.quitset), specialize_start_states: o .specialize_start_states .or(self.specialize_start_states), cache_capacity: o.cache_capacity.or(self.cache_capacity), skip_cache_capacity_check: o .skip_cache_capacity_check .or(self.skip_cache_capacity_check), minimum_cache_clear_count: o .minimum_cache_clear_count .or(self.minimum_cache_clear_count), minimum_bytes_per_state: o .minimum_bytes_per_state .or(self.minimum_bytes_per_state), } } } /// A builder for constructing a lazy deterministic finite automaton from /// regular expressions. /// /// As a convenience, [`DFA::builder`] is an alias for [`Builder::new`]. 
The /// advantage of the former is that it often lets you avoid importing the /// `Builder` type directly. /// /// This builder provides two main things: /// /// 1. It provides a few different `build` routines for actually constructing /// a DFA from different kinds of inputs. The most convenient is /// [`Builder::build`], which builds a DFA directly from a pattern string. The /// most flexible is [`Builder::build_from_nfa`], which builds a DFA straight /// from an NFA. /// 2. The builder permits configuring a number of things. /// [`Builder::configure`] is used with [`Config`] to configure aspects of /// the DFA and the construction process itself. [`Builder::syntax`] and /// [`Builder::thompson`] permit configuring the regex parser and Thompson NFA /// construction, respectively. The syntax and thompson configurations only /// apply when building from a pattern string. /// /// This builder always constructs a *single* lazy DFA. As such, this builder /// can only be used to construct regexes that either detect the presence /// of a match or find the end location of a match. A single DFA cannot /// produce both the start and end of a match. For that information, use a /// [`Regex`](crate::hybrid::regex::Regex), which can be similarly configured /// using [`regex::Builder`](crate::hybrid::regex::Builder). The main reason /// to use a DFA directly is if the end location of a match is enough for your /// use case. Namely, a `Regex` will construct two lazy DFAs instead of one, /// since a second reverse DFA is needed to find the start of a match. /// /// # Example /// /// This example shows how to build a lazy DFA that uses a tiny cache capacity /// and completely disables Unicode. That is: /// /// * Things such as `\w`, `.` and `\b` are no longer Unicode-aware. `\w` /// and `\b` are ASCII-only while `.` matches any byte except for `\n` /// (instead of any UTF-8 encoding of a Unicode scalar value except for /// `\n`). 
Things that are Unicode only, such as `\pL`, are not allowed. /// * The pattern itself is permitted to match invalid UTF-8. For example, /// things like `[^a]` that match any byte except for `a` are permitted. /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// nfa::thompson, /// util::syntax, /// HalfMatch, Input, /// }; /// /// let dfa = DFA::builder() /// .configure(DFA::config().cache_capacity(5_000)) /// .thompson(thompson::Config::new().utf8(false)) /// .syntax(syntax::Config::new().unicode(false).utf8(false)) /// .build(r"foo[^b]ar.*")?; /// let mut cache = dfa.create_cache(); /// /// let haystack = b"\xFEfoo\xFFar\xE2\x98\xFF\n"; /// let expected = Some(HalfMatch::must(0, 10)); /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { config: Config, #[cfg(feature = "syntax")] thompson: thompson::Compiler, } impl Builder { /// Create a new lazy DFA builder with the default configuration. pub fn new() -> Builder { Builder { config: Config::default(), #[cfg(feature = "syntax")] thompson: thompson::Compiler::new(), } } /// Build a lazy DFA from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. #[cfg(feature = "syntax")] pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Build a lazy DFA from the given patterns. /// /// When matches are returned, the pattern ID corresponds to the index of /// the pattern in the slice given. #[cfg(feature = "syntax")] pub fn build_many>( &self, patterns: &[P], ) -> Result { let nfa = self .thompson .clone() // We can always forcefully disable captures because DFAs do not // support them. .configure( thompson::Config::new() .which_captures(thompson::WhichCaptures::None), ) .build_many(patterns) .map_err(BuildError::nfa)?; self.build_from_nfa(nfa) } /// Build a DFA from the given NFA. 
/// /// Note that this requires owning a `thompson::NFA`. While this may force /// you to clone the NFA, such a clone is not a deep clone. Namely, NFAs /// are defined internally to support shared ownership such that cloning is /// very cheap. /// /// # Example /// /// This example shows how to build a lazy DFA if you already have an NFA /// in hand. /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// nfa::thompson, /// HalfMatch, Input, /// }; /// /// let haystack = "foo123bar"; /// /// // This shows how to set non-default options for building an NFA. /// let nfa = thompson::Compiler::new() /// .configure(thompson::Config::new().shrink(true)) /// .build(r"[0-9]+")?; /// let dfa = DFA::builder().build_from_nfa(nfa)?; /// let mut cache = dfa.create_cache(); /// let expected = Some(HalfMatch::must(0, 6)); /// let got = dfa.try_search_fwd(&mut cache, &Input::new(haystack))?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` pub fn build_from_nfa( &self, nfa: thompson::NFA, ) -> Result { let quitset = self.config.quit_set_from_nfa(&nfa)?; let classes = self.config.byte_classes_from_nfa(&nfa, &quitset); // Check that we can fit at least a few states into our cache, // otherwise it's pretty senseless to use the lazy DFA. This does have // a possible failure mode though. This assumes the maximum size of a // state in powerset space (so, the total number of NFA states), which // may never actually materialize, and could be quite a bit larger // than the actual biggest state. If this turns out to be a problem, // we could expose a knob that disables this check. But if so, we have // to be careful not to panic in other areas of the code (the cache // clearing and init code) that tend to assume some minimum useful // cache capacity. 
let min_cache = minimum_cache_capacity( &nfa, &classes, self.config.get_starts_for_each_pattern(), ); let mut cache_capacity = self.config.get_cache_capacity(); if cache_capacity < min_cache { // When the caller has asked us to skip the cache capacity check, // then we simply force the cache capacity to its minimum amount // and mush on. if self.config.get_skip_cache_capacity_check() { debug!( "given capacity ({}) is too small, \ since skip_cache_capacity_check is enabled, \ setting cache capacity to minimum ({})", cache_capacity, min_cache, ); cache_capacity = min_cache; } else { return Err(BuildError::insufficient_cache_capacity( min_cache, cache_capacity, )); } } // We also need to check that we can fit at least some small number // of states in our state ID space. This is unlikely to trigger in // >=32-bit systems, but 16-bit systems have a pretty small state ID // space since a number of bits are used up as sentinels. if let Err(err) = minimum_lazy_state_id(&classes) { return Err(BuildError::insufficient_state_id_capacity(err)); } let stride2 = classes.stride2(); let start_map = StartByteMap::new(nfa.look_matcher()); Ok(DFA { config: self.config.clone(), nfa, stride2, start_map, classes, quitset, cache_capacity, }) } /// Apply the given lazy DFA configuration options to this builder. pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a lazy DFA directly from a /// pattern. #[cfg(feature = "syntax")] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). 
/// /// This permits setting things like whether the DFA should match the regex /// in reverse or if additional time should be spent shrinking the size of /// the NFA. /// /// These settings only apply when constructing a DFA directly from a /// pattern. #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } /// Represents the current state of an overlapping search. /// /// This is used for overlapping searches since they need to know something /// about the previous search. For example, when multiple patterns match at the /// same position, this state tracks the last reported pattern so that the next /// search knows whether to report another matching pattern or continue with /// the search at the next position. Additionally, it also tracks which state /// the last search call terminated in. /// /// This type provides little introspection capabilities. The only thing a /// caller can do is construct it and pass it around to permit search routines /// to use it to track state, and also ask whether a match has been found. /// /// Callers should always provide a fresh state constructed via /// [`OverlappingState::start`] when starting a new search. Reusing state from /// a previous search may result in incorrect results. #[derive(Clone, Debug, Eq, PartialEq)] pub struct OverlappingState { /// The match reported by the most recent overlapping search to use this /// state. /// /// If a search does not find any matches, then it is expected to clear /// this value. pub(crate) mat: Option, /// The state ID of the state at which the search was in when the call /// terminated. When this is a match state, `last_match` must be set to a /// non-None value. /// /// A `None` value indicates the start state of the corresponding /// automaton. We cannot use the actual ID, since any one automaton may /// have many start states, and which one is in use depends on several /// search-time factors. 
pub(crate) id: Option, /// The position of the search. /// /// When `id` is None (i.e., we are starting a search), this is set to /// the beginning of the search as given by the caller regardless of its /// current value. Subsequent calls to an overlapping search pick up at /// this offset. pub(crate) at: usize, /// The index into the matching patterns of the next match to report if the /// current state is a match state. Note that this may be 1 greater than /// the total number of matches to report for the current match state. (In /// which case, no more matches should be reported at the current position /// and the search should advance to the next position.) pub(crate) next_match_index: Option, /// This is set to true when a reverse overlapping search has entered its /// EOI transitions. /// /// This isn't used in a forward search because it knows to stop once the /// position exceeds the end of the search range. In a reverse search, /// since we use unsigned offsets, we don't "know" once we've gone past /// `0`. So the only way to detect it is with this extra flag. The reverse /// overlapping search knows to terminate specifically after it has /// reported all matches after following the EOI transition. pub(crate) rev_eoi: bool, } impl OverlappingState { /// Create a new overlapping state that begins at the start state of any /// automaton. pub fn start() -> OverlappingState { OverlappingState { mat: None, id: None, at: 0, next_match_index: None, rev_eoi: false, } } /// Return the match result of the most recent search to execute with this /// state. /// /// A searches will clear this result automatically, such that if no /// match is found, this will correctly report `None`. pub fn get_match(&self) -> Option { self.mat } } /// Runs the given overlapping `search` function (forwards or backwards) until /// a match is found whose offset does not split a codepoint. /// /// This is *not* always correct to call. 
It should only be called when the /// underlying NFA has UTF-8 mode enabled *and* it can produce zero-width /// matches. Calling this when both of those things aren't true might result /// in legitimate matches getting skipped. #[cold] #[inline(never)] fn skip_empty_utf8_splits_overlapping( input: &Input<'_>, state: &mut OverlappingState, mut search: F, ) -> Result<(), MatchError> where F: FnMut(&Input<'_>, &mut OverlappingState) -> Result<(), MatchError>, { // Note that this routine works for forwards and reverse searches // even though there's no code here to handle those cases. That's // because overlapping searches drive themselves to completion via // `OverlappingState`. So all we have to do is push it until no matches are // found. let mut hm = match state.get_match() { None => return Ok(()), Some(hm) => hm, }; if input.get_anchored().is_anchored() { if !input.is_char_boundary(hm.offset()) { state.mat = None; } return Ok(()); } while !input.is_char_boundary(hm.offset()) { search(input, state)?; hm = match state.get_match() { None => return Ok(()), Some(hm) => hm, }; } Ok(()) } /// Based on the minimum number of states required for a useful lazy DFA cache, /// this returns the minimum lazy state ID that must be representable. /// /// It's not likely for this to have any impact 32-bit systems (or higher), but /// on 16-bit systems, the lazy state ID space is quite constrained and thus /// may be insufficient if our MIN_STATES value is (for some reason) too high. fn minimum_lazy_state_id( classes: &ByteClasses, ) -> Result { let stride = 1 << classes.stride2(); let min_state_index = MIN_STATES.checked_sub(1).unwrap(); LazyStateID::new(min_state_index * stride) } /// Based on the minimum number of states required for a useful lazy DFA cache, /// this returns a heuristic minimum number of bytes of heap space required. /// /// This is a "heuristic" because the minimum it returns is likely bigger than /// the true minimum. 
Namely, it assumes that each powerset NFA/DFA state uses /// the maximum number of NFA states (all of them). This is likely bigger /// than what is required in practice. Computing the true minimum effectively /// requires determinization, which is probably too much work to do for a /// simple check like this. /// /// One of the issues with this approach IMO is that it requires that this /// be in sync with the calculation above for computing how much heap memory /// the DFA cache uses. If we get it wrong, it's possible for example for the /// minimum to be smaller than the computed heap memory, and thus, it may be /// the case that we can't add the required minimum number of states. That in /// turn will make lazy DFA panic because we assume that we can add at least a /// minimum number of states. /// /// Another approach would be to always allow the minimum number of states to /// be added to the lazy DFA cache, even if it exceeds the configured cache /// limit. This does mean that the limit isn't really a limit in all cases, /// which is unfortunate. But it does at least guarantee that the lazy DFA can /// always make progress, even if it is slow. (This approach is very similar to /// enabling the 'skip_cache_capacity_check' config knob, except it wouldn't /// rely on cache size calculation. Instead, it would just always permit a /// minimum number of states to be added.) 
fn minimum_cache_capacity( nfa: &thompson::NFA, classes: &ByteClasses, starts_for_each_pattern: bool, ) -> usize { const ID_SIZE: usize = size_of::(); const STATE_SIZE: usize = size_of::(); let stride = 1 << classes.stride2(); let states_len = nfa.states().len(); let sparses = 2 * states_len * NFAStateID::SIZE; let trans = MIN_STATES * stride * ID_SIZE; let mut starts = Start::len() * ID_SIZE; if starts_for_each_pattern { starts += (Start::len() * nfa.pattern_len()) * ID_SIZE; } // The min number of states HAS to be at least 4: we have 3 sentinel states // and then we need space for one more when we save a state after clearing // the cache. We also need space for one more, otherwise we get stuck in a // loop where we try to add a 5th state, which gets rejected, which clears // the cache, which adds back a saved state (4th total state) which then // tries to add the 5th state again. assert!(MIN_STATES >= 5, "minimum number of states has to be at least 5"); // The minimum number of non-sentinel states. We consider this separately // because sentinel states are much smaller in that they contain no NFA // states. Given our aggressive calculation here, it's worth being more // precise with the number of states we need. let non_sentinel = MIN_STATES.checked_sub(SENTINEL_STATES).unwrap(); // Every `State` has 5 bytes for flags, 4 bytes (max) for the number of // patterns, followed by 32-bit encodings of patterns and then delta // varint encodings of NFA state IDs. We use the worst case (which isn't // technically possible) of 5 bytes for each NFA state ID. // // HOWEVER, three of the states needed by a lazy DFA are just the sentinel // unknown, dead and quit states. Those states have a known size and it is // small. 
let dead_state_size = State::dead().memory_usage(); let max_state_size = 5 + 4 + (nfa.pattern_len() * 4) + (states_len * 5); let states = (SENTINEL_STATES * (STATE_SIZE + dead_state_size)) + (non_sentinel * (STATE_SIZE + max_state_size)); // NOTE: We don't double count heap memory used by State for this map since // we use reference counting to avoid doubling memory usage. (This tends to // be where most memory is allocated in the cache.) let states_to_sid = (MIN_STATES * STATE_SIZE) + (MIN_STATES * ID_SIZE); let stack = states_len * NFAStateID::SIZE; let scratch_state_builder = max_state_size; trans + starts + states + states_to_sid + sparses + stack + scratch_state_builder } #[cfg(all(test, feature = "syntax"))] mod tests { use super::*; // Tests that we handle heuristic Unicode word boundary support in reverse // DFAs in the specific case of contextual searches. // // I wrote this test when I discovered a bug in how heuristic word // boundaries were handled. Namely, that the starting state selection // didn't consider the DFA's quit byte set when looking at the byte // immediately before the start of the search (or immediately after the // end of the search in the case of a reverse search). As a result, it was // possible for '\bfoo\b' to match 'β123' because the trailing \xB2 byte // in the 'β' codepoint would be treated as a non-word character. But of // course, this search should trigger the DFA to quit, since there is a // non-ASCII byte in consideration. // // Thus, I fixed 'start_state_{forward,reverse}' to check the quit byte set // if it wasn't empty. The forward case is tested in the doc test for the // Config::unicode_word_boundary API. We test the reverse case here, which // is sufficiently niche that it doesn't really belong in a doc test. 
#[test] fn heuristic_unicode_reverse() { let dfa = DFA::builder() .configure(DFA::config().unicode_word_boundary(true)) .thompson(thompson::Config::new().reverse(true)) .build(r"\b[0-9]+\b") .unwrap(); let mut cache = dfa.create_cache(); let input = Input::new("β123").range(2..); let expected = MatchError::quit(0xB2, 1); let got = dfa.try_search_rev(&mut cache, &input); assert_eq!(Err(expected), got); let input = Input::new("123β").range(..3); let expected = MatchError::quit(0xCE, 3); let got = dfa.try_search_rev(&mut cache, &input); assert_eq!(Err(expected), got); } } regex-automata-0.4.9/src/hybrid/error.rs000064400000000000000000000204241046102023000163320ustar 00000000000000use crate::{hybrid::id::LazyStateIDError, nfa, util::search::Anchored}; /// An error that occurs when initial construction of a lazy DFA fails. /// /// A build error can occur when insufficient cache capacity is configured or /// if something about the NFA is unsupported. (For example, if one attempts /// to build a lazy DFA without heuristic Unicode support but with an NFA that /// contains a Unicode word boundary.) /// /// This error does not provide many introspection capabilities. There are /// generally only two things you can do with it: /// /// * Obtain a human readable message via its `std::fmt::Display` impl. /// * Access an underlying /// [`nfa::thompson::BuildError`](crate::nfa::thompson::BuildError) /// type from its `source` method via the `std::error::Error` trait. This error /// only occurs when using convenience routines for building a lazy DFA /// directly from a pattern string. /// /// When the `std` feature is enabled, this implements the `std::error::Error` /// trait. 
#[derive(Clone, Debug)] pub struct BuildError { kind: BuildErrorKind, } #[derive(Clone, Debug)] enum BuildErrorKind { NFA(nfa::thompson::BuildError), InsufficientCacheCapacity { minimum: usize, given: usize }, InsufficientStateIDCapacity { err: LazyStateIDError }, Unsupported(&'static str), } impl BuildError { pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError { BuildError { kind: BuildErrorKind::NFA(err) } } pub(crate) fn insufficient_cache_capacity( minimum: usize, given: usize, ) -> BuildError { BuildError { kind: BuildErrorKind::InsufficientCacheCapacity { minimum, given }, } } pub(crate) fn insufficient_state_id_capacity( err: LazyStateIDError, ) -> BuildError { BuildError { kind: BuildErrorKind::InsufficientStateIDCapacity { err }, } } pub(crate) fn unsupported_dfa_word_boundary_unicode() -> BuildError { let msg = "cannot build lazy DFAs for regexes with Unicode word \ boundaries; switch to ASCII word boundaries, or \ heuristically enable Unicode word boundaries or use a \ different regex engine"; BuildError { kind: BuildErrorKind::Unsupported(msg) } } } #[cfg(feature = "std")] impl std::error::Error for BuildError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self.kind { BuildErrorKind::NFA(ref err) => Some(err), _ => None, } } } impl core::fmt::Display for BuildError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self.kind { BuildErrorKind::NFA(_) => write!(f, "error building NFA"), BuildErrorKind::InsufficientCacheCapacity { minimum, given } => { write!( f, "given cache capacity ({}) is smaller than \ minimum required ({})", given, minimum, ) } BuildErrorKind::InsufficientStateIDCapacity { ref err } => { err.fmt(f) } BuildErrorKind::Unsupported(ref msg) => { write!(f, "unsupported regex feature for DFAs: {}", msg) } } } } /// An error that can occur when computing the start state for a search. 
/// /// Computing a start state can fail for a few reasons, either /// based on incorrect configuration or even based on whether /// the look-behind byte triggers a quit state. Typically /// one does not need to handle this error if you're using /// [`DFA::start_state_forward`](crate::hybrid::dfa::DFA::start_state_forward) /// (or its reverse counterpart), as that routine automatically converts /// `StartError` to a [`MatchError`](crate::MatchError) for you. /// /// This error may be returned by the /// [`DFA::start_state`](crate::hybrid::dfa::DFA::start_state) routine. /// /// This error implements the `std::error::Error` trait when the `std` feature /// is enabled. /// /// This error is marked as non-exhaustive. New variants may be added in a /// semver compatible release. #[non_exhaustive] #[derive(Clone, Debug)] pub enum StartError { /// An error that occurs when cache inefficiency has dropped below the /// configured heuristic thresholds. Cache { /// The underlying cache error that occurred. err: CacheError, }, /// An error that occurs when a starting configuration's look-behind byte /// is in this DFA's quit set. Quit { /// The quit byte that was found. byte: u8, }, /// An error that occurs when the caller requests an anchored mode that /// isn't supported by the DFA. UnsupportedAnchored { /// The anchored mode given that is unsupported. 
mode: Anchored, }, } impl StartError { pub(crate) fn cache(err: CacheError) -> StartError { StartError::Cache { err } } pub(crate) fn quit(byte: u8) -> StartError { StartError::Quit { byte } } pub(crate) fn unsupported_anchored(mode: Anchored) -> StartError { StartError::UnsupportedAnchored { mode } } } #[cfg(feature = "std")] impl std::error::Error for StartError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match *self { StartError::Cache { ref err } => Some(err), _ => None, } } } impl core::fmt::Display for StartError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match *self { StartError::Cache { .. } => write!( f, "error computing start state because of cache inefficiency" ), StartError::Quit { byte } => write!( f, "error computing start state because the look-behind byte \ {:?} triggered a quit state", crate::util::escape::DebugByte(byte), ), StartError::UnsupportedAnchored { mode: Anchored::Yes } => { write!( f, "error computing start state because \ anchored searches are not supported or enabled" ) } StartError::UnsupportedAnchored { mode: Anchored::No } => { write!( f, "error computing start state because \ unanchored searches are not supported or enabled" ) } StartError::UnsupportedAnchored { mode: Anchored::Pattern(pid), } => { write!( f, "error computing start state because \ anchored searches for a specific pattern ({}) \ are not supported or enabled", pid.as_usize(), ) } } } } /// An error that occurs when cache usage has become inefficient. /// /// One of the weaknesses of a lazy DFA is that it may need to clear its /// cache repeatedly if it's not big enough. If this happens too much, then it /// can slow searching down significantly. A mitigation to this is to use /// heuristics to detect whether the cache is being used efficiently or not. /// If not, then a lazy DFA can return a `CacheError`. 
///
/// The default configuration of a lazy DFA in this crate is
/// set such that a `CacheError` will never occur. Instead,
/// callers must opt into this behavior with settings like
/// [`dfa::Config::minimum_cache_clear_count`](crate::hybrid::dfa::Config::minimum_cache_clear_count)
/// and
/// [`dfa::Config::minimum_bytes_per_state`](crate::hybrid::dfa::Config::minimum_bytes_per_state).
///
/// When the `std` feature is enabled, this implements the `std::error::Error`
/// trait.
#[derive(Clone, Debug)]
pub struct CacheError(());

impl CacheError {
    /// Constructs the error used when the cache was cleared too often.
    pub(crate) fn too_many_cache_clears() -> CacheError {
        CacheError(())
    }

    /// Constructs the error used when cache efficiency is too low.
    ///
    /// The value carries no payload, so it is indistinguishable from the one
    /// built by `too_many_cache_clears` and shares its `Display` message.
    pub(crate) fn bad_efficiency() -> CacheError {
        CacheError(())
    }
}

#[cfg(feature = "std")]
impl std::error::Error for CacheError {}

impl core::fmt::Display for CacheError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(f, "lazy DFA cache has been cleared too many times")
    }
}

// NOTE(review): archive member boundary. Extraction left this raw tar header
// fused into the source text; it is kept here as a comment:
// regex-automata-0.4.9/src/hybrid/id.rs000064400000000000000000000344451046102023000156050ustar 00000000000000
// Everything below belongs to `src/hybrid/id.rs`.

/// A state identifier specifically tailored for lazy DFAs.
///
/// A lazy state ID logically represents a pointer to a DFA state. In practice,
/// by limiting the number of DFA states it can address, it reserves some
/// bits of its representation to encode some additional information. That
/// additional information is called a "tag." That tag is used to record
/// whether the state it points to is an unknown, dead, quit, start or match
/// state.
///
/// When implementing a low level search routine with a lazy DFA, it is
/// necessary to query the type of the current state to know what to do:
///
/// * **Unknown** - The state has not yet been computed. The
/// parameters used to get this state ID must be re-passed to
/// [`DFA::next_state`](crate::hybrid::dfa::DFA::next_state), which will never
/// return an unknown state ID.
/// * **Dead** - A dead state only has transitions to itself.
/// It indicates that
/// the search cannot do anything else and should stop with whatever result it
/// has.
/// * **Quit** - A quit state indicates that the automaton could not answer
/// whether a match exists or not. Correct search implementations must return a
/// [`MatchError::quit`](crate::MatchError::quit) when a DFA enters a quit
/// state.
/// * **Start** - A start state is a state in which a search can begin.
/// Lazy DFAs usually have more than one start state. Branching on
/// this isn't required for correctness, but a common optimization is
/// to run a prefilter when a search enters a start state. Note that
/// start states are *not* tagged automatically, and one must enable the
/// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states)
/// setting for start states to be tagged. The reason for this is
/// that a DFA search loop is usually written to execute a prefilter once it
/// enters a start state. But if there is no prefilter, this handling can be
/// quite disastrous as the DFA may ping-pong between the special handling code
/// and a possible optimized hot path for handling untagged states. When start
/// states aren't specialized, then they are untagged and remain in the hot
/// path.
/// * **Match** - A match state indicates that a match has been found.
/// Depending on the semantics of your search implementation, it may either
/// continue until the end of the haystack or a dead state, or it might quit
/// and return the match immediately.
///
/// As an optimization, the [`is_tagged`](LazyStateID::is_tagged) predicate
/// can be used to determine if a tag exists at all. This is useful to avoid
/// branching on all of the above types for every byte searched.
///
/// # Example
///
/// This example shows how `LazyStateID` can be used to implement a correct
/// search routine with minimal branching.
/// In particular, this search routine
/// implements "leftmost" matching, which means that it doesn't immediately
/// stop once a match is found. Instead, it continues until it reaches a dead
/// state.
///
/// Notice also how a correct search implementation deals with
/// [`CacheError`](crate::hybrid::CacheError)s returned by some of
/// the lazy DFA routines. When a `CacheError` occurs, it returns
/// [`MatchError::gave_up`](crate::MatchError::gave_up).
///
/// ```
/// use regex_automata::{
///     hybrid::dfa::{Cache, DFA},
///     HalfMatch, MatchError, Input,
/// };
///
/// fn find_leftmost_first(
///     dfa: &DFA,
///     cache: &mut Cache,
///     haystack: &[u8],
/// ) -> Result<Option<HalfMatch>, MatchError> {
///     // The start state is determined by inspecting the position and the
///     // initial bytes of the haystack. Note that start states can never
///     // be match states (since DFAs in this crate delay matches by 1
///     // byte), so we don't need to check if the start state is a match.
///     let mut sid = dfa.start_state_forward(
///         cache,
///         &Input::new(haystack),
///     )?;
///     let mut last_match = None;
///     // Walk all the bytes in the haystack. We can quit early if we see
///     // a dead or a quit state. The former means the automaton will
///     // never transition to any other state. The latter means that the
///     // automaton entered a condition in which its search failed.
///     for (i, &b) in haystack.iter().enumerate() {
///         sid = dfa
///             .next_state(cache, sid, b)
///             .map_err(|_| MatchError::gave_up(i))?;
///         if sid.is_tagged() {
///             if sid.is_match() {
///                 last_match = Some(HalfMatch::new(
///                     dfa.match_pattern(cache, sid, 0),
///                     i,
///                 ));
///             } else if sid.is_dead() {
///                 return Ok(last_match);
///             } else if sid.is_quit() {
///                 // It is possible to enter into a quit state after
///                 // observing a match has occurred. In that case, we
///                 // should return the match instead of an error.
///                 if last_match.is_some() {
///                     return Ok(last_match);
///                 }
///                 return Err(MatchError::quit(b, i));
///             }
///             // Implementors may also want to check for start states and
///             // handle them differently for performance reasons. But it is
///             // not necessary for correctness. Note that in order to check
///             // for start states, you'll need to enable the
///             // 'specialize_start_states' config knob, otherwise start
///             // states will not be tagged.
///         }
///     }
///     // Matches are always delayed by 1 byte, so we must explicitly walk
///     // the special "EOI" transition at the end of the search.
///     sid = dfa
///         .next_eoi_state(cache, sid)
///         .map_err(|_| MatchError::gave_up(haystack.len()))?;
///     if sid.is_match() {
///         last_match = Some(HalfMatch::new(
///             dfa.match_pattern(cache, sid, 0),
///             haystack.len(),
///         ));
///     }
///     Ok(last_match)
/// }
///
/// // We use a greedy '+' operator to show how the search doesn't just stop
/// // once a match is detected. It continues extending the match. Using
/// // '[a-z]+?' would also work as expected and stop the search early.
/// // Greediness is built into the automaton.
/// let dfa = DFA::new(r"[a-z]+")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 10);
///
/// // Here's another example that tests our handling of the special
/// // EOI transition. This will fail to find a match if we don't call
/// // 'next_eoi_state' at the end of the search since the match isn't found
/// // until the final byte in the haystack.
/// let dfa = DFA::new(r"[0-9]{4}")?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 15);
///
/// // And note that our search implementation above automatically works
/// // with multi-DFAs. Namely, `dfa.match_pattern(cache, sid, 0)` selects
/// // the appropriate pattern ID for us.
/// let dfa = DFA::new_many(&[r"[a-z]+", r"[0-9]+"])?;
/// let mut cache = dfa.create_cache();
/// let haystack = "123 foobar 4567".as_bytes();
/// let mat = find_leftmost_first(&dfa, &mut cache, haystack)?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 3);
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[3..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 0);
/// assert_eq!(mat.offset(), 7);
/// let mat = find_leftmost_first(&dfa, &mut cache, &haystack[10..])?.unwrap();
/// assert_eq!(mat.pattern().as_usize(), 1);
/// assert_eq!(mat.offset(), 5);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(
    Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord,
)]
pub struct LazyStateID(u32);

impl LazyStateID {
    // Index of the highest bit available for tagging. The five bits from
    // `MAX_BIT` downward are the tag bits; everything below them holds the
    // untagged state identifier.
    #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
    const MAX_BIT: usize = 31;

    #[cfg(target_pointer_width = "16")]
    const MAX_BIT: usize = 15;

    const MASK_UNKNOWN: usize = 1 << (LazyStateID::MAX_BIT);
    const MASK_DEAD: usize = 1 << (LazyStateID::MAX_BIT - 1);
    const MASK_QUIT: usize = 1 << (LazyStateID::MAX_BIT - 2);
    const MASK_START: usize = 1 << (LazyStateID::MAX_BIT - 3);
    const MASK_MATCH: usize = 1 << (LazyStateID::MAX_BIT - 4);
    // The largest untagged identifier: every bit below the lowest tag bit.
    const MAX: usize = LazyStateID::MASK_MATCH - 1;

    /// Create a new lazy state ID.
    ///
    /// If the given identifier exceeds [`LazyStateID::MAX`], then this returns
    /// an error.
    #[inline]
    pub(crate) fn new(id: usize) -> Result<LazyStateID, LazyStateIDError> {
        if id > LazyStateID::MAX {
            let attempted = u64::try_from(id).unwrap();
            return Err(LazyStateIDError { attempted });
        }
        Ok(LazyStateID::new_unchecked(id))
    }

    /// Create a new lazy state ID without checking whether the given value
    /// exceeds [`LazyStateID::MAX`].
    ///
    /// While this is unchecked, providing an incorrect value must never
    /// sacrifice memory safety.
    #[inline]
    const fn new_unchecked(id: usize) -> LazyStateID {
        // FIXME: Use as_u32() once const functions in traits are stable.
        LazyStateID(id as u32)
    }

    /// Return this lazy state ID as an untagged `usize`.
    ///
    /// If this lazy state ID is tagged, then the usize returned is the state
    /// ID without the tag. If the ID was not tagged, then the usize returned
    /// is equivalent to the state ID.
    #[inline]
    pub(crate) fn as_usize_untagged(&self) -> usize {
        self.as_usize_unchecked() & LazyStateID::MAX
    }

    /// Return this lazy state ID as its raw internal `usize` value, which may
    /// be tagged (and thus greater than LazyStateID::MAX).
    #[inline]
    pub(crate) const fn as_usize_unchecked(&self) -> usize {
        // FIXME: Use as_usize() once const functions in traits are stable.
        self.0 as usize
    }

    /// Return this lazy state ID as a lazy state ID that is tagged as an
    /// unknown state.
    #[inline]
    pub(crate) const fn to_unknown(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_UNKNOWN,
        )
    }

    /// Return this lazy state ID as a lazy state ID that is tagged as a dead
    /// state.
    #[inline]
    pub(crate) const fn to_dead(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_DEAD,
        )
    }

    /// Return this lazy state ID as a lazy state ID that is tagged as a quit
    /// state.
    #[inline]
    pub(crate) const fn to_quit(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_QUIT,
        )
    }

    /// Return this lazy state ID as a state ID that is tagged as a start
    /// state.
    #[inline]
    pub(crate) const fn to_start(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_START,
        )
    }

    /// Return this lazy state ID as a lazy state ID that is tagged as a match
    /// state.
    #[inline]
    pub(crate) const fn to_match(&self) -> LazyStateID {
        LazyStateID::new_unchecked(
            self.as_usize_unchecked() | LazyStateID::MASK_MATCH,
        )
    }

    /// Return true if and only if this lazy state ID is tagged.
    ///
    /// When a lazy state ID is tagged, then one can conclude that it is one
    /// of a match, start, dead, quit or unknown state.
    #[inline]
    pub const fn is_tagged(&self) -> bool {
        self.as_usize_unchecked() > LazyStateID::MAX
    }

    /// Return true if and only if this represents a lazy state ID that is
    /// "unknown." That is, the state has not yet been created. When a caller
    /// sees this state ID, it generally means that a state has to be computed
    /// in order to proceed.
    #[inline]
    pub const fn is_unknown(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_UNKNOWN > 0
    }

    /// Return true if and only if this represents a dead state. A dead state
    /// is a state that can never transition to any other state except the
    /// dead state. When a dead state is seen, it generally indicates that a
    /// search should stop.
    #[inline]
    pub const fn is_dead(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_DEAD > 0
    }

    /// Return true if and only if this represents a quit state. A quit state
    /// is a state that is representationally equivalent to a dead state,
    /// except it indicates the automaton has reached a point at which it can
    /// no longer determine whether a match exists or not. In general, this
    /// indicates an error during search and the caller must either pass this
    /// error up or use a different search technique.
    #[inline]
    pub const fn is_quit(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_QUIT > 0
    }

    /// Return true if and only if this lazy state ID has been tagged as a
    /// start state.
    ///
    /// Note that if
    /// [`Config::specialize_start_states`](crate::hybrid::dfa::Config::specialize_start_states)
    /// is disabled (which is the default), then this will always return false
    /// since start states won't be tagged.
    #[inline]
    pub const fn is_start(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_START > 0
    }

    /// Return true if and only if this lazy state ID has been tagged as a
    /// match state.
    #[inline]
    pub const fn is_match(&self) -> bool {
        self.as_usize_unchecked() & LazyStateID::MASK_MATCH > 0
    }
}

/// This error occurs when a lazy state ID could not be constructed.
///
/// This occurs when given an integer exceeding the maximum lazy state ID
/// value.
///
/// When the `std` feature is enabled, this implements the `Error` trait.
#[derive(Clone, Debug, Eq, PartialEq)]
pub(crate) struct LazyStateIDError {
    attempted: u64,
}

impl LazyStateIDError {
    /// Returns the value that failed to construct a lazy state ID.
    pub(crate) fn attempted(&self) -> u64 {
        self.attempted
    }
}

#[cfg(feature = "std")]
impl std::error::Error for LazyStateIDError {}

impl core::fmt::Display for LazyStateIDError {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "failed to create LazyStateID from {:?}, which exceeds {:?}",
            self.attempted(),
            LazyStateID::MAX,
        )
    }
}
This includes iterating over matches with both the start and end positions of each match. * A [`dfa::DFA`] provides direct low level access to a lazy DFA. # Example: basic regex searching This example shows how to compile a regex using the default configuration and then use it to find matches in a byte string: ``` use regex_automata::{hybrid::regex::Regex, Match}; let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; let mut cache = re.create_cache(); let haystack = "2018-12-24 2016-10-08"; let matches: Vec = re.find_iter(&mut cache, haystack).collect(); assert_eq!(matches, vec![ Match::must(0, 0..10), Match::must(0, 11..21), ]); # Ok::<(), Box>(()) ``` # Example: searching with multiple regexes The lazy DFAs in this module all fully support searching with multiple regexes simultaneously. You can use this support with standard leftmost-first style searching to find non-overlapping matches: ``` # if cfg!(miri) { return Ok(()); } // miri takes too long use regex_automata::{hybrid::regex::Regex, Match}; let re = Regex::new_many(&[r"\w+", r"\S+"])?; let mut cache = re.create_cache(); let haystack = "@foo bar"; let matches: Vec = re.find_iter(&mut cache, haystack).collect(); assert_eq!(matches, vec![ Match::must(1, 0..4), Match::must(0, 5..8), ]); # Ok::<(), Box>(()) ``` # When should I use this? Generally speaking, if you can abide the use of mutable state during search, and you don't need things like capturing groups or Unicode word boundary support in non-ASCII text, then a lazy DFA is likely a robust choice with respect to both search speed and memory usage. Note however that its speed may be worse than a general purpose regex engine if you don't select a good [prefilter](crate::util::prefilter). If you know ahead of time that your pattern would result in a very large DFA if it was fully compiled, it may be better to use an NFA simulation instead of a lazy DFA. 
Either that, or increase the cache capacity of your lazy DFA to something that is big enough to hold the state machine (likely through experimentation). The issue here is that if the cache is too small, then it could wind up being reset too frequently and this might decrease searching speed significantly. # Differences with fully compiled DFAs A [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) and a [`dfa::regex::Regex`](crate::dfa::regex::Regex) both have the same capabilities (and similarly for their underlying DFAs), but they achieve them through different means. The main difference is that a hybrid or "lazy" regex builds its DFA lazily during search, where as a fully compiled regex will build its DFA at construction time. While building a DFA at search time might sound like it's slow, it tends to work out where most bytes seen during a search will reuse pre-built parts of the DFA and thus can be almost as fast as a fully compiled DFA. The main downside is that searching requires mutable space to store the DFA, and, in the worst case, a search can result in a new state being created for each byte seen, which would make searching quite a bit slower. A fully compiled DFA never has to worry about searches being slower once it's built. (Aside from, say, the transition table being so large that it is subject to harsh CPU cache effects.) However, of course, building a full DFA can be quite time consuming and memory hungry. Particularly when large Unicode character classes are used, which tend to translate into very large DFAs. A lazy DFA strikes a nice balance _in practice_, particularly in the presence of Unicode mode, by only building what is needed. It avoids the worst case exponential time complexity of DFA compilation by guaranteeing that it will only build at most one state per byte searched. While the worst case here can lead to a very high constant, it will never be exponential. 
# Syntax This module supports the same syntax as the `regex` crate, since they share the same parser. You can find an exhaustive list of supported syntax in the [documentation for the `regex` crate](https://docs.rs/regex/1/regex/#syntax). There are two things that are not supported by the lazy DFAs in this module: * Capturing groups. The DFAs (and [`Regex`](regex::Regex)es built on top of them) can only find the offsets of an entire match, but cannot resolve the offsets of each capturing group. This is because DFAs do not have the expressive power necessary. Note that it is okay to build a lazy DFA from an NFA that contains capture groups. The capture groups will simply be ignored. * Unicode word boundaries. These present particularly difficult challenges for DFA construction and would result in an explosion in the number of states. One can enable [`dfa::Config::unicode_word_boundary`] though, which provides heuristic support for Unicode word boundaries that only works on ASCII text. Otherwise, one can use `(?-u:\b)` for an ASCII word boundary, which will work on any input. There are no plans to lift either of these limitations. Note that these restrictions are identical to the restrictions on fully compiled DFAs. */ pub use self::{ error::{BuildError, CacheError, StartError}, id::LazyStateID, }; pub mod dfa; mod error; mod id; pub mod regex; mod search; regex-automata-0.4.9/src/hybrid/regex.rs000064400000000000000000001033431046102023000163150ustar 00000000000000/*! A lazy DFA backed `Regex`. This module provides a [`Regex`] backed by a lazy DFA. A `Regex` implements convenience routines you might have come to expect, such as finding a match and iterating over all non-overlapping matches. This `Regex` type is limited in its capabilities to what a lazy DFA can provide. Therefore, APIs involving capturing groups, for example, are not provided. Internally, a `Regex` is composed of two DFAs. 
One is a "forward" DFA that finds the end offset of a match, where as the other is a "reverse" DFA that find the start offset of a match. See the [parent module](crate::hybrid) for examples. */ use crate::{ hybrid::{ dfa::{self, DFA}, error::BuildError, }, nfa::thompson, util::{ iter, search::{Anchored, Input, Match, MatchError, MatchKind}, }, }; /// A regular expression that uses hybrid NFA/DFAs (also called "lazy DFAs") /// for searching. /// /// A regular expression is comprised of two lazy DFAs, a "forward" DFA and a /// "reverse" DFA. The forward DFA is responsible for detecting the end of /// a match while the reverse DFA is responsible for detecting the start /// of a match. Thus, in order to find the bounds of any given match, a /// forward search must first be run followed by a reverse search. A match /// found by the forward DFA guarantees that the reverse DFA will also find /// a match. /// /// # Fallibility /// /// Most of the search routines defined on this type will _panic_ when the /// underlying search fails. This might be because the DFA gave up because it /// saw a quit byte, whether configured explicitly or via heuristic Unicode /// word boundary support, although neither are enabled by default. It might /// also fail if the underlying DFA determines it isn't making effective use of /// the cache (which also never happens by default). Or it might fail because /// an invalid `Input` configuration is given, for example, with an unsupported /// [`Anchored`] mode. /// /// If you need to handle these error cases instead of allowing them to trigger /// a panic, then the lower level [`Regex::try_search`] provides a fallible API /// that never panics. /// /// # Example /// /// This example shows how to cause a search to terminate if it sees a /// `\n` byte, and handle the error returned. This could be useful if, for /// example, you wanted to prevent a user supplied pattern from matching /// across a line boundary. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{hybrid::{dfa, regex::Regex}, Input, MatchError}; /// /// let re = Regex::builder() /// .dfa(dfa::Config::new().quit(b'\n', true)) /// .build(r"foo\p{any}+bar")?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("foo\nbar"); /// // Normally this would produce a match, since \p{any} contains '\n'. /// // But since we instructed the automaton to enter a quit state if a /// // '\n' is observed, this produces a match error instead. /// let expected = MatchError::quit(b'\n', 3); /// let got = re.try_search(&mut cache, &input).unwrap_err(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Debug)] pub struct Regex { /// The forward lazy DFA. This can only find the end of a match. forward: DFA, /// The reverse lazy DFA. This can only find the start of a match. /// /// This is built with 'all' match semantics (instead of leftmost-first) /// so that it always finds the longest possible match (which corresponds /// to the leftmost starting position). It is also compiled as an anchored /// matcher and has 'starts_for_each_pattern' enabled. Including starting /// states for each pattern is necessary to ensure that we only look for /// matches of a pattern that matched in the forward direction. Otherwise, /// we might wind up finding the "leftmost" starting position of a totally /// different pattern! reverse: DFA, } /// Convenience routines for regex and cache construction. impl Regex { /// Parse the given regular expression using the default configuration and /// return the corresponding regex. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. 
/// /// # Example /// /// ``` /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Match::must(0, 3..14)), /// re.find(&mut cache, "zzzfoo12345barzzz"), /// ); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result { Regex::builder().build(pattern) } /// Like `new`, but parses multiple patterns into a single "multi regex." /// This similarly uses the default regex configuration. /// /// # Example /// /// ``` /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new_many(&["[a-z]+", "[0-9]+"])?; /// let mut cache = re.create_cache(); /// /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new_many>( patterns: &[P], ) -> Result { Regex::builder().build_many(patterns) } /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the /// [`Builder`] type in common cases. /// /// # Example /// /// This example shows how to use the builder to disable UTF-8 mode /// everywhere. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// let got = re.find(&mut cache, haystack); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` pub fn builder() -> Builder { Builder::new() } /// Create a new cache for this `Regex`. /// /// The cache returned should only be used for searches for this /// `Regex`. If you want to reuse the cache for another `Regex`, then /// you must call [`Cache::reset`] with that `Regex` (or, equivalently, /// [`Regex::reset_cache`]). pub fn create_cache(&self) -> Cache { Cache::new(self) } /// Reset the given cache such that it can be used for searching with the /// this `Regex` (and only this `Regex`). /// /// A cache reset permits reusing memory already allocated in this cache /// with a different `Regex`. /// /// Resetting a cache sets its "clear count" to 0. This is relevant if the /// `Regex` has been configured to "give up" after it has cleared the cache /// a certain number of times. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different `Regex`. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re1 = Regex::new(r"\w")?; /// let re2 = Regex::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( /// Some(Match::must(0, 0..2)), /// re1.find(&mut cache, "Δ"), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or /// // incorrect results. 
In order to re-purpose the cache, we must reset /// // it with the Regex we'd like to use it with. /// // /// // Similarly, after this reset, using the cache with 're1' is also not /// // allowed. /// re2.reset_cache(&mut cache); /// assert_eq!( /// Some(Match::must(0, 0..3)), /// re2.find(&mut cache, "☃"), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn reset_cache(&self, cache: &mut Cache) { self.forward().reset_cache(&mut cache.forward); self.reverse().reset_cache(&mut cache.reverse); } } /// Standard infallible search routines for finding and iterating over matches. impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if the underlying /// DFA enters a match state or a dead state, then this routine will return /// `true` or `false`, respectively, without inspecting any future input. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// Use [`Regex::try_search`] if you want to handle these error conditions. 
/// /// # Example /// /// ``` /// use regex_automata::hybrid::regex::Regex; /// /// let re = Regex::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// /// assert!(re.is_match(&mut cache, "foo12345bar")); /// assert!(!re.is_match(&mut cache, "foobar")); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn is_match<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> bool { // Not only can we do an "earliest" search, but we can avoid doing a // reverse scan too. self.forward() .try_search_fwd(&mut cache.forward, &input.into().earliest(true)) .unwrap() .is_some() } /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// Use [`Regex::try_search`] if you want to handle these error conditions. 
/// /// # Example /// /// ``` /// use regex_automata::{Match, hybrid::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Match::must(0, 3..11)), /// re.find(&mut cache, "zzzfoo12345zzz"), /// ); /// /// // Even though a match is found after reading the first byte (`a`), /// // the default leftmost-first match semantics demand that we find the /// // earliest match that prefers earlier parts of the pattern over latter /// // parts. /// let re = Regex::new("abc|a")?; /// let mut cache = re.create_cache(); /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> Option { self.try_search(cache, &input.into()).unwrap() } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// The above conditions also apply to the iterator returned as well. 
For /// example, if the lazy DFA gives up or quits during a search using this /// method, then a panic will occur during iteration. /// /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) /// if you want to handle these error conditions. /// /// # Example /// /// ``` /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// /// let text = "foo1 foo12 foo123"; /// let matches: Vec = re.find_iter(&mut cache, text).collect(); /// assert_eq!(matches, vec![ /// Match::must(0, 0..4), /// Match::must(0, 5..10), /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find_iter<'r, 'c, 'h, I: Into>>( &'r self, cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h> { let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, it } } } /// Lower level "search" primitives that accept a `&Input` for cheap reuse /// and return an error if one occurs instead of panicking. impl Regex { /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// This is like [`Regex::find`] but with two differences: /// /// 1. It is not generic over `Into` and instead accepts a /// `&Input`. This permits reusing the same `Input` for multiple searches /// without needing to create a new one. This _may_ help with latency. /// 2. It returns an error if the search could not complete where as /// [`Regex::find`] will panic. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. 
/// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. #[inline] pub fn try_search( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result, MatchError> { let (fcache, rcache) = (&mut cache.forward, &mut cache.reverse); let end = match self.forward().try_search_fwd(fcache, input)? { None => return Ok(None), Some(end) => end, }; // This special cases an empty match at the beginning of the search. If // our end matches our start, then since a reverse DFA can't match past // the start, it must follow that our starting position is also our end // position. So short circuit and skip the reverse search. if input.start() == end.offset() { return Ok(Some(Match::new( end.pattern(), end.offset()..end.offset(), ))); } // We can also skip the reverse search if we know our search was // anchored. This occurs either when the input config is anchored or // when we know the regex itself is anchored. In this case, we know the // start of the match, if one is found, must be the start of the // search. if self.is_anchored(input) { return Ok(Some(Match::new( end.pattern(), input.start()..end.offset(), ))); } // N.B. I have tentatively convinced myself that it isn't necessary // to specify the specific pattern for the reverse search since the // reverse search will always find the same pattern to match as the // forward search. But I lack a rigorous proof. Why not just provide // the pattern anyway? Well, if it is needed, then leaving it out // gives us a chance to find a witness. 
(Also, if we don't need to // specify the pattern, then we don't need to build the reverse DFA // with 'starts_for_each_pattern' enabled. It doesn't matter too much // for the lazy DFA, but does make the overall DFA bigger.) // // We also need to be careful to disable 'earliest' for the reverse // search, since it could be enabled for the forward search. In the // reverse case, to satisfy "leftmost" criteria, we need to match as // much as we can. We also need to be careful to make the search // anchored. We don't want the reverse search to report any matches // other than the one beginning at the end of our forward search. let revsearch = input .clone() .span(input.start()..end.offset()) .anchored(Anchored::Yes) .earliest(false); let start = self .reverse() .try_search_rev(rcache, &revsearch)? .expect("reverse search must match if forward search does"); debug_assert_eq!( start.pattern(), end.pattern(), "forward and reverse search must match same pattern", ); debug_assert!(start.offset() <= end.offset()); Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) } /// Returns true if either the given input specifies an anchored search /// or if the underlying NFA is always anchored. fn is_anchored(&self, input: &Input<'_>) -> bool { match input.get_anchored() { Anchored::No => { self.forward().get_nfa().is_always_start_anchored() } Anchored::Yes | Anchored::Pattern(_) => true, } } } /// Non-search APIs for querying information about the regex and setting a /// prefilter. impl Regex { /// Return the underlying lazy DFA responsible for forward matching. /// /// This is useful for accessing the underlying lazy DFA and using it /// directly if the situation calls for it. pub fn forward(&self) -> &DFA { &self.forward } /// Return the underlying lazy DFA responsible for reverse matching. /// /// This is useful for accessing the underlying lazy DFA and using it /// directly if the situation calls for it. 
pub fn reverse(&self) -> &DFA { &self.reverse } /// Returns the total number of patterns matched by this regex. /// /// # Example /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::hybrid::regex::Regex; /// /// let re = Regex::new_many(&[r"[a-z]+", r"[0-9]+", r"\w+"])?; /// assert_eq!(3, re.pattern_len()); /// # Ok::<(), Box>(()) /// ``` pub fn pattern_len(&self) -> usize { assert_eq!(self.forward().pattern_len(), self.reverse().pattern_len()); self.forward().pattern_len() } } /// An iterator over all non-overlapping matches for an infallible search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// If the underlying regex engine returns an error, then a panic occurs. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the regex object. /// * `'h` represents the lifetime of the haystack being searched. /// * `'c` represents the lifetime of the regex cache. /// /// This iterator can be created with the [`Regex::find_iter`] method. #[derive(Debug)] pub struct FindMatches<'r, 'c, 'h> { re: &'r Regex, cache: &'c mut Cache, it: iter::Searcher<'h>, } impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { type Item = Match; #[inline] fn next(&mut self) -> Option { let FindMatches { re, ref mut cache, ref mut it } = *self; it.advance(|input| re.try_search(cache, input)) } } /// A cache represents a partially computed forward and reverse DFA. /// /// A cache is the key component that differentiates a classical DFA and a /// hybrid NFA/DFA (also called a "lazy DFA"). Where a classical DFA builds a /// complete transition table that can handle all possible inputs, a hybrid /// NFA/DFA starts with an empty transition table and builds only the parts /// required during search. The parts that are built are stored in a cache. For /// this reason, a cache is a required parameter for nearly every operation on /// a [`Regex`]. 
/// /// Caches can be created from their corresponding `Regex` via /// [`Regex::create_cache`]. A cache can only be used with either the `Regex` /// that created it, or the `Regex` that was most recently used to reset it /// with [`Cache::reset`]. Using a cache with any other `Regex` may result in /// panics or incorrect results. #[derive(Debug, Clone)] pub struct Cache { forward: dfa::Cache, reverse: dfa::Cache, } impl Cache { /// Create a new cache for the given `Regex`. /// /// The cache returned should only be used for searches for the given /// `Regex`. If you want to reuse the cache for another `Regex`, then you /// must call [`Cache::reset`] with that `Regex`. pub fn new(re: &Regex) -> Cache { let forward = dfa::Cache::new(re.forward()); let reverse = dfa::Cache::new(re.reverse()); Cache { forward, reverse } } /// Reset this cache such that it can be used for searching with the given /// `Regex` (and only that `Regex`). /// /// A cache reset permits reusing memory already allocated in this cache /// with a different `Regex`. /// /// Resetting a cache sets its "clear count" to 0. This is relevant if the /// `Regex` has been configured to "give up" after it has cleared the cache /// a certain number of times. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different `Regex`. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re1 = Regex::new(r"\w")?; /// let re2 = Regex::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( /// Some(Match::must(0, 0..2)), /// re1.find(&mut cache, "Δ"), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or /// // incorrect results. In order to re-purpose the cache, we must reset /// // it with the Regex we'd like to use it with. /// // /// // Similarly, after this reset, using the cache with 're1' is also not /// // allowed. 
/// cache.reset(&re2); /// assert_eq!( /// Some(Match::must(0, 0..3)), /// re2.find(&mut cache, "☃"), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn reset(&mut self, re: &Regex) { self.forward.reset(re.forward()); self.reverse.reset(re.reverse()); } /// Return a reference to the forward cache. pub fn forward(&mut self) -> &dfa::Cache { &self.forward } /// Return a reference to the reverse cache. pub fn reverse(&mut self) -> &dfa::Cache { &self.reverse } /// Return a mutable reference to the forward cache. /// /// If you need mutable references to both the forward and reverse caches, /// then use [`Cache::as_parts_mut`]. pub fn forward_mut(&mut self) -> &mut dfa::Cache { &mut self.forward } /// Return a mutable reference to the reverse cache. /// /// If you need mutable references to both the forward and reverse caches, /// then use [`Cache::as_parts_mut`]. pub fn reverse_mut(&mut self) -> &mut dfa::Cache { &mut self.reverse } /// Return references to the forward and reverse caches, respectively. pub fn as_parts(&self) -> (&dfa::Cache, &dfa::Cache) { (&self.forward, &self.reverse) } /// Return mutable references to the forward and reverse caches, /// respectively. pub fn as_parts_mut(&mut self) -> (&mut dfa::Cache, &mut dfa::Cache) { (&mut self.forward, &mut self.reverse) } /// Returns the heap memory usage, in bytes, as a sum of the forward and /// reverse lazy DFA caches. /// /// This does **not** include the stack size used up by this cache. To /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { self.forward.memory_usage() + self.reverse.memory_usage() } } /// A builder for a regex based on a hybrid NFA/DFA. /// /// This builder permits configuring options for the syntax of a pattern, the /// NFA construction, the lazy DFA construction and finally the regex searching /// itself. This builder is different from a general purpose regex builder /// in that it permits fine grain configuration of the construction process. 
/// The trade off for this is complexity, and the possibility of setting a /// configuration that might not make sense. For example, there are two /// different UTF-8 modes: /// /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls /// whether the pattern itself can contain sub-expressions that match invalid /// UTF-8. /// * [`thompson::Config::utf8`] controls how the regex iterators themselves /// advance the starting position of the next search when a match with zero /// length is found. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. /// /// Internally, building a regex requires building two hybrid NFA/DFAs, /// where one is responsible for finding the end of a match and the other is /// responsible for finding the start of a match. If you only need to detect /// whether something matched, or only the end of a match, then you should use /// a [`dfa::Builder`] to construct a single hybrid NFA/DFA, which is cheaper /// than building two of them. /// /// # Example /// /// This example shows how to disable UTF-8 mode in the syntax and the regex /// itself. This is generally what you want for matching on arbitrary bytes. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// hybrid::regex::Regex, nfa::thompson, util::syntax, Match, /// }; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// let got = re.find(&mut cache, haystack); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 /// // on the syntax permits this. 
/// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { dfa: dfa::Builder, } impl Builder { /// Create a new regex builder with the default configuration. pub fn new() -> Builder { Builder { dfa: DFA::builder() } } /// Build a regex from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. #[cfg(feature = "syntax")] pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Build a regex from the given patterns. #[cfg(feature = "syntax")] pub fn build_many>( &self, patterns: &[P], ) -> Result { let forward = self.dfa.build_many(patterns)?; let reverse = self .dfa .clone() .configure( DFA::config() .prefilter(None) .specialize_start_states(false) .match_kind(MatchKind::All), ) .thompson(thompson::Config::new().reverse(true)) .build_many(patterns)?; Ok(self.build_from_dfas(forward, reverse)) } /// Build a regex from its component forward and reverse hybrid NFA/DFAs. /// /// This is useful when you've built a forward and reverse lazy DFA /// separately, and want to combine them into a single regex. Once build, /// the individual DFAs given can still be accessed via [`Regex::forward`] /// and [`Regex::reverse`]. /// /// It is important that the reverse lazy DFA be compiled under the /// following conditions: /// /// * It should use [`MatchKind::All`] semantics. /// * It should match in reverse. /// * Otherwise, its configuration should match the forward DFA. /// /// If these conditions aren't satisfied, then the behavior of searches is /// unspecified. /// /// Note that when using this constructor, no configuration is applied. /// Since this routine provides the DFAs to the builder, there is no /// opportunity to apply other configuration options. /// /// # Example /// /// This shows how to build individual lazy forward and reverse DFAs, and /// then combine them into a single `Regex`. 
/// /// ``` /// use regex_automata::{ /// hybrid::{dfa::DFA, regex::Regex}, /// nfa::thompson, /// MatchKind, /// }; /// /// let fwd = DFA::new(r"foo[0-9]+")?; /// let rev = DFA::builder() /// .configure(DFA::config().match_kind(MatchKind::All)) /// .thompson(thompson::Config::new().reverse(true)) /// .build(r"foo[0-9]+")?; /// /// let re = Regex::builder().build_from_dfas(fwd, rev); /// let mut cache = re.create_cache(); /// assert_eq!(true, re.is_match(&mut cache, "foo123")); /// # Ok::<(), Box>(()) /// ``` pub fn build_from_dfas(&self, forward: DFA, reverse: DFA) -> Regex { Regex { forward, reverse } } /// Set the syntax configuration for this builder using /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. #[cfg(feature = "syntax")] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.dfa.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](thompson::Config). /// /// This permits setting things like whether additional time should be /// spent shrinking the size of the NFA. #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.dfa.thompson(config); self } /// Set the lazy DFA compilation configuration for this builder using /// [`dfa::Config`]. /// /// This permits setting things like whether Unicode word boundaries should /// be heuristically supported or settings how the behavior of the cache. 
pub fn dfa(&mut self, config: dfa::Config) -> &mut Builder { self.dfa.configure(config); self } } impl Default for Builder { fn default() -> Builder { Builder::new() } } regex-automata-0.4.9/src/hybrid/search.rs000064400000000000000000000775351046102023000164650ustar 00000000000000use crate::{ hybrid::{ dfa::{Cache, OverlappingState, DFA}, id::LazyStateID, }, util::{ prefilter::Prefilter, search::{HalfMatch, Input, MatchError, Span}, }, }; #[inline(never)] pub(crate) fn find_fwd( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, ) -> Result, MatchError> { if input.is_done() { return Ok(None); } let pre = if input.get_anchored().is_anchored() { None } else { dfa.get_config().get_prefilter() }; // So what we do here is specialize four different versions of 'find_fwd': // one for each of the combinations for 'has prefilter' and 'is earliest // search'. The reason for doing this is that both of these things require // branches and special handling in some code that can be very hot, // and shaving off as much as we can when we don't need it tends to be // beneficial in ad hoc benchmarks. To see these differences, you often // need a query with a high match count. In other words, specializing these // four routines *tends* to help latency more than throughput. if pre.is_some() { if input.get_earliest() { find_fwd_imp(dfa, cache, input, pre, true) } else { find_fwd_imp(dfa, cache, input, pre, false) } } else { if input.get_earliest() { find_fwd_imp(dfa, cache, input, None, true) } else { find_fwd_imp(dfa, cache, input, None, false) } } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_fwd_imp( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, pre: Option<&'_ Prefilter>, earliest: bool, ) -> Result, MatchError> { // See 'prefilter_restart' docs for explanation. 
let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty(); let mut mat = None; let mut sid = init_fwd(dfa, cache, input)?; let mut at = input.start(); // This could just be a closure, but then I think it would be unsound // because it would need to be safe to invoke. This way, the lack of safety // is clearer in the code below. macro_rules! next_unchecked { ($sid:expr, $at:expr) => {{ let byte = *input.haystack().get_unchecked($at); dfa.next_state_untagged_unchecked(cache, $sid, byte) }}; } if let Some(ref pre) = pre { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => return Ok(mat), Some(ref span) => { at = span.start; if !universal_start { sid = prefilter_restart(dfa, cache, &input, at)?; } } } } cache.search_start(at); while at < input.end() { if sid.is_tagged() { cache.search_update(at); sid = dfa .next_state(cache, sid, input.haystack()[at]) .map_err(|_| gave_up(at))?; } else { // SAFETY: There are two safety invariants we need to uphold // here in the loops below: that 'sid' and 'prev_sid' are valid // state IDs for this DFA, and that 'at' is a valid index into // 'haystack'. For the former, we rely on the invariant that // next_state* and start_state_forward always returns a valid state // ID (given a valid state ID in the former case), and that we are // only at this place in the code if 'sid' is untagged. Moreover, // every call to next_state_untagged_unchecked below is guarded by // a check that sid is untagged. For the latter safety invariant, // we always guard unchecked access with a check that 'at' is less // than 'end', where 'end <= haystack.len()'. In the unrolled loop // below, we ensure that 'at' is always in bounds. // // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. 
This was used for a benchmark: // // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // // PERF: For justification for the loop unrolling, we use a few // different tests: // // regex-cli find half hybrid -p '\w{50}' -UBb bigfile // regex-cli find half hybrid -p '(?m)^.+$' -UBb bigfile // regex-cli find half hybrid -p 'ZQZQZQZQ' -UBb bigfile // // And there are three different configurations: // // nounroll: this entire 'else' block vanishes and we just // always use 'dfa.next_state(..)'. // unroll1: just the outer loop below // unroll2: just the inner loop below // unroll3: both the outer and inner loops below // // This results in a matrix of timings for each of the above // regexes with each of the above unrolling configurations: // // '\w{50}' '(?m)^.+$' 'ZQZQZQZQ' // nounroll 1.51s 2.34s 1.51s // unroll1 1.53s 2.32s 1.56s // unroll2 2.22s 1.50s 0.61s // unroll3 1.67s 1.45s 0.61s // // Ideally we'd be able to find a configuration that yields the // best time for all regexes, but alas we settle for unroll3 that // gives us *almost* the best for '\w{50}' and the best for the // other two regexes. // // So what exactly is going on here? The first unrolling (grouping // together runs of untagged transitions) specifically targets // our choice of representation. The second unrolling (grouping // together runs of self-transitions) specifically targets a common // DFA topology. Let's dig in a little bit by looking at our // regexes: // // '\w{50}': This regex spends a lot of time outside of the DFA's // start state matching some part of the '\w' repetition. This // means that it's a bit of a worst case for loop unrolling that // targets self-transitions since the self-transitions in '\w{50}' // are not particularly active for this haystack. However, the // first unrolling (grouping together untagged transitions) // does apply quite well here since very few transitions hit // match/dead/quit/unknown states. 
It is however worth mentioning // that if start states are configured to be tagged (which you // typically want to do if you have a prefilter), then this regex // actually slows way down because it is constantly ping-ponging // out of the unrolled loop and into the handling of a tagged start // state below. But when start states aren't tagged, the unrolled // loop stays hot. (This is why it's imperative that start state // tagging be disabled when there isn't a prefilter!) // // '(?m)^.+$': There are two important aspects of this regex: 1) // on this haystack, its match count is very high, much higher // than the other two regex and 2) it spends the vast majority // of its time matching '.+'. Since Unicode mode is disabled, // this corresponds to repeatedly following self transitions for // the vast majority of the input. This does benefit from the // untagged unrolling since most of the transitions will be to // untagged states, but the untagged unrolling does more work than // what is actually required. Namely, it has to keep track of the // previous and next state IDs, which I guess requires a bit more // shuffling. This is supported by the fact that nounroll+unroll1 // are both slower than unroll2+unroll3, where the latter has a // loop unrolling that specifically targets self-transitions. // // 'ZQZQZQZQ': This one is very similar to '(?m)^.+$' because it // spends the vast majority of its time in self-transitions for // the (implicit) unanchored prefix. The main difference with // '(?m)^.+$' is that it has a much lower match count. So there // isn't much time spent in the overhead of reporting matches. This // is the primary explainer in the perf difference here. We include // this regex and the former to make sure we have comparison points // with high and low match counts. // // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'. 
// // NOTE: In a follow-up, it turns out that the "inner" loop // mentioned above was a pretty big pessimization in some other // cases. Namely, it resulted in too much ping-ponging into and out // of the loop, which resulted in nearly ~2x regressions in search // time when compared to the originally lazy DFA in the regex crate. // So I've removed the second loop unrolling that targets the // self-transition case. let mut prev_sid = sid; while at < input.end() { prev_sid = unsafe { next_unchecked!(sid, at) }; if prev_sid.is_tagged() || at + 3 >= input.end() { core::mem::swap(&mut prev_sid, &mut sid); break; } at += 1; sid = unsafe { next_unchecked!(prev_sid, at) }; if sid.is_tagged() { break; } at += 1; prev_sid = unsafe { next_unchecked!(sid, at) }; if prev_sid.is_tagged() { core::mem::swap(&mut prev_sid, &mut sid); break; } at += 1; sid = unsafe { next_unchecked!(prev_sid, at) }; if sid.is_tagged() { break; } at += 1; } // If we quit out of the code above with an unknown state ID at // any point, then we need to re-compute that transition using // 'next_state', which will do NFA powerset construction for us. if sid.is_unknown() { cache.search_update(at); sid = dfa .next_state(cache, prev_sid, input.haystack()[at]) .map_err(|_| gave_up(at))?; } } if sid.is_tagged() { if sid.is_start() { if let Some(ref pre) = pre { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => { cache.search_finish(span.end); return Ok(mat); } Some(ref span) => { // We want to skip any update to 'at' below // at the end of this iteration and just // jump immediately back to the next state // transition at the leading position of the // candidate match. // // ... but only if we actually made progress // with our prefilter, otherwise if the start // state has a self-loop, we can get stuck. 
if span.start > at { at = span.start; if !universal_start { sid = prefilter_restart( dfa, cache, &input, at, )?; } continue; } } } } } else if sid.is_match() { let pattern = dfa.match_pattern(cache, sid, 0); // Since slice ranges are inclusive at the beginning and // exclusive at the end, and since forward searches report // the end, we can return 'at' as-is. This only works because // matches are delayed by 1 byte. So by the time we observe a // match, 'at' has already been set to 1 byte past the actual // match location, which is precisely the exclusive ending // bound of the match. mat = Some(HalfMatch::new(pattern, at)); if earliest { cache.search_finish(at); return Ok(mat); } } else if sid.is_dead() { cache.search_finish(at); return Ok(mat); } else if sid.is_quit() { cache.search_finish(at); return Err(MatchError::quit(input.haystack()[at], at)); } else { debug_assert!(sid.is_unknown()); unreachable!("sid being unknown is a bug"); } } at += 1; } eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?; cache.search_finish(input.end()); Ok(mat) } #[inline(never)] pub(crate) fn find_rev( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, ) -> Result, MatchError> { if input.is_done() { return Ok(None); } if input.get_earliest() { find_rev_imp(dfa, cache, input, true) } else { find_rev_imp(dfa, cache, input, false) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_rev_imp( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, earliest: bool, ) -> Result, MatchError> { let mut mat = None; let mut sid = init_rev(dfa, cache, input)?; // In reverse search, the loop below can't handle the case of searching an // empty slice. Ideally we could write something congruent to the forward // search, i.e., 'while at >= start', but 'start' might be 0. Since we use // an unsigned offset, 'at >= 0' is trivially always true. We could avoid // this extra case handling by using a signed offset, but Rust makes it // annoying to do. So... We just handle the empty case separately. 
if input.start() == input.end() { eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; return Ok(mat); } let mut at = input.end() - 1; macro_rules! next_unchecked { ($sid:expr, $at:expr) => {{ let byte = *input.haystack().get_unchecked($at); dfa.next_state_untagged_unchecked(cache, $sid, byte) }}; } cache.search_start(at); loop { if sid.is_tagged() { cache.search_update(at); sid = dfa .next_state(cache, sid, input.haystack()[at]) .map_err(|_| gave_up(at))?; } else { // SAFETY: See comments in 'find_fwd' for a safety argument. // // PERF: The comments in 'find_fwd' also provide a justification // from a performance perspective as to 1) why we elide bounds // checks and 2) why we do a specialized version of unrolling // below. The reverse search does have a slightly different // consideration in that most reverse searches tend to be // anchored and on shorter haystacks. However, this still makes a // difference. Take this command for example: // // regex-cli find match hybrid -p '(?m)^.+$' -UBb bigfile // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. The 'regex' // sub-command will find start-of-match and thus run the reverse // direction.) // // Without unrolling below, the above command takes around 3.76s. // But with the unrolling below, we get down to 2.55s. If we keep // the unrolling but add in bounds checks, then we get 2.86s. // // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'. 
let mut prev_sid = sid; while at >= input.start() { prev_sid = unsafe { next_unchecked!(sid, at) }; if prev_sid.is_tagged() || at <= input.start().saturating_add(3) { core::mem::swap(&mut prev_sid, &mut sid); break; } at -= 1; sid = unsafe { next_unchecked!(prev_sid, at) }; if sid.is_tagged() { break; } at -= 1; prev_sid = unsafe { next_unchecked!(sid, at) }; if prev_sid.is_tagged() { core::mem::swap(&mut prev_sid, &mut sid); break; } at -= 1; sid = unsafe { next_unchecked!(prev_sid, at) }; if sid.is_tagged() { break; } at -= 1; } // If we quit out of the code above with an unknown state ID at // any point, then we need to re-compute that transition using // 'next_state', which will do NFA powerset construction for us. if sid.is_unknown() { cache.search_update(at); sid = dfa .next_state(cache, prev_sid, input.haystack()[at]) .map_err(|_| gave_up(at))?; } } if sid.is_tagged() { if sid.is_start() { // do nothing } else if sid.is_match() { let pattern = dfa.match_pattern(cache, sid, 0); // Since reverse searches report the beginning of a match // and the beginning is inclusive (not exclusive like the // end of a match), we add 1 to make it inclusive. 
mat = Some(HalfMatch::new(pattern, at + 1)); if earliest { cache.search_finish(at); return Ok(mat); } } else if sid.is_dead() { cache.search_finish(at); return Ok(mat); } else if sid.is_quit() { cache.search_finish(at); return Err(MatchError::quit(input.haystack()[at], at)); } else { debug_assert!(sid.is_unknown()); unreachable!("sid being unknown is a bug"); } } if at == input.start() { break; } at -= 1; } cache.search_finish(input.start()); eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; Ok(mat) } #[inline(never)] pub(crate) fn find_overlapping_fwd( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, state: &mut OverlappingState, ) -> Result<(), MatchError> { state.mat = None; if input.is_done() { return Ok(()); } let pre = if input.get_anchored().is_anchored() { None } else { dfa.get_config().get_prefilter() }; if pre.is_some() { find_overlapping_fwd_imp(dfa, cache, input, pre, state) } else { find_overlapping_fwd_imp(dfa, cache, input, None, state) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_overlapping_fwd_imp( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, pre: Option<&'_ Prefilter>, state: &mut OverlappingState, ) -> Result<(), MatchError> { // See 'prefilter_restart' docs for explanation. let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty(); let mut sid = match state.id { None => { state.at = input.start(); init_fwd(dfa, cache, input)? } Some(sid) => { if let Some(match_index) = state.next_match_index { let match_len = dfa.match_len(cache, sid); if match_index < match_len { state.next_match_index = Some(match_index + 1); let pattern = dfa.match_pattern(cache, sid, match_index); state.mat = Some(HalfMatch::new(pattern, state.at)); return Ok(()); } } // Once we've reported all matches at a given position, we need to // advance the search to the next position. 
state.at += 1; if state.at > input.end() { return Ok(()); } sid } }; // NOTE: We don't optimize the crap out of this routine primarily because // it seems like most overlapping searches will have higher match counts, // and thus, throughput is perhaps not as important. But if you have a use // case for something faster, feel free to file an issue. cache.search_start(state.at); while state.at < input.end() { sid = dfa .next_state(cache, sid, input.haystack()[state.at]) .map_err(|_| gave_up(state.at))?; if sid.is_tagged() { state.id = Some(sid); if sid.is_start() { if let Some(ref pre) = pre { let span = Span::from(state.at..input.end()); match pre.find(input.haystack(), span) { None => return Ok(()), Some(ref span) => { if span.start > state.at { state.at = span.start; if !universal_start { sid = prefilter_restart( dfa, cache, &input, state.at, )?; } continue; } } } } } else if sid.is_match() { state.next_match_index = Some(1); let pattern = dfa.match_pattern(cache, sid, 0); state.mat = Some(HalfMatch::new(pattern, state.at)); cache.search_finish(state.at); return Ok(()); } else if sid.is_dead() { cache.search_finish(state.at); return Ok(()); } else if sid.is_quit() { cache.search_finish(state.at); return Err(MatchError::quit( input.haystack()[state.at], state.at, )); } else { debug_assert!(sid.is_unknown()); unreachable!("sid being unknown is a bug"); } } state.at += 1; cache.search_update(state.at); } let result = eoi_fwd(dfa, cache, input, &mut sid, &mut state.mat); state.id = Some(sid); if state.mat.is_some() { // '1' is always correct here since if we get to this point, this // always corresponds to the first (index '0') match discovered at // this position. So the next match to report at this position (if // it exists) is at index '1'. 
/// Runs (or resumes) a reverse overlapping search with a lazy DFA.
///
/// Results are reported through `state`: on return, `state.mat` holds the
/// next match found, or `None` if there are no more. Callers are expected to
/// pass the same `state` back in to continue the search.
///
/// The routine works in three phases:
///
/// 1. On the first call (`state.id` is `None`), the reverse start state is
///    computed and `state.at` is set to the last byte of the span. An empty
///    span skips straight to the final end-of-input transition. On later
///    calls, any remaining matches for the saved state are reported one at
///    a time before the position is moved back by one byte.
/// 2. The main loop walks the haystack backwards one byte at a time. It
///    returns on a match or a dead state, and fails with
///    `MatchError::quit` on a quit state.
/// 3. Once `input.start()` has been processed, the end-of-input transition
///    is followed via `eoi_rev` and `state.rev_eoi` is set, so subsequent
///    calls know the search is exhausted.
///
/// A failed state transition is reported as a "gave up" error at the current
/// offset.
#[inline(never)]
pub(crate) fn find_overlapping_rev(
    dfa: &DFA,
    cache: &mut Cache,
    input: &Input<'_>,
    state: &mut OverlappingState,
) -> Result<(), MatchError> {
    // Forget any match reported by a previous call.
    state.mat = None;
    if input.is_done() {
        return Ok(());
    }
    let mut sid = match state.id {
        None => {
            let sid = init_rev(dfa, cache, input)?;
            state.id = Some(sid);
            if input.start() == input.end() {
                // Empty span: there are no bytes to walk, so only the
                // end-of-input transition below applies.
                state.rev_eoi = true;
            } else {
                state.at = input.end() - 1;
            }
            sid
        }
        Some(sid) => {
            if let Some(match_index) = state.next_match_index {
                let match_len = dfa.match_len(cache, sid);
                if match_index < match_len {
                    state.next_match_index = Some(match_index + 1);
                    let pattern = dfa.match_pattern(cache, sid, match_index);
                    // NOTE(review): this reports 'state.at', whereas the
                    // main loop below reports 'state.at + 1' for the first
                    // match in the same state. The two agree after the
                    // end-of-input transition (where the reported offset is
                    // the span start), but confirm the difference is
                    // intended for matches found inside the loop.
                    state.mat = Some(HalfMatch::new(pattern, state.at));
                    return Ok(());
                }
            }
            // Once we've reported all matches at a given position, we need
            // to advance the search to the next position. However, if we've
            // already followed the EOI transition, then we know we're done
            // with the search and there cannot be any more matches to report.
            if state.rev_eoi {
                return Ok(());
            } else if state.at == input.start() {
                // At this point, we should follow the EOI transition. This
                // will cause us to skip the main loop below and fall through
                // to the final 'eoi_rev' transition.
                state.rev_eoi = true;
            } else {
                // We haven't hit the end of the search yet, so move on.
                state.at -= 1;
            }
            sid
        }
    };
    cache.search_start(state.at);
    while !state.rev_eoi {
        sid = dfa
            .next_state(cache, sid, input.haystack()[state.at])
            .map_err(|_| gave_up(state.at))?;
        if sid.is_tagged() {
            // Save the state so that the next call can resume from it.
            state.id = Some(sid);
            if sid.is_start() {
                // do nothing
            } else if sid.is_match() {
                // Index 0 is reported now; a resumed call continues at 1.
                state.next_match_index = Some(1);
                let pattern = dfa.match_pattern(cache, sid, 0);
                // The reported offset is one past the byte just consumed.
                state.mat = Some(HalfMatch::new(pattern, state.at + 1));
                cache.search_finish(state.at);
                return Ok(());
            } else if sid.is_dead() {
                cache.search_finish(state.at);
                return Ok(());
            } else if sid.is_quit() {
                cache.search_finish(state.at);
                return Err(MatchError::quit(
                    input.haystack()[state.at],
                    state.at,
                ));
            } else {
                debug_assert!(sid.is_unknown());
                unreachable!("sid being unknown is a bug");
            }
        }
        // Stop before decrementing past the start of the span; 'state.at' is
        // unsigned, so the bound is checked explicitly.
        if state.at == input.start() {
            break;
        }
        state.at -= 1;
        cache.search_update(state.at);
    }

    let result = eoi_rev(dfa, cache, input, &mut sid, &mut state.mat);
    state.rev_eoi = true;
    state.id = Some(sid);
    if state.mat.is_some() {
        // '1' is always correct here since if we get to this point, this
        // always corresponds to the first (index '0') match discovered at
        // this position. So the next match to report at this position (if
        // it exists) is at index '1'.
        state.next_match_index = Some(1);
    }
    cache.search_finish(input.start());
    result
}
debug_assert!(!sid.is_match()); Ok(sid) } #[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_fwd( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, sid: &mut LazyStateID, mat: &mut Option, ) -> Result<(), MatchError> { let sp = input.get_span(); match input.haystack().get(sp.end) { Some(&b) => { *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(sp.end))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { return Err(MatchError::quit(b, sp.end)); } } None => { *sid = dfa .next_eoi_state(cache, *sid) .map_err(|_| gave_up(input.haystack().len()))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } // N.B. We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state. debug_assert!(!sid.is_quit()); } } Ok(()) } #[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_rev( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, sid: &mut LazyStateID, mat: &mut Option, ) -> Result<(), MatchError> { let sp = input.get_span(); if sp.start > 0 { let byte = input.haystack()[sp.start - 1]; *sid = dfa .next_state(cache, *sid, byte) .map_err(|_| gave_up(sp.start))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { return Err(MatchError::quit(byte, sp.start - 1)); } } else { *sid = dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.start))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } // N.B. We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state. debug_assert!(!sid.is_quit()); } Ok(()) } /// Re-compute the starting state that a DFA should be in after finding a /// prefilter candidate match at the position `at`. 
/// /// It is always correct to call this, but not always necessary. Namely, /// whenever the DFA has a universal start state, the DFA can remain in the /// start state that it was in when it ran the prefilter. Why? Because in that /// case, there is only one start state. /// /// When does a DFA have a universal start state? In precisely cases where /// it has no look-around assertions in its prefix. So for example, `\bfoo` /// does not have a universal start state because the start state depends on /// whether the byte immediately before the start position is a word byte or /// not. However, `foo\b` does have a universal start state because the word /// boundary does not appear in the pattern's prefix. /// /// So... most cases don't need this, but when a pattern doesn't have a /// universal start state, then after a prefilter candidate has been found, the /// current state *must* be re-litigated as if computing the start state at the /// beginning of the search because it might change. That is, not all start /// states are created equal. /// /// Why avoid it? Because while it's not super expensive, it isn't a trivial /// operation to compute the start state. It is much better to avoid it and /// just state in the current state if you know it to be correct. #[cfg_attr(feature = "perf-inline", inline(always))] fn prefilter_restart( dfa: &DFA, cache: &mut Cache, input: &Input<'_>, at: usize, ) -> Result { let mut input = input.clone(); input.set_start(at); init_fwd(dfa, cache, &input) } /// A convenience routine for constructing a "gave up" match error. #[cfg_attr(feature = "perf-inline", inline(always))] fn gave_up(offset: usize) -> MatchError { MatchError::gave_up(offset) } regex-automata-0.4.9/src/lib.rs000064400000000000000000000736741046102023000145050ustar 00000000000000/*! This crate exposes a variety of regex engines used by the `regex` crate. It provides a vast, sprawling and "expert" level API to each regex engine. 
The regex engines provided by this crate focus heavily on finite automata implementations and specifically guarantee worst case `O(m * n)` time complexity for all searches. (Where `m ~ len(regex)` and `n ~ len(haystack)`.) The primary goal of this crate is to serve as an implementation detail for the `regex` crate. A secondary goal is to make its internals available for use by others. # Table of contents * [Should I be using this crate?](#should-i-be-using-this-crate) gives some reasons for and against using this crate. * [Examples](#examples) provides a small selection of things you can do with this crate. * [Available regex engines](#available-regex-engines) provides a hyperlinked list of all regex engines in this crate. * [API themes](#api-themes) discusses common elements used throughout this crate. * [Crate features](#crate-features) documents the extensive list of Cargo features available. # Should I be using this crate? If you find yourself here because you just want to use regexes, then you should first check out whether the [`regex` crate](https://docs.rs/regex) meets your needs. It provides a streamlined and difficult-to-misuse API for regex searching. If you're here because there is something specific you want to do that can't be easily done with `regex` crate, then you are perhaps in the right place. It's most likely that the first stop you'll want to make is to explore the [`meta` regex APIs](meta). Namely, the `regex` crate is just a light wrapper over a [`meta::Regex`], so its API will probably be the easiest to transition to. In contrast to the `regex` crate, the `meta::Regex` API supports more search parameters and does multi-pattern searches. However, it isn't quite as ergonomic. Otherwise, the following is an inexhaustive list of reasons to use this crate: * You want to analyze or use a [Thompson `NFA`](nfa::thompson::NFA) directly. * You want more powerful multi-pattern search than what is provided by `RegexSet` in the `regex` crate. 
All regex engines in this crate support multi-pattern searches. * You want to use one of the `regex` crate's internal engines directly because of some interesting configuration that isn't possible via the `regex` crate. For example, a [lazy DFA's configuration](hybrid::dfa::Config) exposes a dizzying number of options for controlling its execution. * You want to use the lower level search APIs. For example, both the [lazy DFA](hybrid::dfa) and [fully compiled DFAs](dfa) support searching by exploring the automaton one state at a time. This might be useful, for example, for stream searches or searches of strings stored non-contiguously in memory. * You want to build a fully compiled DFA and then [use zero-copy deserialization](dfa::dense::DFA::from_bytes) to load it into memory and use it for searching. This use case is supported in core-only no-std/no-alloc environments. * You want to run [anchored searches](Input::anchored) without using the `^` anchor in your regex pattern. * You need to work around contention issues with sharing a regex across multiple threads. The [`meta::Regex::search_with`](meta::Regex::search_with) API permits bypassing any kind of synchronization at all by requiring the caller to provide the mutable scratch space needed during a search. * You want to build your own regex engine on top of the `regex` crate's infrastructure. # Examples This section tries to identify a few interesting things you can do with this crate and demonstrates them. ### Multi-pattern searches with capture groups One of the more frustrating limitations of `RegexSet` in the `regex` crate (at the time of writing) is that it doesn't report match positions. With this crate, multi-pattern support was intentionally designed in from the beginning, which means it works in all regex engines and even for capture groups as well.
This example shows how to search for matches of multiple regexes, where each regex uses the same capture group names to parse different key-value formats. ``` use regex_automata::{meta::Regex, PatternID}; let re = Regex::new_many(&[ r#"(?m)^(?<key>[[:word:]]+)=(?<val>[[:word:]]+)$"#, r#"(?m)^(?<key>[[:word:]]+)="(?<val>[^"]+)"$"#, r#"(?m)^(?<key>[[:word:]]+)='(?<val>[^']+)'$"#, r#"(?m)^(?<key>[[:word:]]+):\s*(?<val>[[:word:]]+)$"#, ])?; let hay = r#" best_album="Blow Your Face Out" best_quote='"then as it was, then again it will be"' best_year=1973 best_simpsons_episode: HOMR "#; let mut kvs = vec![]; for caps in re.captures_iter(hay) { // N.B. One could use capture indices '1' and '2' here // as well. Capture indices are local to each pattern. // (Just like names are.) let key = &hay[caps.get_group_by_name("key").unwrap()]; let val = &hay[caps.get_group_by_name("val").unwrap()]; kvs.push((key, val)); } assert_eq!(kvs, vec![ ("best_album", "Blow Your Face Out"), ("best_quote", "\"then as it was, then again it will be\""), ("best_year", "1973"), ("best_simpsons_episode", "HOMR"), ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` ### Build a full DFA and walk it manually One of the regex engines in this crate is a fully compiled DFA. It takes worst case exponential time to build, but once built, it can be easily explored and used for searches. Here's a simple example that uses its lower level APIs to implement a simple anchored search by hand. ``` use regex_automata::{dfa::{Automaton, dense}, Input}; let dfa = dense::DFA::new(r"(?-u)\b[A-Z]\w+z\b")?; let haystack = "Quartz"; // The start state is determined by inspecting the position and the // initial bytes of the haystack. let mut state = dfa.start_state_forward(&Input::new(haystack))?; // Walk all the bytes in the haystack. for &b in haystack.as_bytes().iter() { state = dfa.next_state(state, b); } // DFAs in this crate require an explicit // end-of-input transition if a search reaches // the end of a haystack.
state = dfa.next_eoi_state(state); assert!(dfa.is_match_state(state)); # Ok::<(), Box<dyn std::error::Error>>(()) ``` Or do the same with a lazy DFA that avoids exponential worst case compile time, but requires mutable scratch space to lazily build the DFA during the search. ``` use regex_automata::{hybrid::dfa::DFA, Input}; let dfa = DFA::new(r"(?-u)\b[A-Z]\w+z\b")?; let mut cache = dfa.create_cache(); let hay = "Quartz"; // The start state is determined by inspecting the position and the // initial bytes of the haystack. let mut state = dfa.start_state_forward(&mut cache, &Input::new(hay))?; // Walk all the bytes in the haystack. for &b in hay.as_bytes().iter() { state = dfa.next_state(&mut cache, state, b)?; } // DFAs in this crate require an explicit // end-of-input transition if a search reaches // the end of a haystack. state = dfa.next_eoi_state(&mut cache, state)?; assert!(state.is_match()); # Ok::<(), Box<dyn std::error::Error>>(()) ``` ### Find all overlapping matches This example shows how to build a DFA and use it to find all possible matches, including overlapping matches. A similar example will work with a lazy DFA as well. This also works with multiple patterns and will report all matches at the same position where multiple patterns match.
``` use regex_automata::{ dfa::{dense, Automaton, OverlappingState}, Input, MatchKind, }; let dfa = dense::DFA::builder() .configure(dense::DFA::config().match_kind(MatchKind::All)) .build(r"(?-u)\w{3,}")?; let input = Input::new("homer marge bart lisa maggie"); let mut state = OverlappingState::start(); let mut matches = vec![]; while let Some(hm) = { dfa.try_search_overlapping_fwd(&input, &mut state)?; state.get_match() } { matches.push(hm.offset()); } assert_eq!(matches, vec![ 3, 4, 5, // hom, home, homer 9, 10, 11, // mar, marg, marge 15, 16, // bar, bart 20, 21, // lis, lisa 25, 26, 27, 28, // mag, magg, maggi, maggie ]); # Ok::<(), Box<dyn std::error::Error>>(()) ``` # Available regex engines The following is a complete list of all regex engines provided by this crate, along with a very brief description of it and why you might want to use it. * [`dfa::regex::Regex`] is a regex engine that works on top of either [dense](dfa::dense) or [sparse](dfa::sparse) fully compiled DFAs. You might use a DFA if you need the fastest possible regex engine in this crate and can afford the exorbitant memory usage usually required by DFAs. Low level APIs on fully compiled DFAs are provided by the [`Automaton` trait](dfa::Automaton). Fully compiled dense DFAs can handle all regexes except for searching a regex with a Unicode word boundary on non-ASCII haystacks. A fully compiled DFA based regex can only report the start and end of each match. * [`hybrid::regex::Regex`] is a regex engine that works on top of a lazily built DFA. Its performance profile is very similar to that of fully compiled DFAs, but can be slower in some pathological cases. Fully compiled DFAs are also amenable to more optimizations, such as state acceleration, that aren't available in a lazy DFA. You might use this lazy DFA if you can't abide the worst case exponential compile time of a full DFA, but still want the DFA search performance in the vast majority of cases.
A lazy DFA based regex can only report the start and end of each match. * [`dfa::onepass::DFA`] is a regex engine that is implemented as a DFA, but can report the matches of each capture group in addition to the start and end of each match. The catch is that it only works on a somewhat small subset of regexes known as "one-pass." You'll want to use this for cases when you need capture group matches and the regex is one-pass since it is likely to be faster than any alternative. A one-pass DFA can handle all types of regexes, but does have some reasonable limits on the number of capture groups it can handle. * [`nfa::thompson::backtrack::BoundedBacktracker`] is a regex engine that uses backtracking, but keeps track of the work it has done to avoid catastrophic backtracking. Like the one-pass DFA, it provides the matches of each capture group. It retains the `O(m * n)` worst case time bound. This tends to be slower than the one-pass DFA regex engine, but faster than the PikeVM. It can handle all types of regexes, but usually only works well with small haystacks and small regexes due to the memory required to avoid redoing work. * [`nfa::thompson::pikevm::PikeVM`] is a regex engine that can handle all regexes, of all sizes and provides capture group matches. It tends to be a tool of last resort because it is also usually the slowest regex engine. * [`meta::Regex`] is the meta regex engine that combines *all* of the above engines into one. The reason for this is that each of the engines above have their own caveats such as, "only handles a subset of regexes" or "is generally slow." The meta regex engine accounts for all of these caveats and composes the engines in a way that attempts to mitigate each engine's weaknesses while emphasizing its strengths. For example, it will attempt to run a lazy DFA even if it might fail. In which case, it will restart the search with a likely slower but more capable regex engine. The meta regex engine is what you should default to. 
Use one of the above engines directly only if you have a specific reason to. # API themes While each regex engine has its own APIs and configuration options, there are some general themes followed by all of them. ### The `Input` abstraction Most search routines in this crate accept anything that implements `Into<Input>`. Both `&str` and `&[u8]` haystacks satisfy this constraint, which means that things like `engine.search("foo")` will work as you would expect. By virtue of accepting an `Into<Input>` though, callers can provide more than just a haystack. Indeed, the [`Input`] type has more details, but briefly, callers can use it to configure various aspects of the search: * The span of the haystack to search via [`Input::span`] or [`Input::range`], which might be a substring of the haystack. * Whether to run an anchored search or not via [`Input::anchored`]. This permits one to require matches to start at the same offset that the search started. * Whether to ask the regex engine to stop as soon as a match is seen via [`Input::earliest`]. This can be used to find the offset of a match as soon as it is known without waiting for the full leftmost-first match to be found. This can also be used to avoid the worst case `O(m * n^2)` time complexity of iteration. Some lower level search routines accept an `&Input` for performance reasons. In which case, `&Input::new("haystack")` can be used for a simple search. ### Error reporting Most, but not all, regex engines in this crate can fail to execute a search. When a search fails, callers cannot determine whether or not a match exists. That is, the result is indeterminate. Search failure, in all cases in this crate, is represented by a [`MatchError`]. Routines that can fail start with the `try_` prefix in their name. For example, [`hybrid::regex::Regex::try_search`] can fail for a number of reasons. Conversely, routines that either can't fail or can panic on failure lack the `try_` prefix.
For example, [`hybrid::regex::Regex::find`] will panic in cases where [`hybrid::regex::Regex::try_search`] would return an error, and [`meta::Regex::find`] will never panic. Therefore, callers need to pay close attention to the panicking conditions in the documentation. In most cases, the reasons that a search fails are either predictable or configurable, albeit at some additional cost. An example of predictable failure is [`BoundedBacktracker::try_search`](nfa::thompson::backtrack::BoundedBacktracker::try_search). Namely, it fails whenever the multiplication of the haystack, the regex and some constant exceeds the [configured visited capacity](nfa::thompson::backtrack::Config::visited_capacity). Callers can predict the failure in terms of haystack length via the [`BoundedBacktracker::max_haystack_len`](nfa::thompson::backtrack::BoundedBacktracker::max_haystack_len) method. While this form of failure is technically avoidable by increasing the visited capacity, it isn't practical to do so for all inputs because the memory usage required for larger haystacks becomes impractically large. So in practice, if one is using the bounded backtracker, you really do have to deal with the failure. An example of configurable failure happens when one enables heuristic support for Unicode word boundaries in a DFA. Namely, since the DFAs in this crate (except for the one-pass DFA) do not support Unicode word boundaries on non-ASCII haystacks, building a DFA from an NFA that contains a Unicode word boundary will itself fail. However, one can configure DFAs to still be built in this case by [configuring heuristic support for Unicode word boundaries](hybrid::dfa::Config::unicode_word_boundary). If the NFA the DFA is built from contains a Unicode word boundary, then the DFA will still be built, but special transitions will be added to every state that cause the DFA to fail if any non-ASCII byte is seen. This failure happens at search time and it requires the caller to opt into this. 
There are other ways for regex engines to fail in this crate, but the above two should represent the general theme of failures one can find. Dealing with these failures is, in part, one of the responsibilities of the [meta regex engine](meta). Notice, for example, that the meta regex engine exposes an API that never returns an error nor panics. It carefully manages all of the ways in which the regex engines can fail and either avoids the predictable ones entirely (e.g., the bounded backtracker) or reacts to configured failures by falling back to a different engine (e.g., the lazy DFA quitting because it saw a non-ASCII byte). ### Configuration and Builders Most of the regex engines in this crate come with two types to facilitate building the regex engine: a `Config` and a `Builder`. A `Config` is usually specific to that particular regex engine, but other objects such as parsing and NFA compilation have `Config` types too. A `Builder` is the thing responsible for taking inputs (either pattern strings or already-parsed patterns or even NFAs directly) and turning them into an actual regex engine that can be used for searching. The main reason why building a regex engine is a bit complicated is because of the desire to permit composition with de-coupled components. For example, you might want to [manually construct a Thompson NFA](nfa::thompson::Builder) and then build a regex engine from it without ever using a regex parser at all. On the other hand, you might also want to build a regex engine directly from the concrete syntax. This demonstrates why regex engine construction is so flexible: it needs to support not just convenient construction, but also construction from parts built elsewhere. This is also in turn why there are many different `Config` structs in this crate. Let's look more closely at an example: [`hybrid::regex::Builder`].
It accepts three different `Config` types for configuring construction of a lazy DFA regex: * [`hybrid::regex::Builder::syntax`] accepts a [`util::syntax::Config`] for configuring the options found in the [`regex-syntax`](regex_syntax) crate. For example, whether to match case insensitively. * [`hybrid::regex::Builder::thompson`] accepts a [`nfa::thompson::Config`] for configuring construction of a [Thompson NFA](nfa::thompson::NFA). For example, whether to build an NFA that matches the reverse language described by the regex. * [`hybrid::regex::Builder::dfa`] accepts a [`hybrid::dfa::Config`] for configuring construction of the pair of underlying lazy DFAs that make up the lazy DFA regex engine. For example, changing the capacity of the cache used to store the transition table. The lazy DFA regex engine uses all three of those configuration objects for methods like [`hybrid::regex::Builder::build`], which accepts a pattern string containing the concrete syntax of your regex. It uses the syntax configuration to parse it into an AST and translate it into an HIR. Then the NFA configuration when compiling the HIR into an NFA. And then finally the DFA configuration when lazily determinizing the NFA into a DFA. Notice though that the builder also has a [`hybrid::regex::Builder::build_from_dfas`] constructor. This permits callers to build the underlying pair of lazy DFAs themselves (one for the forward searching to find the end of a match and one for the reverse searching to find the start of a match), and then build the regex engine from them. The lazy DFAs, in turn, have their own builder that permits [construction directly from a Thompson NFA](hybrid::dfa::Builder::build_from_nfa). Continuing down the rabbit hole, a Thompson NFA has its own compiler that permits [construction directly from an HIR](nfa::thompson::Compiler::build_from_hir).
The lazy DFA regex engine builder lets you follow this rabbit hole all the way down, but also provides convenience routines that do it for you when you don't need precise control over every component. The [meta regex engine](meta) is a good example of something that utilizes the full flexibility of these builders. It often needs not only precise control over each component, but also shares them across multiple regex engines. (Most sharing is done by internal reference accounting. For example, an [`NFA`](nfa::thompson::NFA) is reference counted internally which makes cloning cheap.) ### Size limits Unlike the `regex` crate, the `regex-automata` crate specifically does not enable any size limits by default. That means users of this crate need to be quite careful when using untrusted patterns. Namely, because bounded repetitions can grow exponentially by stacking them, it is possible to build a very large internal regex object from just a small pattern string. For example, the NFA built from the pattern `a{10}{10}{10}{10}{10}{10}{10}` is over 240MB. There are multiple size limit options in this crate. If one or more size limits are relevant for the object you're building, they will be configurable via methods on a corresponding `Config` type. # Crate features This crate has a dizzying number of features. The main idea is to be able to control how much stuff you pull in for your specific use case, since the full crate is quite large and can dramatically increase compile times and binary size. The most barebones but useful configuration is to disable all default features and enable only `dfa-search`. This will bring in just the DFA deserialization and search routines without any dependency on `std` or `alloc`. This does require generating and serializing a DFA, and then storing it somewhere, but it permits regex searches in freestanding or embedded environments. Because there are so many features, they are split into a few groups. 
The default set of features is: `std`, `syntax`, `perf`, `unicode`, `meta`, `nfa`, `dfa` and `hybrid`. Basically, the default is to enable everything except for development related features like `logging`. ### Ecosystem features * **std** - Enables use of the standard library. In terms of APIs, this usually just means that error types implement the `std::error::Error` trait. Otherwise, `std` sometimes enables the code to be faster, for example, using a `HashMap` instead of a `BTreeMap`. (The `std` feature matters more for dependencies like `aho-corasick` and `memchr`, where `std` is required to enable certain classes of SIMD optimizations.) Enabling `std` automatically enables `alloc`. * **alloc** - Enables use of the `alloc` library. This is required for most APIs in this crate. The main exception is deserializing and searching with fully compiled DFAs. * **logging** - Adds a dependency on the `log` crate and makes this crate emit log messages of varying degrees of utility. The log messages are especially useful in trying to understand what the meta regex engine is doing. ### Performance features * **perf** - Enables all of the below features. * **perf-inline** - When enabled, `inline(always)` is used in (many) strategic locations to help performance at the expense of longer compile times and increased binary size. * **perf-literal** - Enables all literal related optimizations. * **perf-literal-substring** - Enables all single substring literal optimizations. This includes adding a dependency on the `memchr` crate. * **perf-literal-multisubstring** - Enables all multiple substring literal optimizations. This includes adding a dependency on the `aho-corasick` crate. ### Unicode features * **unicode** - Enables all Unicode features. This feature is enabled by default, and will always cover all Unicode features, even if more are added in the future. 
* **unicode-age** - Provide the data for the [Unicode `Age` property](https://www.unicode.org/reports/tr44/tr44-24.html#Character_Age). This makes it possible to use classes like `\p{Age:6.0}` to refer to all codepoints first introduced in Unicode 6.0 * **unicode-bool** - Provide the data for numerous Unicode boolean properties. The full list is not included here, but contains properties like `Alphabetic`, `Emoji`, `Lowercase`, `Math`, `Uppercase` and `White_Space`. * **unicode-case** - Provide the data for case insensitive matching using [Unicode's "simple loose matches" specification](https://www.unicode.org/reports/tr18/#Simple_Loose_Matches). * **unicode-gencat** - Provide the data for [Unicode general categories](https://www.unicode.org/reports/tr44/tr44-24.html#General_Category_Values). This includes, but is not limited to, `Decimal_Number`, `Letter`, `Math_Symbol`, `Number` and `Punctuation`. * **unicode-perl** - Provide the data for supporting the Unicode-aware Perl character classes, corresponding to `\w`, `\s` and `\d`. This is also necessary for using Unicode-aware word boundary assertions. Note that if this feature is disabled, the `\s` and `\d` character classes are still available if the `unicode-bool` and `unicode-gencat` features are enabled, respectively. * **unicode-script** - Provide the data for [Unicode scripts and script extensions](https://www.unicode.org/reports/tr24/). This includes, but is not limited to, `Arabic`, `Cyrillic`, `Hebrew`, `Latin` and `Thai`. * **unicode-segment** - Provide the data necessary to provide the properties used to implement the [Unicode text segmentation algorithms](https://www.unicode.org/reports/tr29/). This enables using classes like `\p{gcb=Extend}`, `\p{wb=Katakana}` and `\p{sb=ATerm}`. * **unicode-word-boundary** - Enables support for Unicode word boundaries, i.e., `\b`, in regexes. When this and `unicode-perl` are enabled, then data tables from `regex-syntax` are used to implement Unicode word boundaries. 
However, if `regex-syntax` isn't enabled as a dependency then one can still enable this feature. It will cause `regex-automata` to bundle its own data table that would otherwise be redundant with `regex-syntax`'s table. ### Regex engine features * **syntax** - Enables a dependency on `regex-syntax`. This makes APIs for building regex engines from pattern strings available. Without the `regex-syntax` dependency, the only way to build a regex engine is generally to deserialize a previously built DFA or to hand assemble an NFA using its [builder API](nfa::thompson::Builder). Once you have an NFA, you can build any of the regex engines in this crate. The `syntax` feature also enables `alloc`. * **meta** - Enables the meta regex engine. This also enables the `syntax` and `nfa-pikevm` features, as both are the minimal requirements needed. The meta regex engine benefits from enabling any of the other regex engines and will use them automatically when appropriate. * **nfa** - Enables all NFA related features below. * **nfa-thompson** - Enables the Thompson NFA APIs. This enables `alloc`. * **nfa-pikevm** - Enables the PikeVM regex engine. This enables `nfa-thompson`. * **nfa-backtrack** - Enables the bounded backtracker regex engine. This enables `nfa-thompson`. * **dfa** - Enables all DFA related features below. * **dfa-build** - Enables APIs for determinizing DFAs from NFAs. This enables `nfa-thompson` and `dfa-search`. * **dfa-search** - Enables APIs for searching with DFAs. * **dfa-onepass** - Enables the one-pass DFA API. This enables `nfa-thompson`. * **hybrid** - Enables the hybrid NFA/DFA or "lazy DFA" regex engine. This enables `alloc` and `nfa-thompson`. */ // We are no_std. #![no_std] // All APIs need docs! #![deny(missing_docs)] // Some intra-doc links are broken when certain features are disabled, so we // only bleat about it when most (all?) features are enabled. But when we do, // we block the build. Links need to work. 
#![cfg_attr( all( feature = "std", feature = "nfa", feature = "dfa", feature = "hybrid" ), deny(rustdoc::broken_intra_doc_links) )] // Broken rustdoc links are very easy to come by when you start disabling // features. Namely, features tend to change imports, and imports change what's // available to link to. // // Basically, we just don't support rustdoc for anything other than the maximal // feature configuration. Other configurations will work, they just won't be // perfect. // // So here, we specifically allow them so we don't even get warned about them. #![cfg_attr( not(all( feature = "std", feature = "nfa", feature = "dfa", feature = "hybrid" )), allow(rustdoc::broken_intra_doc_links) )] // Kinda similar, but eliminating all of the dead code and unused import // warnings for every feature combo is a fool's errand. Instead, we just // suppress those, but still let them through in a common configuration when we // build most of everything. // // This does actually suggest that when features are disabled, we are actually // compiling more code than we need to be. And this is perhaps not so great // because disabling features is usually done in order to reduce compile times // by reducing the amount of code one compiles... However, usually, most of the // time this dead code is a relatively small amount from the 'util' module. // But... I confess... There isn't a ton of visibility on this. // // I'm happy to try to address this in a different way, but "let's annotate // every function in 'util' with some non-local combination of features" just // cannot be the way forward. #![cfg_attr( not(all( feature = "std", feature = "nfa", feature = "dfa", feature = "hybrid", feature = "perf-literal-substring", feature = "perf-literal-multisubstring", )), allow(dead_code, unused_imports, unused_variables) )] // We generally want all types to impl Debug. #![warn(missing_debug_implementations)] // No clue why this thing is still unstable because it's pretty amazing. 
This // adds Cargo feature annotations to items in the rustdoc output. Which is // sadly hugely beneficial for this crate due to the number of features. #![cfg_attr(docsrs, feature(doc_auto_cfg))] // I have literally never tested this crate on 16-bit, so it is quite // suspicious to advertise support for it. But... the regex crate, at time // of writing, at least claims to support it by not doing any conditional // compilation based on the target pointer width. So I guess I remain // consistent with that here. // // If you are here because you're on a 16-bit system and you were somehow using // the regex crate previously, please file an issue. Please be prepared to // provide some kind of reproduction or carve out some path to getting 16-bit // working in CI. (Via qemu?) #[cfg(not(any( target_pointer_width = "16", target_pointer_width = "32", target_pointer_width = "64" )))] compile_error!("not supported on non-{16,32,64}, please file an issue"); #[cfg(any(test, feature = "std"))] extern crate std; #[cfg(feature = "alloc")] extern crate alloc; #[cfg(doctest)] doc_comment::doctest!("../README.md"); #[doc(inline)] pub use crate::util::primitives::PatternID; pub use crate::util::search::*; #[macro_use] mod macros; #[cfg(any(feature = "dfa-search", feature = "dfa-onepass"))] pub mod dfa; #[cfg(feature = "hybrid")] pub mod hybrid; #[cfg(feature = "meta")] pub mod meta; #[cfg(feature = "nfa-thompson")] pub mod nfa; pub mod util; regex-automata-0.4.9/src/macros.rs000064400000000000000000000006471046102023000152110ustar 00000000000000// Some feature combinations result in some of these macros never being used. // Which is fine. Just squash the warnings. #![allow(unused_macros)] macro_rules! log { ($($tt:tt)*) => { #[cfg(feature = "logging")] { $($tt)* } } } macro_rules! debug { ($($tt:tt)*) => { log!(log::debug!($($tt)*)) } } macro_rules! 
trace { ($($tt:tt)*) => { log!(log::trace!($($tt)*)) } } regex-automata-0.4.9/src/meta/error.rs000064400000000000000000000206361046102023000160040ustar 00000000000000use regex_syntax::{ast, hir}; use crate::{nfa, util::search::MatchError, PatternID}; /// An error that occurs when construction of a `Regex` fails. /// /// A build error is generally a result of one of two possible failure /// modes. First is a parse or syntax error in the concrete syntax of a /// pattern. Second is that the construction of the underlying regex matcher /// fails, usually because it gets too big with respect to limits like /// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit). /// /// This error provides very little introspection capabilities. You can: /// /// * Ask for the [`PatternID`] of the pattern that caused an error, if one /// is available. This is available for things like syntax errors, but not for /// cases where build limits are exceeded. /// * Ask for the underlying syntax error, but only if the error is a syntax /// error. /// * Ask for a human readable message corresponding to the underlying error. /// * The `BuildError::source` method (from the `std::error::Error` /// trait implementation) may be used to query for an underlying error if one /// exists. There are no API guarantees about which error is returned. /// /// When the `std` feature is enabled, this implements `std::error::Error`. #[derive(Clone, Debug)] pub struct BuildError { kind: BuildErrorKind, } #[derive(Clone, Debug)] enum BuildErrorKind { Syntax { pid: PatternID, err: regex_syntax::Error }, NFA(nfa::thompson::BuildError), } impl BuildError { /// If it is known which pattern ID caused this build error to occur, then /// this method returns it. /// /// Some errors are not associated with a particular pattern. However, any /// errors that occur as part of parsing a pattern are guaranteed to be /// associated with a pattern ID. 
/// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, PatternID}; /// /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err(); /// assert_eq!(Some(PatternID::must(2)), err.pattern()); /// ``` pub fn pattern(&self) -> Option { match self.kind { BuildErrorKind::Syntax { pid, .. } => Some(pid), _ => None, } } /// If this error occurred because the regex exceeded the configured size /// limit before being built, then this returns the configured size limit. /// /// The limit returned is what was configured, and corresponds to the /// maximum amount of heap usage in bytes. pub fn size_limit(&self) -> Option { match self.kind { BuildErrorKind::NFA(ref err) => err.size_limit(), _ => None, } } /// If this error corresponds to a syntax error, then a reference to it is /// returned by this method. pub fn syntax_error(&self) -> Option<®ex_syntax::Error> { match self.kind { BuildErrorKind::Syntax { ref err, .. } => Some(err), _ => None, } } pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError { let err = regex_syntax::Error::from(err); BuildError { kind: BuildErrorKind::Syntax { pid, err } } } pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError { let err = regex_syntax::Error::from(err); BuildError { kind: BuildErrorKind::Syntax { pid, err } } } pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError { BuildError { kind: BuildErrorKind::NFA(err) } } } #[cfg(feature = "std")] impl std::error::Error for BuildError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self.kind { BuildErrorKind::Syntax { ref err, .. } => Some(err), BuildErrorKind::NFA(ref err) => Some(err), } } } impl core::fmt::Display for BuildError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self.kind { BuildErrorKind::Syntax { pid, .. 
} => { write!(f, "error parsing pattern {}", pid.as_usize()) } BuildErrorKind::NFA(_) => write!(f, "error building NFA"), } } } /// An error that occurs when a search should be retried. /// /// This retry error distinguishes between two different failure modes. /// /// The first is one where potential quadratic behavior has been detected. /// In this case, whatever optimization that led to this behavior should be /// stopped, and the next best strategy should be used. /// /// The second indicates that the underlying regex engine has failed for some /// reason. This usually occurs because either a lazy DFA's cache has become /// ineffective or because a non-ASCII byte has been seen *and* a Unicode word /// boundary was used in one of the patterns. In this failure case, a different /// regex engine that won't fail in these ways (PikeVM, backtracker or the /// one-pass DFA) should be used. /// /// This is an internal error only and should never bleed into the public /// API. #[derive(Debug)] pub(crate) enum RetryError { Quadratic(RetryQuadraticError), Fail(RetryFailError), } #[cfg(feature = "std")] impl std::error::Error for RetryError {} impl core::fmt::Display for RetryError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match *self { RetryError::Quadratic(ref err) => err.fmt(f), RetryError::Fail(ref err) => err.fmt(f), } } } impl From for RetryError { fn from(merr: MatchError) -> RetryError { RetryError::Fail(RetryFailError::from(merr)) } } /// An error that occurs when potential quadratic behavior has been detected /// when applying either the "reverse suffix" or "reverse inner" optimizations. /// /// When this error occurs, callers should abandon the "reverse" optimization /// and use a normal forward search. 
#[derive(Debug)] pub(crate) struct RetryQuadraticError(()); impl RetryQuadraticError { pub(crate) fn new() -> RetryQuadraticError { RetryQuadraticError(()) } } #[cfg(feature = "std")] impl std::error::Error for RetryQuadraticError {} impl core::fmt::Display for RetryQuadraticError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "regex engine gave up to avoid quadratic behavior") } } impl From for RetryError { fn from(err: RetryQuadraticError) -> RetryError { RetryError::Quadratic(err) } } /// An error that occurs when a regex engine "gives up" for some reason before /// finishing a search. Usually this occurs because of heuristic Unicode word /// boundary support or because of ineffective cache usage in the lazy DFA. /// /// When this error occurs, callers should retry the regex search with a /// different regex engine. /// /// Note that this has convenient `From` impls that will automatically /// convert a `MatchError` into this error. This works because the meta /// regex engine internals guarantee that errors like `HaystackTooLong` and /// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and /// `GaveUp`, which both correspond to this "failure" error. #[derive(Debug)] pub(crate) struct RetryFailError { offset: usize, } impl RetryFailError { pub(crate) fn from_offset(offset: usize) -> RetryFailError { RetryFailError { offset } } } #[cfg(feature = "std")] impl std::error::Error for RetryFailError {} impl core::fmt::Display for RetryFailError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!(f, "regex engine failed at offset {:?}", self.offset) } } impl From for RetryError { fn from(err: RetryFailError) -> RetryError { RetryError::Fail(err) } } impl From for RetryFailError { fn from(merr: MatchError) -> RetryFailError { use crate::util::search::MatchErrorKind::*; match *merr.kind() { Quit { offset, .. 
} => RetryFailError::from_offset(offset), GaveUp { offset } => RetryFailError::from_offset(offset), // These can never occur because we avoid them by construction // or with higher level control flow logic. For example, the // backtracker's wrapper will never hand out a backtracker engine // when the haystack would be too long. HaystackTooLong { .. } | UnsupportedAnchored { .. } => { unreachable!("found impossible error in meta engine: {}", merr) } } } } regex-automata-0.4.9/src/meta/limited.rs000064400000000000000000000240401046102023000162730ustar 00000000000000/*! This module defines two bespoke reverse DFA searching routines. (One for the lazy DFA and one for the fully compiled DFA.) These routines differ from the usual ones by permitting the caller to specify a minimum starting position. That is, the search will begin at `input.end()` and will usually stop at `input.start()`, unless `min_start > input.start()`, in which case, the search will stop at `min_start`. In other words, this lets you say, "no, the search must not extend past this point, even if it's within the bounds of the given `Input`." And if the search *does* want to go past that point, it stops and returns a "may be quadratic" error, which indicates that the caller should retry using some other technique. These routines specifically exist to protect against quadratic behavior when employing the "reverse suffix" and "reverse inner" optimizations. Without the backstop these routines provide, it is possible for parts of the haystack to get re-scanned over and over again. The backstop not only prevents this, but *tells you when it is happening* so that you can change the strategy. Why can't we just use the normal search routines? We could use the normal search routines and just set the start bound on the provided `Input` to our `min_start` position. 
The problem here is that it's impossible to distinguish between "no match because we reached the end of input" and "determined there was no match well before the end of input." The former case is what we care about with respect to quadratic behavior. The latter case is totally fine. Why don't we modify the normal search routines to report the position at which the search stops? I considered this, and I still wonder if it is indeed the right thing to do. However, I think the straight-forward thing to do there would be to complicate the return type signature of almost every search routine in this crate, which I really do not want to do. It therefore might make more sense to provide a richer way for search routines to report meta data, but that was beyond my bandwidth to work on at the time of writing. See the 'opt/reverse-inner' and 'opt/reverse-suffix' benchmarks in rebar for a real demonstration of how quadratic behavior is mitigated. */ use crate::{ meta::error::{RetryError, RetryQuadraticError}, HalfMatch, Input, MatchError, }; #[cfg(feature = "dfa-build")] pub(crate) fn dfa_try_search_half_rev( dfa: &crate::dfa::dense::DFA>, input: &Input<'_>, min_start: usize, ) -> Result, RetryError> { use crate::dfa::Automaton; let mut mat = None; let mut sid = dfa.start_state_reverse(input)?; if input.start() == input.end() { dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?; return Ok(mat); } let mut at = input.end() - 1; loop { sid = dfa.next_state(sid, input.haystack()[at]); if dfa.is_special_state(sid) { if dfa.is_match_state(sid) { let pattern = dfa.match_pattern(sid, 0); // Since reverse searches report the beginning of a // match and the beginning is inclusive (not exclusive // like the end of a match), we add 1 to make it // inclusive. 
mat = Some(HalfMatch::new(pattern, at + 1)); } else if dfa.is_dead_state(sid) { return Ok(mat); } else if dfa.is_quit_state(sid) { return Err(MatchError::quit(input.haystack()[at], at).into()); } } if at == input.start() { break; } at -= 1; if at < min_start { trace!( "reached position {} which is before the previous literal \ match, quitting to avoid quadratic behavior", at, ); return Err(RetryError::Quadratic(RetryQuadraticError::new())); } } let was_dead = dfa.is_dead_state(sid); dfa_eoi_rev(dfa, input, &mut sid, &mut mat)?; // If we reach the beginning of the search and we could otherwise still // potentially keep matching if there was more to match, then we actually // return an error to indicate giving up on this optimization. Why? Because // we can't prove that the real match begins at where we would report it. // // This only happens when all of the following are true: // // 1) We reach the starting point of our search span. // 2) The match we found is before the starting point. // 3) The FSM reports we could possibly find a longer match. // // We need (1) because otherwise the search stopped before the starting // point and there is no possible way to find a more leftmost position. // // We need (2) because if the match found has an offset equal to the minimum // possible offset, then there is no possible more leftmost match. // // We need (3) because if the FSM couldn't continue anyway (i.e., it's in // a dead state), then we know we couldn't find anything more leftmost // than what we have. (We have to check the state we were in prior to the // EOI transition since the EOI transition will usually bring us to a dead // state by virtue of it represents the end-of-input.) 
if at == input.start() && mat.map_or(false, |m| m.offset() > input.start()) && !was_dead { trace!( "reached beginning of search at offset {} without hitting \ a dead state, quitting to avoid potential false positive match", at, ); return Err(RetryError::Quadratic(RetryQuadraticError::new())); } Ok(mat) } #[cfg(feature = "hybrid")] pub(crate) fn hybrid_try_search_half_rev( dfa: &crate::hybrid::dfa::DFA, cache: &mut crate::hybrid::dfa::Cache, input: &Input<'_>, min_start: usize, ) -> Result, RetryError> { let mut mat = None; let mut sid = dfa.start_state_reverse(cache, input)?; if input.start() == input.end() { hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; return Ok(mat); } let mut at = input.end() - 1; loop { sid = dfa .next_state(cache, sid, input.haystack()[at]) .map_err(|_| MatchError::gave_up(at))?; if sid.is_tagged() { if sid.is_match() { let pattern = dfa.match_pattern(cache, sid, 0); // Since reverse searches report the beginning of a // match and the beginning is inclusive (not exclusive // like the end of a match), we add 1 to make it // inclusive. mat = Some(HalfMatch::new(pattern, at + 1)); } else if sid.is_dead() { return Ok(mat); } else if sid.is_quit() { return Err(MatchError::quit(input.haystack()[at], at).into()); } } if at == input.start() { break; } at -= 1; if at < min_start { trace!( "reached position {} which is before the previous literal \ match, quitting to avoid quadratic behavior", at, ); return Err(RetryError::Quadratic(RetryQuadraticError::new())); } } let was_dead = sid.is_dead(); hybrid_eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; // See the comments in the full DFA routine above for why we need this. 
if at == input.start() && mat.map_or(false, |m| m.offset() > input.start()) && !was_dead { trace!( "reached beginning of search at offset {} without hitting \ a dead state, quitting to avoid potential false positive match", at, ); return Err(RetryError::Quadratic(RetryQuadraticError::new())); } Ok(mat) } #[cfg(feature = "dfa-build")] #[cfg_attr(feature = "perf-inline", inline(always))] fn dfa_eoi_rev( dfa: &crate::dfa::dense::DFA>, input: &Input<'_>, sid: &mut crate::util::primitives::StateID, mat: &mut Option, ) -> Result<(), MatchError> { use crate::dfa::Automaton; let sp = input.get_span(); if sp.start > 0 { let byte = input.haystack()[sp.start - 1]; *sid = dfa.next_state(*sid, byte); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { return Err(MatchError::quit(byte, sp.start - 1)); } } else { *sid = dfa.next_eoi_state(*sid); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } // N.B. We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state. 
debug_assert!(!dfa.is_quit_state(*sid)); } Ok(()) } #[cfg(feature = "hybrid")] #[cfg_attr(feature = "perf-inline", inline(always))] fn hybrid_eoi_rev( dfa: &crate::hybrid::dfa::DFA, cache: &mut crate::hybrid::dfa::Cache, input: &Input<'_>, sid: &mut crate::hybrid::LazyStateID, mat: &mut Option, ) -> Result<(), MatchError> { let sp = input.get_span(); if sp.start > 0 { let byte = input.haystack()[sp.start - 1]; *sid = dfa .next_state(cache, *sid, byte) .map_err(|_| MatchError::gave_up(sp.start))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { return Err(MatchError::quit(byte, sp.start - 1)); } } else { *sid = dfa .next_eoi_state(cache, *sid) .map_err(|_| MatchError::gave_up(sp.start))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } // N.B. We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state. debug_assert!(!sid.is_quit()); } Ok(()) } regex-automata-0.4.9/src/meta/literal.rs000064400000000000000000000062751046102023000163120ustar 00000000000000use alloc::{vec, vec::Vec}; use regex_syntax::hir::Hir; use crate::{meta::regex::RegexInfo, util::search::MatchKind}; /// Pull out an alternation of literals from the given sequence of HIR /// expressions. /// /// There are numerous ways for this to fail. Generally, this only applies /// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there /// are "too few" alternates, in which case, the regex engine is likely faster. /// /// And currently, this only returns something when 'hirs.len() == 1'. pub(crate) fn alternation_literals( info: &RegexInfo, hirs: &[&Hir], ) -> Option>> { use regex_syntax::hir::{HirKind, Literal}; // Might as well skip the work below if we know we can't build an // Aho-Corasick searcher. 
if !cfg!(feature = "perf-literal-multisubstring") { return None; } // This is pretty hacky, but basically, if `is_alternation_literal` is // true, then we can make several assumptions about the structure of our // HIR. This is what justifies the `unreachable!` statements below. if hirs.len() != 1 || !info.props()[0].look_set().is_empty() || info.props()[0].explicit_captures_len() > 0 || !info.props()[0].is_alternation_literal() || info.config().get_match_kind() != MatchKind::LeftmostFirst { return None; } let hir = &hirs[0]; let alts = match *hir.kind() { HirKind::Alternation(ref alts) => alts, _ => return None, // one literal isn't worth it }; let mut lits = vec![]; for alt in alts { let mut lit = vec![]; match *alt.kind() { HirKind::Literal(Literal(ref bytes)) => { lit.extend_from_slice(bytes) } HirKind::Concat(ref exprs) => { for e in exprs { match *e.kind() { HirKind::Literal(Literal(ref bytes)) => { lit.extend_from_slice(bytes); } _ => unreachable!("expected literal, got {:?}", e), } } } _ => unreachable!("expected literal or concat, got {:?}", alt), } lits.push(lit); } // Why do this? Well, when the number of literals is small, it's likely // that we'll use the lazy DFA which is in turn likely to be faster than // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use // the latter because it is so hungry (in time and space), and the former // is decently fast, but not as fast as a well oiled lazy DFA. // // However, once the number starts getting large, the lazy DFA is likely // to start thrashing because of the modest default cache size. When // exactly does this happen? Dunno. But at whatever point that is (we make // a guess below based on ad hoc benchmarking), we'll want to cut over to // Aho-Corasick, where even the contiguous NFA is likely to do much better. 
if lits.len() < 3000 { debug!("skipping Aho-Corasick because there are too few literals"); return None; } Some(lits) } regex-automata-0.4.9/src/meta/mod.rs000064400000000000000000000052321046102023000154250ustar 00000000000000/*! Provides a regex matcher that composes several other regex matchers automatically. This module is home to a meta [`Regex`], which provides a convenient high level API for executing regular expressions in linear time. # Comparison with the `regex` crate A meta `Regex` is the implementation used directly by the `regex` crate. Indeed, the `regex` crate API is essentially just a light wrapper over a meta `Regex`. This means that if you need the full flexibility offered by this API, then you should be able to switch to using this API directly without any changes in match semantics or syntax. However, there are some API level differences: * The `regex` crate API returns match objects that include references to the haystack itself, which in turn makes it easy to access the matching strings without having to slice the haystack yourself. In contrast, a meta `Regex` returns match objects that only have offsets in them. * At time of writing, a meta `Regex` doesn't have some of the convenience routines that the `regex` crate has, such as replacements. Note though that [`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string) will handle the replacement string interpolation for you. * A meta `Regex` supports the [`Input`](crate::Input) abstraction, which provides a way to configure a search in more ways than is supported by the `regex` crate. For example, [`Input::anchored`](crate::Input::anchored) can be used to run an anchored search, regardless of whether the pattern is itself anchored with a `^`. * A meta `Regex` supports multi-pattern searching everywhere. Indeed, every [`Match`](crate::Match) returned by the search APIs include a [`PatternID`](crate::PatternID) indicating which pattern matched. 
In the single pattern case, all matches correspond to [`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate has distinct `Regex` and a `RegexSet` APIs. The former only supports a single pattern, while the latter supports multiple patterns but cannot report the offsets of a match. * A meta `Regex` provides the explicit capability of bypassing its internal memory pool for automatically acquiring mutable scratch space required by its internal regex engines. Namely, a [`Cache`] can be explicitly provided to lower level routines such as [`Regex::search_with`]. */ pub use self::{ error::BuildError, regex::{ Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split, SplitN, }, }; mod error; #[cfg(any(feature = "dfa-build", feature = "hybrid"))] mod limited; mod literal; mod regex; mod reverse_inner; #[cfg(any(feature = "dfa-build", feature = "hybrid"))] mod stopat; mod strategy; mod wrappers; regex-automata-0.4.9/src/meta/regex.rs000064400000000000000000004246521046102023000157730ustar 00000000000000use core::{ borrow::Borrow, panic::{RefUnwindSafe, UnwindSafe}, }; use alloc::{boxed::Box, sync::Arc, vec, vec::Vec}; use regex_syntax::{ ast, hir::{self, Hir}, }; use crate::{ meta::{ error::BuildError, strategy::{self, Strategy}, wrappers, }, nfa::thompson::WhichCaptures, util::{ captures::{Captures, GroupInfo}, iter, pool::{Pool, PoolGuard}, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID}, search::{HalfMatch, Input, Match, MatchKind, PatternSet, Span}, }, }; /// A type alias for our pool of meta::Cache that fixes the type parameters to /// what we use for the meta regex below. type CachePool = Pool; /// Same as above, but for the guard returned by a pool. type CachePoolGuard<'a> = PoolGuard<'a, Cache, CachePoolFn>; /// The type of the closure we use to create new caches. We need to spell out /// all of the marker traits or else we risk leaking !MARKER impls. 
type CachePoolFn = Box Cache + Send + Sync + UnwindSafe + RefUnwindSafe>; /// A regex matcher that works by composing several other regex matchers /// automatically. /// /// In effect, a meta regex papers over a lot of the quirks or performance /// problems in each of the regex engines in this crate. Its goal is to provide /// an infallible and simple API that "just does the right thing" in the common /// case. /// /// A meta regex is the implementation of a `Regex` in the `regex` crate. /// Indeed, the `regex` crate API is essentially just a light wrapper over /// this type. This includes the `regex` crate's `RegexSet` API! /// /// # Composition /// /// This is called a "meta" matcher precisely because it uses other regex /// matchers to provide a convenient high level regex API. Here are some /// examples of how other regex matchers are composed: /// /// * When calling [`Regex::captures`], instead of immediately /// running a slower but more capable regex engine like the /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM), the meta regex engine /// will usually first look for the bounds of a match with a higher throughput /// regex engine like a [lazy DFA](crate::hybrid). Only when a match is found /// is a slower engine like `PikeVM` used to find the matching span for each /// capture group. /// * While higher throughout engines like the lazy DFA cannot handle /// Unicode word boundaries in general, they can still be used on pure ASCII /// haystacks by pretending that Unicode word boundaries are just plain ASCII /// word boundaries. However, if a haystack is not ASCII, the meta regex engine /// will automatically switch to a (possibly slower) regex engine that supports /// Unicode word boundaries in general. /// * In some cases where a regex pattern is just a simple literal or a small /// set of literals, an actual regex engine won't be used at all. Instead, /// substring or multi-substring search algorithms will be employed. 
/// /// There are many other forms of composition happening too, but the above /// should give a general idea. In particular, it may perhaps be surprising /// that *multiple* regex engines might get executed for a single search. That /// is, the decision of what regex engine to use is not _just_ based on the /// pattern, but also based on the dynamic execution of the search itself. /// /// The primary reason for this composition is performance. The fundamental /// tension is that the faster engines tend to be less capable, and the more /// capable engines tend to be slower. /// /// Note that the forms of composition that are allowed are determined by /// compile time crate features and configuration. For example, if the `hybrid` /// feature isn't enabled, or if [`Config::hybrid`] has been disabled, then the /// meta regex engine will never use a lazy DFA. /// /// # Synchronization and cloning /// /// Most of the regex engines in this crate require some kind of mutable /// "scratch" space to read and write from while performing a search. Since /// a meta regex composes these regex engines, a meta regex also requires /// mutable scratch space. This scratch space is called a [`Cache`]. /// /// Most regex engines _also_ usually have a read-only component, typically /// a [Thompson `NFA`](crate::nfa::thompson::NFA). /// /// In order to make the `Regex` API convenient, most of the routines hide /// the fact that a `Cache` is needed at all. To achieve this, a [memory /// pool](crate::util::pool::Pool) is used internally to retrieve `Cache` /// values in a thread safe way that also permits reuse. This in turn implies /// that every such search call requires some form of synchronization. Usually /// this synchronization is fast enough to not notice, but in some cases, it /// can be a bottleneck. 
This typically occurs when all of the following are /// true: /// /// * The same `Regex` is shared across multiple threads simultaneously, /// usually via a [`util::lazy::Lazy`](crate::util::lazy::Lazy) or something /// similar from the `once_cell` or `lazy_static` crates. /// * The primary unit of work in each thread is a regex search. /// * Searches are run on very short haystacks. /// /// This particular case can lead to high contention on the pool used by a /// `Regex` internally, which can in turn increase latency to a noticeable /// effect. This cost can be mitigated in one of the following ways: /// /// * Use a distinct copy of a `Regex` in each thread, usually by cloning it. /// Cloning a `Regex` _does not_ do a deep copy of its read-only component. /// But it does lead to each `Regex` having its own memory pool, which in /// turn eliminates the problem of contention. In general, this technique should /// not result in any additional memory usage when compared to sharing the same /// `Regex` across multiple threads simultaneously. /// * Use lower level APIs, like [`Regex::search_with`], which permit passing /// a `Cache` explicitly. In this case, it is up to you to determine how best /// to provide a `Cache`. For example, you might put a `Cache` in thread-local /// storage if your use case allows for it. /// /// Overall, this is an issue that happens rarely in practice, but it can /// happen. /// /// # Warning: spin-locks may be used in alloc-only mode /// /// When this crate is built without the `std` feature and the high level APIs /// on a `Regex` are used, then a spin-lock will be used to synchronize access /// to an internal pool of `Cache` values. This may be undesirable because /// a spin-lock is [effectively impossible to implement correctly in user /// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could /// result in a deadlock. 
/// /// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html /// /// If one wants to avoid the use of spin-locks when the `std` feature is /// disabled, then you must use APIs that accept a `Cache` value explicitly. /// For example, [`Regex::search_with`]. /// /// # Example /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?; /// assert!(re.is_match("2010-03-14")); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: anchored search /// /// This example shows how to use [`Input::anchored`] to run an anchored /// search, even when the regex pattern itself isn't anchored. An anchored /// search guarantees that if a match is found, then the start offset of the /// match corresponds to the offset at which the search was started. /// /// ``` /// use regex_automata::{meta::Regex, Anchored, Input, Match}; /// /// let re = Regex::new(r"\bfoo\b")?; /// let input = Input::new("xx foo xx").range(3..).anchored(Anchored::Yes); /// // The offsets are in terms of the original haystack. /// assert_eq!(Some(Match::must(0, 3..6)), re.find(input)); /// /// // Notice that no match occurs here, because \b still takes the /// // surrounding context into account, even if it means looking back /// // before the start of your search. /// let hay = "xxfoo xx"; /// let input = Input::new(hay).range(2..).anchored(Anchored::Yes); /// assert_eq!(None, re.find(input)); /// // Indeed, you cannot achieve the above by simply slicing the /// // haystack itself, since the regex engine can't see the /// // surrounding context. This is why 'Input' permits setting /// // the bounds of a search! /// let input = Input::new(&hay[2..]).anchored(Anchored::Yes); /// // WRONG! 
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(input)); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: earliest search /// /// This example shows how to use [`Input::earliest`] to run a search that /// might stop before finding the typical leftmost match. /// /// ``` /// use regex_automata::{meta::Regex, Anchored, Input, Match}; /// /// let re = Regex::new(r"[a-z]{3}|b")?; /// let input = Input::new("abc").earliest(true); /// assert_eq!(Some(Match::must(0, 1..2)), re.find(input)); /// /// // Note that "earliest" isn't really a match semantic unto itself. /// // Instead, it is merely an instruction to whatever regex engine /// // gets used internally to quit as soon as it can. For example, /// // this regex uses a different search technique, and winds up /// // producing a different (but valid) match! /// let re = Regex::new(r"abc|b")?; /// let input = Input::new("abc").earliest(true); /// assert_eq!(Some(Match::must(0, 0..3)), re.find(input)); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: change the line terminator /// /// This example shows how to enable multi-line mode by default and change /// the line terminator to the NUL byte: /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().multi_line(true)) /// .configure(Regex::config().line_terminator(b'\x00')) /// .build(r"^foo$")?; /// let hay = "\x00foo\x00"; /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Debug)] pub struct Regex { /// The actual regex implementation. imp: Arc, /// A thread safe pool of caches. /// /// For the higher level search APIs, a `Cache` is automatically plucked /// from this pool before running a search. The lower level `with` methods /// permit the caller to provide their own cache, thereby bypassing /// accesses to this pool. 
/// /// Note that we put this outside the `Arc` so that cloning a `Regex` /// results in creating a fresh `CachePool`. This in turn permits callers /// to clone regexes into separate threads where each such regex gets /// the pool's "thread owner" optimization. Otherwise, if one shares the /// `Regex` directly, then the pool will go through a slower mutex path for /// all threads except for the "owner." pool: CachePool, } /// The internal implementation of `Regex`, split out so that it can be wrapped /// in an `Arc`. #[derive(Debug)] struct RegexI { /// The core matching engine. /// /// Why is this reference counted when RegexI is already wrapped in an Arc? /// Well, we need to capture this in a closure to our `Pool` below in order /// to create new `Cache` values when needed. So since it needs to be in /// two places, we make it reference counted. /// /// We make `RegexI` itself reference counted too so that `Regex` itself /// stays extremely small and very cheap to clone. strat: Arc, /// Metadata about the regexes driving the strategy. The metadata is also /// usually stored inside the strategy too, but we put it here as well /// so that we can get quick access to it (without virtual calls) before /// executing the regex engine. For example, we use this metadata to /// detect a subset of cases where we know a match is impossible, and can /// thus avoid calling into the strategy at all. /// /// Since `RegexInfo` is stored in multiple places, it is also reference /// counted. info: RegexInfo, } /// Convenience constructors for a `Regex` using the default configuration. impl Regex { /// Builds a `Regex` from a single pattern string using the default /// configuration. /// /// If there was a problem parsing the pattern or a problem turning it into /// a regex matcher, then an error is returned. /// /// If you want to change the configuration of a `Regex`, use a [`Builder`] /// with a [`Config`]. 
/// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::new(r"(?Rm)^foo$")?; /// let hay = "\r\nfoo\r\n"; /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` pub fn new(pattern: &str) -> Result { Regex::builder().build(pattern) } /// Builds a `Regex` from many pattern strings using the default /// configuration. /// /// If there was a problem parsing any of the patterns or a problem turning /// them into a regex matcher, then an error is returned. /// /// If you want to change the configuration of a `Regex`, use a [`Builder`] /// with a [`Config`]. /// /// # Example: simple lexer /// /// This simplistic example leverages the multi-pattern support to build a /// simple little lexer. The pattern ID in the match tells you which regex /// matched, which in turn might be used to map back to the "type" of the /// token returned by the lexer. /// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::new_many(&[ /// r"[[:space:]]", /// r"[A-Za-z0-9][A-Za-z0-9_]+", /// r"->", /// r".", /// ])?; /// let haystack = "fn is_boss(bruce: i32, springsteen: String) -> bool;"; /// let matches: Vec = re.find_iter(haystack).collect(); /// assert_eq!(matches, vec![ /// Match::must(1, 0..2), // 'fn' /// Match::must(0, 2..3), // ' ' /// Match::must(1, 3..10), // 'is_boss' /// Match::must(3, 10..11), // '(' /// Match::must(1, 11..16), // 'bruce' /// Match::must(3, 16..17), // ':' /// Match::must(0, 17..18), // ' ' /// Match::must(1, 18..21), // 'i32' /// Match::must(3, 21..22), // ',' /// Match::must(0, 22..23), // ' ' /// Match::must(1, 23..34), // 'springsteen' /// Match::must(3, 34..35), // ':' /// Match::must(0, 35..36), // ' ' /// Match::must(1, 36..42), // 'String' /// Match::must(3, 42..43), // ')' /// Match::must(0, 43..44), // ' ' /// Match::must(2, 44..46), // '->' /// Match::must(0, 46..47), // ' ' /// Match::must(1, 47..51), // 'bool' /// Match::must(3, 51..52), // ';' 
/// ]); /// /// # Ok::<(), Box>(()) /// ``` /// /// One can write a lexer like the above using a regex like /// `(?P[[:space:]])|(?P[A-Za-z0-9][A-Za-z0-9_]+)|...`, /// but then you need to ask whether capture group matched to determine /// which branch in the regex matched, and thus, which token the match /// corresponds to. In contrast, the above example includes the pattern ID /// in the match. There's no need to use capture groups at all. /// /// # Example: finding the pattern that caused an error /// /// When a syntax error occurs, it is possible to ask which pattern /// caused the syntax error. /// /// ``` /// use regex_automata::{meta::Regex, PatternID}; /// /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err(); /// assert_eq!(Some(PatternID::must(2)), err.pattern()); /// ``` /// /// # Example: zero patterns is valid /// /// Building a regex with zero patterns results in a regex that never /// matches anything. Because this routine is generic, passing an empty /// slice usually requires a turbo-fish (or something else to help type /// inference). /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::new_many::<&str>(&[])?; /// assert_eq!(None, re.find("")); /// /// # Ok::<(), Box>(()) /// ``` pub fn new_many>( patterns: &[P], ) -> Result { Regex::builder().build_many(patterns) } /// Return a default configuration for a `Regex`. /// /// This is a convenience routine to avoid needing to import the [`Config`] /// type when customizing the construction of a `Regex`. /// /// # Example: lower the NFA size limit /// /// In some cases, the default size limit might be too big. The size limit /// can be lowered, which will prevent large regex patterns from compiling. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::meta::Regex; /// /// let result = Regex::builder() /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) /// // Not even 20KB is enough to build a single large Unicode class! /// .build(r"\pL"); /// assert!(result.is_err()); /// /// # Ok::<(), Box>(()) /// ``` pub fn config() -> Config { Config::new() } /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the /// [`Builder`] type in common cases. /// /// # Example: change the line terminator /// /// This example shows how to enable multi-line mode by default and change /// the line terminator to the NUL byte: /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().multi_line(true)) /// .configure(Regex::config().line_terminator(b'\x00')) /// .build(r"^foo$")?; /// let hay = "\x00foo\x00"; /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` pub fn builder() -> Builder { Builder::new() } } /// High level convenience routines for using a regex to search a haystack. impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. (Consider how this might make /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`. /// This routine _may_ stop after it sees the first `a`, but routines like /// `find` need to continue searching because `+` is greedy by default.) 
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::meta::Regex;
    ///
    /// let re = Regex::new("foo[0-9]+bar")?;
    ///
    /// assert!(re.is_match("foo12345bar"));
    /// assert!(!re.is_match("foobar"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: consistency with search APIs
    ///
    /// `is_match` is guaranteed to return `true` whenever `find` returns a
    /// match. This includes searches that are executed entirely within a
    /// codepoint:
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Input};
    ///
    /// let re = Regex::new("a*")?;
    ///
    /// // This doesn't match because the default configuration bans empty
    /// // matches from splitting a codepoint.
    /// assert!(!re.is_match(Input::new("☃").span(1..2)));
    /// assert_eq!(None, re.find(Input::new("☃").span(1..2)));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Notice that when UTF-8 mode is disabled, then the above reports a
    /// match because the restriction against zero-width matches that split a
    /// codepoint has been lifted:
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Input, Match};
    ///
    /// let re = Regex::builder()
    ///     .configure(Regex::config().utf8_empty(false))
    ///     .build("a*")?;
    ///
    /// assert!(re.is_match(Input::new("☃").span(1..2)));
    /// assert_eq!(
    ///     Some(Match::must(0, 1..1)),
    ///     re.find(Input::new("☃").span(1..2)),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// A similar idea applies when using line anchors with CRLF mode enabled,
    /// which prevents them from matching between a `\r` and a `\n`.
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Input, Match};
    ///
    /// let re = Regex::new(r"(?Rm:$)")?;
    /// assert!(!re.is_match(Input::new("\r\n").span(1..1)));
    /// // A regular line anchor, which only considers \n as a
    /// // line terminator, will match.
    /// let re = Regex::new(r"(?m:$)")?;
    /// assert!(re.is_match(Input::new("\r\n").span(1..1)));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn is_match<'h, I: Into<Input<'h>>>(&self, input: I) -> bool {
        // Only a yes/no answer is needed, so let the engine stop as soon as
        // it knows a match exists.
        let input = input.into().earliest(true);
        if self.imp.info.is_impossible(&input) {
            return false;
        }
        let mut guard = self.pool.get();
        let result = self.imp.strat.is_match(&mut guard, &input);
        // See 'Regex::search' for why we put the guard back explicitly.
        PoolGuard::put(guard);
        result
    }

    /// Executes a leftmost search and returns the first match that is found,
    /// if one exists.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Match};
    ///
    /// let re = Regex::new("foo[0-9]+")?;
    /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn find<'h, I: Into<Input<'h>>>(&self, input: I) -> Option<Match> {
        self.search(&input.into())
    }

    /// Executes a leftmost forward search and writes the spans of capturing
    /// groups that participated in a match into the provided [`Captures`]
    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
    /// to return `false`.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Span};
    ///
    /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?;
    /// let mut caps = re.create_captures();
    ///
    /// re.captures("2010-03-14", &mut caps);
    /// assert!(caps.is_match());
    /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
    /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
    /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn captures<'h, I: Into<Input<'h>>>(
        &self,
        input: I,
        caps: &mut Captures,
    ) {
        self.search_captures(&input.into(), caps)
    }

    /// Returns an iterator over all non-overlapping leftmost matches in
    /// the given haystack. If no match exists, then the iterator yields no
    /// elements.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Match};
    ///
    /// let re = Regex::new("foo[0-9]+")?;
    /// let haystack = "foo1 foo12 foo123";
    /// let matches: Vec<Match> = re.find_iter(haystack).collect();
    /// assert_eq!(matches, vec![
    ///     Match::must(0, 0..4),
    ///     Match::must(0, 5..10),
    ///     Match::must(0, 11..17),
    /// ]);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn find_iter<'r, 'h, I: Into<Input<'h>>>(
        &'r self,
        input: I,
    ) -> FindMatches<'r, 'h> {
        // The iterator holds on to one pooled cache for its whole lifetime.
        let cache = self.pool.get();
        let it = iter::Searcher::new(input.into());
        FindMatches { re: self, cache, it }
    }

    /// Returns an iterator over all non-overlapping `Captures` values. If no
    /// match exists, then the iterator yields no elements.
    ///
    /// This yields the same matches as [`Regex::find_iter`], but it includes
    /// the spans of all capturing groups that participate in each match.
    ///
    /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for
    /// how to correctly iterate over all matches in a haystack while avoiding
    /// the creation of a new `Captures` value for every match. (Which you are
    /// forced to do with an `Iterator`.)
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Span};
    ///
    /// let re = Regex::new("foo(?P<numbers>[0-9]+)")?;
    ///
    /// let haystack = "foo1 foo12 foo123";
    /// let matches: Vec<Span> = re
    ///     .captures_iter(haystack)
    ///     // The unwrap is OK since 'numbers' matches if the pattern matches.
    ///     .map(|caps| caps.get_group_by_name("numbers").unwrap())
    ///     .collect();
    /// assert_eq!(matches, vec![
    ///     Span::from(3..4),
    ///     Span::from(8..10),
    ///     Span::from(14..17),
    /// ]);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn captures_iter<'r, 'h, I: Into<Input<'h>>>(
        &'r self,
        input: I,
    ) -> CapturesMatches<'r, 'h> {
        let cache = self.pool.get();
        let caps = self.create_captures();
        let it = iter::Searcher::new(input.into());
        CapturesMatches { re: self, cache, caps, it }
    }

    /// Returns an iterator of spans of the haystack given, delimited by a
    /// match of the regex.
Namely, each element of the iterator corresponds to /// a part of the haystack that *isn't* matched by the regular expression. /// /// # Example /// /// To split a string delimited by arbitrary amounts of spaces or tabs: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"[ \t]+")?; /// let hay = "a b \t c\td e"; /// let fields: Vec<&str> = re.split(hay).map(|span| &hay[span]).collect(); /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: more cases /// /// Basic usage: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r" ")?; /// let hay = "Mary had a little lamb"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]); /// /// let re = Regex::new(r"X")?; /// let hay = ""; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec![""]); /// /// let re = Regex::new(r"X")?; /// let hay = "lionXXtigerXleopard"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]); /// /// let re = Regex::new(r"::")?; /// let hay = "lion::tiger::leopard"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["lion", "tiger", "leopard"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// If a haystack contains multiple contiguous matches, you will end up /// with empty spans yielded by the iterator: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"X")?; /// let hay = "XXXXaXXbXc"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); /// /// let re = Regex::new(r"/")?; /// let hay = "(///)"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["(", "", "", ")"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// 
Separators at the start or end of a haystack are neighbored by empty /// spans. /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"0")?; /// let hay = "010"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "1", ""]); /// /// # Ok::<(), Box>(()) /// ``` /// /// When the empty string is used as a regex, it splits at every valid /// UTF-8 boundary by default (which includes the beginning and end of the /// haystack): /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"")?; /// let hay = "rust"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]); /// /// // Splitting by an empty string is UTF-8 aware by default! /// let re = Regex::new(r"")?; /// let hay = "☃"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "☃", ""]); /// /// # Ok::<(), Box>(()) /// ``` /// /// But note that UTF-8 mode for empty strings can be disabled, which will /// then result in a match at every byte offset in the haystack, /// including between every UTF-8 code unit. /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::builder() /// .configure(Regex::config().utf8_empty(false)) /// .build(r"")?; /// let hay = "☃".as_bytes(); /// let got: Vec<&[u8]> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec![ /// // Writing byte string slices is just brutal. The problem is that /// // b"foo" has type &[u8; 3] instead of &[u8]. /// &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..], /// ]); /// /// # Ok::<(), Box>(()) /// ``` /// /// Contiguous separators (commonly shows up with whitespace), can lead to /// possibly surprising behavior. 
For example, this code is correct: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r" ")?; /// let hay = " a b c"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want /// to match contiguous space characters: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r" +")?; /// let hay = " a b c"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// // N.B. This does still include a leading empty span because ' +' /// // matches at the beginning of the haystack. /// assert_eq!(got, vec!["", "a", "b", "c"]); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn split<'r, 'h, I: Into>>( &'r self, input: I, ) -> Split<'r, 'h> { Split { finder: self.find_iter(input), last: 0 } } /// Returns an iterator of at most `limit` spans of the haystack given, /// delimited by a match of the regex. (A `limit` of `0` will return no /// spans.) Namely, each element of the iterator corresponds to a part /// of the haystack that *isn't* matched by the regular expression. The /// remainder of the haystack that is not split will be the last element in /// the iterator. /// /// # Example /// /// Get the first two words in some haystack: /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"\W+").unwrap(); /// let hay = "Hey! 
How are you?"; /// let fields: Vec<&str> = /// re.splitn(hay, 3).map(|span| &hay[span]).collect(); /// assert_eq!(fields, vec!["Hey", "How", "are you?"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Examples: more cases /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r" ")?; /// let hay = "Mary had a little lamb"; /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]); /// /// let re = Regex::new(r"X")?; /// let hay = ""; /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec![""]); /// /// let re = Regex::new(r"X")?; /// let hay = "lionXXtigerXleopard"; /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]); /// /// let re = Regex::new(r"::")?; /// let hay = "lion::tiger::leopard"; /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["lion", "tiger::leopard"]); /// /// let re = Regex::new(r"X")?; /// let hay = "abcXdef"; /// let got: Vec<&str> = re.splitn(hay, 1).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["abcXdef"]); /// /// let re = Regex::new(r"X")?; /// let hay = "abcdef"; /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["abcdef"]); /// /// let re = Regex::new(r"X")?; /// let hay = "abcXdef"; /// let got: Vec<&str> = re.splitn(hay, 0).map(|sp| &hay[sp]).collect(); /// assert!(got.is_empty()); /// /// # Ok::<(), Box>(()) /// ``` pub fn splitn<'r, 'h, I: Into>>( &'r self, input: I, limit: usize, ) -> SplitN<'r, 'h> { SplitN { splits: self.split(input), limit } } } /// Lower level search routines that give more control. impl Regex { /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// This is like [`Regex::find`] but, but it accepts a concrete `&Input` /// instead of an `Into`. 
/// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, Match}; /// /// let re = Regex::new(r"Samwise|Sam")?; /// let input = Input::new( /// "one of the chief characters, Samwise the Brave", /// ); /// assert_eq!(Some(Match::must(0, 29..36)), re.search(&input)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search(&self, input: &Input<'_>) -> Option { if self.imp.info.is_impossible(input) { return None; } let mut guard = self.pool.get(); let result = self.imp.strat.search(&mut guard, input); // We do this dance with the guard and explicitly put it back in the // pool because it seems to result in better codegen. If we let the // guard's Drop impl put it back in the pool, then functions like // ptr::drop_in_place get called and they *don't* get inlined. This // isn't usually a big deal, but in latency sensitive benchmarks the // extra function call can matter. // // I used `rebar measure -f '^grep/every-line$' -e meta` to measure // the effects here. // // Note that this doesn't eliminate the latency effects of using the // pool. There is still some (minor) cost for the "thread owner" of the // pool. (i.e., The thread that first calls a regex search routine.) // However, for other threads using the regex, the pool access can be // quite expensive as it goes through a mutex. Callers can avoid this // by either cloning the Regex (which creates a distinct copy of the // pool), or callers can use the lower level APIs that accept a 'Cache' // directly and do their own handling. PoolGuard::put(guard); result } /// Returns the end offset of the leftmost match. If no match exists, then /// `None` is returned. /// /// This is distinct from [`Regex::search`] in that it only returns the end /// of a match and not the start of the match. Depending on a variety of /// implementation details, this _may_ permit the regex engine to do less /// overall work. 
For example, if a DFA is being used to execute a search, /// then the start of a match usually requires running a separate DFA in /// reverse to the find the start of a match. If one only needs the end of /// a match, then the separate reverse scan to find the start of a match /// can be skipped. (Note that the reverse scan is avoided even when using /// `Regex::search` when possible, for example, in the case of an anchored /// search.) /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, HalfMatch}; /// /// let re = Regex::new(r"Samwise|Sam")?; /// let input = Input::new( /// "one of the chief characters, Samwise the Brave", /// ); /// assert_eq!(Some(HalfMatch::must(0, 36)), re.search_half(&input)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_half(&self, input: &Input<'_>) -> Option { if self.imp.info.is_impossible(input) { return None; } let mut guard = self.pool.get(); let result = self.imp.strat.search_half(&mut guard, input); // See 'Regex::search' for why we put the guard back explicitly. PoolGuard::put(guard); result } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided [`Captures`] /// value. If no match was found, then [`Captures::is_match`] is guaranteed /// to return `false`. /// /// This is like [`Regex::captures`], but it accepts a concrete `&Input` /// instead of an `Into`. /// /// # Example: specific pattern search /// /// This example shows how to build a multi-pattern `Regex` that permits /// searching for specific patterns. 
///
    /// ```
    /// use regex_automata::{
    ///     meta::Regex,
    ///     Anchored, Match, PatternID, Input,
    /// };
    ///
    /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
    /// let mut caps = re.create_captures();
    /// let haystack = "foo123";
    ///
    /// // Since we are using the default leftmost-first match and both
    /// // patterns match at the same starting position, only the first pattern
    /// // will be returned in this case when doing a search for any of the
    /// // patterns.
    /// let expected = Some(Match::must(0, 0..6));
    /// re.search_captures(&Input::new(haystack), &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// // But if we want to check whether some other pattern matches, then we
    /// // can provide its pattern ID.
    /// let expected = Some(Match::must(1, 0..6));
    /// let input = Input::new(haystack)
    ///     .anchored(Anchored::Pattern(PatternID::must(1)));
    /// re.search_captures(&input, &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: specifying the bounds of a search
    ///
    /// This example shows how providing the bounds of a search can produce
    /// different results than simply sub-slicing the haystack.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{meta::Regex, Match, Input};
    ///
    /// let re = Regex::new(r"\b[0-9]{3}\b")?;
    /// let mut caps = re.create_captures();
    /// let haystack = "foo123bar";
    ///
    /// // Since we sub-slice the haystack, the search doesn't know about
    /// // the larger context and assumes that `123` is surrounded by word
    /// // boundaries. And of course, the match position is reported relative
    /// // to the sub-slice as well, which means we get `0..3` instead of
    /// // `3..6`.
    /// let expected = Some(Match::must(0, 0..3));
    /// let input = Input::new(&haystack[3..6]);
    /// re.search_captures(&input, &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// // But if we provide the bounds of the search within the context of the
    /// // entire haystack, then the search can take the surrounding context
    /// // into account. (And if we did find a match, it would be reported
    /// // as a valid offset into `haystack` instead of its sub-slice.)
    /// let expected = None;
    /// let input = Input::new(haystack).range(3..6);
    /// re.search_captures(&input, &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn search_captures(&self, input: &Input<'_>, caps: &mut Captures) {
        // Mark `caps` as "no match" before searching, then record whichever
        // pattern (if any) the slot search reports.
        caps.set_pattern(None);
        let pid = self.search_slots(input, caps.slots_mut());
        caps.set_pattern(pid);
    }

    /// Executes a leftmost forward search and writes the spans of capturing
    /// groups that participated in a match into the provided `slots`, and
    /// returns the matching pattern ID. The contents of the slots for patterns
    /// other than the matching pattern are unspecified. If no match was found,
    /// then `None` is returned and the contents of `slots` is unspecified.
    ///
    /// This is like [`Regex::search_captures`], but it accepts a raw slots
    /// slice instead of a `Captures` value. This is useful in contexts where
    /// you don't want or need to allocate a `Captures`.
    ///
    /// It is legal to pass _any_ number of slots to this routine. If the regex
    /// engine would otherwise write a slot offset that doesn't fit in the
    /// provided slice, then it is simply skipped. In general though, there are
    /// usually three slice lengths you might want to use:
    ///
    /// * An empty slice, if you only care about which pattern matched.
    /// * A slice with [`pattern_len() * 2`](Regex::pattern_len) slots, if you
    /// only care about the overall match spans for each matching pattern.
    /// * A slice with
    /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which
    /// permits recording match offsets for every capturing group in every
    /// pattern.
    ///
    /// # Example
    ///
    /// This example shows how to find the overall match offsets in a
    /// multi-pattern search without allocating a `Captures` value. Indeed, we
    /// can put our slots right on the stack.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{meta::Regex, PatternID, Input};
    ///
    /// let re = Regex::new_many(&[
    ///     r"\pL+",
    ///     r"\d+",
    /// ])?;
    /// let input = Input::new("!@#123");
    ///
    /// // We only care about the overall match offsets here, so we just
    /// // allocate two slots for each pattern. Each slot records the start
    /// // and end of the match.
    /// let mut slots = [None; 4];
    /// let pid = re.search_slots(&input, &mut slots);
    /// assert_eq!(Some(PatternID::must(1)), pid);
    ///
    /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
    /// // See 'GroupInfo' for more details on the mapping between groups and
    /// // slot indices.
    /// let slot_start = pid.unwrap().as_usize() * 2;
    /// let slot_end = slot_start + 1;
    /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get()));
    /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get()));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn search_slots(
        &self,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if self.imp.info.is_impossible(input) {
            return None;
        }
        let mut guard = self.pool.get();
        let result = self.imp.strat.search_slots(&mut guard, input, slots);
        // See 'Regex::search' for why we put the guard back explicitly.
        PoolGuard::put(guard);
        result
    }

    /// Writes the set of patterns that match anywhere in the given search
    /// configuration to `patset`. If multiple patterns match at the same
    /// position and this `Regex` was configured with [`MatchKind::All`]
    /// semantics, then all matching patterns are written to the given set.
///
    /// Unless all of the patterns in this `Regex` are anchored, then generally
    /// speaking, this will scan the entire haystack.
    ///
    /// This search routine *does not* clear the pattern set. This gives some
    /// flexibility to the caller (e.g., running multiple searches with the
    /// same pattern set), but does make the API bug-prone if you're reusing
    /// the same pattern set for multiple searches but intended them to be
    /// independent.
    ///
    /// If a pattern ID matched but the given `PatternSet` does not have
    /// sufficient capacity to store it, then it is not inserted and silently
    /// dropped.
    ///
    /// # Example
    ///
    /// This example shows how to find all matching patterns in a haystack,
    /// even when some patterns match at the same position as other patterns.
    /// It is important that we configure the `Regex` with [`MatchKind::All`]
    /// semantics here, or else overlapping matches will not be reported.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet};
    ///
    /// let patterns = &[
    ///     r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar",
    /// ];
    /// let re = Regex::builder()
    ///     .configure(Regex::config().match_kind(MatchKind::All))
    ///     .build_many(patterns)?;
    ///
    /// let input = Input::new("foobar");
    /// let mut patset = PatternSet::new(re.pattern_len());
    /// re.which_overlapping_matches(&input, &mut patset);
    /// let expected = vec![0, 2, 3, 4, 6];
    /// let got: Vec<usize> = patset.iter().map(|p| p.as_usize()).collect();
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn which_overlapping_matches(
        &self,
        input: &Input<'_>,
        patset: &mut PatternSet,
    ) {
        if self.imp.info.is_impossible(input) {
            return;
        }
        let mut guard = self.pool.get();
        // The strategy reports matches through `patset`; there is no return
        // value to carry past the point where the guard is put back.
        self.imp.strat.which_overlapping_matches(&mut guard, input, patset);
        // See 'Regex::search' for why we put the guard back explicitly.
        PoolGuard::put(guard);
    }
}

/// Lower level search routines that give more control, and require the caller
/// to provide an explicit [`Cache`] parameter.
impl Regex {
    /// This is like [`Regex::search`], but requires the caller to
    /// explicitly pass a [`Cache`].
    ///
    /// # Why pass a `Cache` explicitly?
    ///
    /// Passing a `Cache` explicitly will bypass the use of an internal memory
    /// pool used by `Regex` to get a `Cache` for a search. The use of this
    /// pool can be slower in some cases when a `Regex` is used from multiple
    /// threads simultaneously. Typically, performance only becomes an issue
    /// when there is heavy contention, which in turn usually only occurs
    /// when each thread's primary unit of work is a regex search on a small
    /// haystack.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Input, Match};
    ///
    /// let re = Regex::new(r"Samwise|Sam")?;
    /// let mut cache = re.create_cache();
    /// let input = Input::new(
    ///     "one of the chief characters, Samwise the Brave",
    /// );
    /// assert_eq!(
    ///     Some(Match::must(0, 29..36)),
    ///     re.search_with(&mut cache, &input),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn search_with(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<Match> {
        if self.imp.info.is_impossible(input) {
            return None;
        }
        self.imp.strat.search(cache, input)
    }

    /// This is like [`Regex::search_half`], but requires the caller to
    /// explicitly pass a [`Cache`].
    ///
    /// # Why pass a `Cache` explicitly?
    ///
    /// Passing a `Cache` explicitly will bypass the use of an internal memory
    /// pool used by `Regex` to get a `Cache` for a search. The use of this
    /// pool can be slower in some cases when a `Regex` is used from multiple
    /// threads simultaneously. Typically, performance only becomes an issue
    /// when there is heavy contention, which in turn usually only occurs
    /// when each thread's primary unit of work is a regex search on a small
    /// haystack.
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Input, HalfMatch};
    ///
    /// let re = Regex::new(r"Samwise|Sam")?;
    /// let mut cache = re.create_cache();
    /// let input = Input::new(
    ///     "one of the chief characters, Samwise the Brave",
    /// );
    /// assert_eq!(
    ///     Some(HalfMatch::must(0, 36)),
    ///     re.search_half_with(&mut cache, &input),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn search_half_with(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
    ) -> Option<HalfMatch> {
        if self.imp.info.is_impossible(input) {
            return None;
        }
        self.imp.strat.search_half(cache, input)
    }

    /// This is like [`Regex::search_captures`], but requires the caller to
    /// explicitly pass a [`Cache`].
    ///
    /// # Why pass a `Cache` explicitly?
    ///
    /// Passing a `Cache` explicitly will bypass the use of an internal memory
    /// pool used by `Regex` to get a `Cache` for a search. The use of this
    /// pool can be slower in some cases when a `Regex` is used from multiple
    /// threads simultaneously. Typically, performance only becomes an issue
    /// when there is heavy contention, which in turn usually only occurs
    /// when each thread's primary unit of work is a regex search on a small
    /// haystack.
    ///
    /// # Example: specific pattern search
    ///
    /// This example shows how to build a multi-pattern `Regex` that permits
    /// searching for specific patterns.
    ///
    /// ```
    /// use regex_automata::{
    ///     meta::Regex,
    ///     Anchored, Match, PatternID, Input,
    /// };
    ///
    /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let haystack = "foo123";
    ///
    /// // Since we are using the default leftmost-first match and both
    /// // patterns match at the same starting position, only the first pattern
    /// // will be returned in this case when doing a search for any of the
    /// // patterns.
    /// let expected = Some(Match::must(0, 0..6));
    /// re.search_captures_with(&mut cache, &Input::new(haystack), &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// // But if we want to check whether some other pattern matches, then we
    /// // can provide its pattern ID.
    /// let expected = Some(Match::must(1, 0..6));
    /// let input = Input::new(haystack)
    ///     .anchored(Anchored::Pattern(PatternID::must(1)));
    /// re.search_captures_with(&mut cache, &input, &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: specifying the bounds of a search
    ///
    /// This example shows how providing the bounds of a search can produce
    /// different results than simply sub-slicing the haystack.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{meta::Regex, Match, Input};
    ///
    /// let re = Regex::new(r"\b[0-9]{3}\b")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let haystack = "foo123bar";
    ///
    /// // Since we sub-slice the haystack, the search doesn't know about
    /// // the larger context and assumes that `123` is surrounded by word
    /// // boundaries. And of course, the match position is reported relative
    /// // to the sub-slice as well, which means we get `0..3` instead of
    /// // `3..6`.
    /// let expected = Some(Match::must(0, 0..3));
    /// let input = Input::new(&haystack[3..6]);
    /// re.search_captures_with(&mut cache, &input, &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// // But if we provide the bounds of the search within the context of the
    /// // entire haystack, then the search can take the surrounding context
    /// // into account. (And if we did find a match, it would be reported
    /// // as a valid offset into `haystack` instead of its sub-slice.)
    /// let expected = None;
    /// let input = Input::new(haystack).range(3..6);
    /// re.search_captures_with(&mut cache, &input, &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn search_captures_with(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        caps: &mut Captures,
    ) {
        // Mark `caps` as "no match" before searching, then record whichever
        // pattern (if any) the slot search reports.
        caps.set_pattern(None);
        let pid = self.search_slots_with(cache, input, caps.slots_mut());
        caps.set_pattern(pid);
    }

    /// This is like [`Regex::search_slots`], but requires the caller to
    /// explicitly pass a [`Cache`].
    ///
    /// # Why pass a `Cache` explicitly?
    ///
    /// Passing a `Cache` explicitly will bypass the use of an internal memory
    /// pool used by `Regex` to get a `Cache` for a search. The use of this
    /// pool can be slower in some cases when a `Regex` is used from multiple
    /// threads simultaneously. Typically, performance only becomes an issue
    /// when there is heavy contention, which in turn usually only occurs
    /// when each thread's primary unit of work is a regex search on a small
    /// haystack.
    ///
    /// # Example
    ///
    /// This example shows how to find the overall match offsets in a
    /// multi-pattern search without allocating a `Captures` value. Indeed, we
    /// can put our slots right on the stack.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{meta::Regex, PatternID, Input};
    ///
    /// let re = Regex::new_many(&[
    ///     r"\pL+",
    ///     r"\d+",
    /// ])?;
    /// let mut cache = re.create_cache();
    /// let input = Input::new("!@#123");
    ///
    /// // We only care about the overall match offsets here, so we just
    /// // allocate two slots for each pattern. Each slot records the start
    /// // and end of the match.
    /// let mut slots = [None; 4];
    /// let pid = re.search_slots_with(&mut cache, &input, &mut slots);
    /// assert_eq!(Some(PatternID::must(1)), pid);
    ///
    /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'.
/// // See 'GroupInfo' for more details on the mapping between groups and /// // slot indices. /// let slot_start = pid.unwrap().as_usize() * 2; /// let slot_end = slot_start + 1; /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_slots_with( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { if self.imp.info.is_impossible(input) { return None; } self.imp.strat.search_slots(cache, input, slots) } /// This is like [`Regex::which_overlapping_matches`], but requires the /// caller to explicitly pass a [`Cache`]. /// /// Passing a `Cache` explicitly will bypass the use of an internal memory /// pool used by `Regex` to get a `Cache` for a search. The use of this /// pool can be slower in some cases when a `Regex` is used from multiple /// threads simultaneously. Typically, performance only becomes an issue /// when there is heavy contention, which in turn usually only occurs /// when each thread's primary unit of work is a regex search on a small /// haystack. /// /// # Why pass a `Cache` explicitly? 
/// /// # Example /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet}; /// /// let patterns = &[ /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", /// ]; /// let re = Regex::builder() /// .configure(Regex::config().match_kind(MatchKind::All)) /// .build_many(patterns)?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("foobar"); /// let mut patset = PatternSet::new(re.pattern_len()); /// re.which_overlapping_matches_with(&mut cache, &input, &mut patset); /// let expected = vec![0, 2, 3, 4, 6]; /// let got: Vec = patset.iter().map(|p| p.as_usize()).collect(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn which_overlapping_matches_with( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { if self.imp.info.is_impossible(input) { return; } self.imp.strat.which_overlapping_matches(cache, input, patset) } } /// Various non-search routines for querying properties of a `Regex` and /// convenience routines for creating [`Captures`] and [`Cache`] values. impl Regex { /// Creates a new object for recording capture group offsets. This is used /// in search APIs like [`Regex::captures`] and [`Regex::search_captures`]. /// /// This is a convenience routine for /// `Captures::all(re.group_info().clone())`. Callers may build other types /// of `Captures` values that record less information (and thus require /// less work from the regex engine) using [`Captures::matches`] and /// [`Captures::empty`]. /// /// # Example /// /// This shows some alternatives to [`Regex::create_captures`]: /// /// ``` /// use regex_automata::{ /// meta::Regex, /// util::captures::Captures, /// Match, PatternID, Span, /// }; /// /// let re = Regex::new(r"(?[A-Z][a-z]+) (?[A-Z][a-z]+)")?; /// /// // This is equivalent to Regex::create_captures. It stores matching /// // offsets for all groups in the regex. 
/// let mut all = Captures::all(re.group_info().clone()); /// re.captures("Bruce Springsteen", &mut all); /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match()); /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first")); /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last")); /// /// // In this version, we only care about the implicit groups, which /// // means offsets for the explicit groups will be unavailable. It can /// // sometimes be faster to ask for fewer groups, since the underlying /// // regex engine needs to do less work to keep track of them. /// let mut matches = Captures::matches(re.group_info().clone()); /// re.captures("Bruce Springsteen", &mut matches); /// // We still get the overall match info. /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match()); /// // But now the explicit groups are unavailable. /// assert_eq!(None, matches.get_group_by_name("first")); /// assert_eq!(None, matches.get_group_by_name("last")); /// /// // Finally, in this version, we don't ask to keep track of offsets for /// // *any* groups. All we get back is whether a match occurred, and if /// // so, the ID of the pattern that matched. /// let mut empty = Captures::empty(re.group_info().clone()); /// re.captures("Bruce Springsteen", &mut empty); /// // it's a match! /// assert!(empty.is_match()); /// // for pattern ID 0 /// assert_eq!(Some(PatternID::ZERO), empty.pattern()); /// // Match offsets are unavailable. /// assert_eq!(None, empty.get_match()); /// // And of course, explicit groups are unavailable too. /// assert_eq!(None, empty.get_group_by_name("first")); /// assert_eq!(None, empty.get_group_by_name("last")); /// /// # Ok::<(), Box>(()) /// ``` pub fn create_captures(&self) -> Captures { Captures::all(self.group_info().clone()) } /// Creates a new cache for use with lower level search APIs like /// [`Regex::search_with`]. /// /// The cache returned should only be used for searches for this `Regex`. 
/// If you want to reuse the cache for another `Regex`, then you must call /// [`Cache::reset`] with that `Regex`. /// /// This is a convenience routine for [`Cache::new`]. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, Match}; /// /// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("crazy janey and her mission man"); /// assert_eq!( /// Some(Match::must(0, 20..31)), /// re.search_with(&mut cache, &input), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn create_cache(&self) -> Cache { self.imp.strat.create_cache() } /// Returns the total number of patterns in this regex. /// /// The standard [`Regex::new`] constructor always results in a `Regex` /// with a single pattern, but [`Regex::new_many`] permits building a /// multi-pattern regex. /// /// A `Regex` guarantees that the maximum possible `PatternID` returned in /// any match is `Regex::pattern_len() - 1`. In the case where the number /// of patterns is `0`, a match is impossible. /// /// # Example /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"(?m)^[a-z]$")?; /// assert_eq!(1, re.pattern_len()); /// /// let re = Regex::new_many::<&str>(&[])?; /// assert_eq!(0, re.pattern_len()); /// /// let re = Regex::new_many(&["a", "b", "c"])?; /// assert_eq!(3, re.pattern_len()); /// /// # Ok::<(), Box>(()) /// ``` pub fn pattern_len(&self) -> usize { self.imp.info.pattern_len() } /// Returns the total number of capturing groups. /// /// This includes the implicit capturing group corresponding to the /// entire match. Therefore, the minimum value returned is `1`. /// /// # Example /// /// This shows a few patterns and how many capture groups they have. 
/// /// ``` /// use regex_automata::meta::Regex; /// /// let len = |pattern| { /// Regex::new(pattern).map(|re| re.captures_len()) /// }; /// /// assert_eq!(1, len("a")?); /// assert_eq!(2, len("(a)")?); /// assert_eq!(3, len("(a)|(b)")?); /// assert_eq!(5, len("(a)(b)|(c)(d)")?); /// assert_eq!(2, len("(a)|b")?); /// assert_eq!(2, len("a|(b)")?); /// assert_eq!(2, len("(b)*")?); /// assert_eq!(2, len("(b)+")?); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: multiple patterns /// /// This routine also works for multiple patterns. The total number is /// the sum of the capture groups of each pattern. /// /// ``` /// use regex_automata::meta::Regex; /// /// let len = |patterns| { /// Regex::new_many(patterns).map(|re| re.captures_len()) /// }; /// /// assert_eq!(2, len(&["a", "b"])?); /// assert_eq!(4, len(&["(a)", "(b)"])?); /// assert_eq!(6, len(&["(a)|(b)", "(c)|(d)"])?); /// assert_eq!(8, len(&["(a)(b)|(c)(d)", "(x)(y)"])?); /// assert_eq!(3, len(&["(a)", "b"])?); /// assert_eq!(3, len(&["a", "(b)"])?); /// assert_eq!(4, len(&["(a)", "(b)*"])?); /// assert_eq!(4, len(&["(a)+", "(b)+"])?); /// /// # Ok::<(), Box>(()) /// ``` pub fn captures_len(&self) -> usize { self.imp .info .props_union() .explicit_captures_len() .saturating_add(self.pattern_len()) } /// Returns the total number of capturing groups that appear in every /// possible match. /// /// If the number of capture groups can vary depending on the match, then /// this returns `None`. That is, a value is only returned when the number /// of matching groups is invariant or "static." /// /// Note that like [`Regex::captures_len`], this **does** include the /// implicit capturing group corresponding to the entire match. Therefore, /// when a non-None value is returned, it is guaranteed to be at least `1`. /// Stated differently, a return value of `Some(0)` is impossible. 
///
    /// # Example
    ///
    /// This shows a few cases where a static number of capture groups is
    /// available and a few cases where it is not.
    ///
    /// ```
    /// use regex_automata::meta::Regex;
    ///
    /// let len = |pattern| {
    ///     Regex::new(pattern).map(|re| re.static_captures_len())
    /// };
    ///
    /// assert_eq!(Some(1), len("a")?);
    /// assert_eq!(Some(2), len("(a)")?);
    /// assert_eq!(Some(2), len("(a)|(b)")?);
    /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
    /// assert_eq!(None, len("(a)|b")?);
    /// assert_eq!(None, len("a|(b)")?);
    /// assert_eq!(None, len("(b)*")?);
    /// assert_eq!(Some(2), len("(b)+")?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: multiple patterns
    ///
    /// This property extends to regexes with multiple patterns as well. In
    /// order for there to be a static number of capture groups in this case,
    /// every pattern must have the same static number.
    ///
    /// ```
    /// use regex_automata::meta::Regex;
    ///
    /// let len = |patterns| {
    ///     Regex::new_many(patterns).map(|re| re.static_captures_len())
    /// };
    ///
    /// assert_eq!(Some(1), len(&["a", "b"])?);
    /// assert_eq!(Some(2), len(&["(a)", "(b)"])?);
    /// assert_eq!(Some(2), len(&["(a)|(b)", "(c)|(d)"])?);
    /// assert_eq!(Some(3), len(&["(a)(b)|(c)(d)", "(x)(y)"])?);
    /// assert_eq!(None, len(&["(a)", "b"])?);
    /// assert_eq!(None, len(&["a", "(b)"])?);
    /// assert_eq!(None, len(&["(a)", "(b)*"])?);
    /// assert_eq!(Some(2), len(&["(a)+", "(b)+"])?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn static_captures_len(&self) -> Option<usize> {
        self.imp
            .info
            .props_union()
            .static_explicit_captures_len()
            // Account for the implicit group covering the entire match.
            .map(|len| len.saturating_add(1))
    }

    /// Return information about the capture groups in this `Regex`.
    ///
    /// A `GroupInfo` is an immutable object that can be cheaply cloned. It
    /// is responsible for maintaining a mapping between the capture groups
    /// in the concrete syntax of zero or more regex patterns and their
    /// internal representation used by some of the regex matchers.
It is also /// responsible for maintaining a mapping between the name of each group /// (if one exists) and its corresponding group index. /// /// A `GroupInfo` is ultimately what is used to build a [`Captures`] value, /// which is some mutable space where group offsets are stored as a result /// of a search. /// /// # Example /// /// This shows some alternatives to [`Regex::create_captures`]: /// /// ``` /// use regex_automata::{ /// meta::Regex, /// util::captures::Captures, /// Match, PatternID, Span, /// }; /// /// let re = Regex::new(r"(?[A-Z][a-z]+) (?[A-Z][a-z]+)")?; /// /// // This is equivalent to Regex::create_captures. It stores matching /// // offsets for all groups in the regex. /// let mut all = Captures::all(re.group_info().clone()); /// re.captures("Bruce Springsteen", &mut all); /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match()); /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first")); /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last")); /// /// // In this version, we only care about the implicit groups, which /// // means offsets for the explicit groups will be unavailable. It can /// // sometimes be faster to ask for fewer groups, since the underlying /// // regex engine needs to do less work to keep track of them. /// let mut matches = Captures::matches(re.group_info().clone()); /// re.captures("Bruce Springsteen", &mut matches); /// // We still get the overall match info. /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match()); /// // But now the explicit groups are unavailable. /// assert_eq!(None, matches.get_group_by_name("first")); /// assert_eq!(None, matches.get_group_by_name("last")); /// /// // Finally, in this version, we don't ask to keep track of offsets for /// // *any* groups. All we get back is whether a match occurred, and if /// // so, the ID of the pattern that matched. 
/// let mut empty = Captures::empty(re.group_info().clone());
    /// re.captures("Bruce Springsteen", &mut empty);
    /// // it's a match!
    /// assert!(empty.is_match());
    /// // for pattern ID 0
    /// assert_eq!(Some(PatternID::ZERO), empty.pattern());
    /// // Match offsets are unavailable.
    /// assert_eq!(None, empty.get_match());
    /// // And of course, explicit groups are unavailable too.
    /// assert_eq!(None, empty.get_group_by_name("first"));
    /// assert_eq!(None, empty.get_group_by_name("last"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn group_info(&self) -> &GroupInfo {
        self.imp.strat.group_info()
    }

    /// Returns the configuration object used to build this `Regex`.
    ///
    /// If no configuration object was explicitly passed, then the
    /// configuration returned represents the default.
    #[inline]
    pub fn get_config(&self) -> &Config {
        self.imp.info.config()
    }

    /// Returns true if this regex has a high chance of being "accelerated."
    ///
    /// The precise meaning of "accelerated" is specifically left unspecified,
    /// but the general meaning is that the search has a high likelihood of
    /// running faster than a character-at-a-time loop inside a standard
    /// regex engine.
    ///
    /// When a regex is accelerated, it is only a *probabilistic* claim. That
    /// is, just because the regex is believed to be accelerated, that doesn't
    /// mean it will definitely execute searches very fast. Similarly, if a
    /// regex is *not* accelerated, that is also a probabilistic claim. That
    /// is, a regex for which `is_accelerated` returns `false` could still run
    /// searches more quickly than a regex for which `is_accelerated` returns
    /// `true`.
    ///
    /// Whether a regex is marked as accelerated or not is dependent on
    /// implementation details that may change in a semver compatible release.
    /// That is, a regex that is accelerated in a `x.y.1` release might not be
    /// accelerated in a `x.y.2` release.
/// /// Basically, the value of acceleration boils down to a hedge: a hodge /// podge of internal heuristics combine to make a probabilistic guess /// that this regex search may run "fast." The value in knowing this from /// a caller's perspective is that it may act as a signal that no further /// work should be done to accelerate a search. For example, a grep-like /// tool might try to do some extra work extracting literals from a regex /// to create its own heuristic acceleration strategies. But it might /// choose to defer to this crate's acceleration strategy if one exists. /// This routine permits querying whether such a strategy is active for a /// particular regex. /// /// # Example /// /// ``` /// use regex_automata::meta::Regex; /// /// // A simple literal is very likely to be accelerated. /// let re = Regex::new(r"foo")?; /// assert!(re.is_accelerated()); /// /// // A regex with no literals is likely to not be accelerated. /// let re = Regex::new(r"\w")?; /// assert!(!re.is_accelerated()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn is_accelerated(&self) -> bool { self.imp.strat.is_accelerated() } /// Return the total approximate heap memory, in bytes, used by this `Regex`. /// /// Note that currently, there is no high level configuration for setting /// a limit on the specific value returned by this routine. Instead, the /// following routines can be used to control heap memory at a bit of a /// lower level: /// /// * [`Config::nfa_size_limit`] controls how big _any_ of the NFAs are /// allowed to be. /// * [`Config::onepass_size_limit`] controls how big the one-pass DFA is /// allowed to be. /// * [`Config::hybrid_cache_capacity`] controls how much memory the lazy /// DFA is permitted to allocate to store its transition table. /// * [`Config::dfa_size_limit`] controls how big a fully compiled DFA is /// allowed to be. 
/// * [`Config::dfa_state_limit`] controls the conditions under which the /// meta regex engine will even attempt to build a fully compiled DFA. #[inline] pub fn memory_usage(&self) -> usize { self.imp.strat.memory_usage() } } impl Clone for Regex { fn clone(&self) -> Regex { let imp = Arc::clone(&self.imp); let pool = { let strat = Arc::clone(&imp.strat); let create: CachePoolFn = Box::new(move || strat.create_cache()); Pool::new(create) }; Regex { imp, pool } } } #[derive(Clone, Debug)] pub(crate) struct RegexInfo(Arc); #[derive(Clone, Debug)] struct RegexInfoI { config: Config, props: Vec, props_union: hir::Properties, } impl RegexInfo { fn new(config: Config, hirs: &[&Hir]) -> RegexInfo { // Collect all of the properties from each of the HIRs, and also // union them into one big set of properties representing all HIRs // as if they were in one big alternation. let mut props = vec![]; for hir in hirs.iter() { props.push(hir.properties().clone()); } let props_union = hir::Properties::union(&props); RegexInfo(Arc::new(RegexInfoI { config, props, props_union })) } pub(crate) fn config(&self) -> &Config { &self.0.config } pub(crate) fn props(&self) -> &[hir::Properties] { &self.0.props } pub(crate) fn props_union(&self) -> &hir::Properties { &self.0.props_union } pub(crate) fn pattern_len(&self) -> usize { self.props().len() } pub(crate) fn memory_usage(&self) -> usize { self.props().iter().map(|p| p.memory_usage()).sum::() + self.props_union().memory_usage() } /// Returns true when the search is guaranteed to be anchored. That is, /// when a match is reported, its offset is guaranteed to correspond to /// the start of the search. /// /// This includes returning true when `input` _isn't_ anchored but the /// underlying regex is. 
#[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_anchored_start(&self, input: &Input<'_>) -> bool { input.get_anchored().is_anchored() || self.is_always_anchored_start() } /// Returns true when this regex is always anchored to the start of a /// search. And in particular, that regardless of an `Input` configuration, /// if any match is reported it must start at `0`. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_always_anchored_start(&self) -> bool { use regex_syntax::hir::Look; self.props_union().look_set_prefix().contains(Look::Start) } /// Returns true when this regex is always anchored to the end of a /// search. And in particular, that regardless of an `Input` configuration, /// if any match is reported it must end at the end of the haystack. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_always_anchored_end(&self) -> bool { use regex_syntax::hir::Look; self.props_union().look_set_suffix().contains(Look::End) } /// Returns true if and only if it is known that a match is impossible /// for the given input. This is useful for short-circuiting and avoiding /// running the regex engine if it's known no match can be reported. /// /// Note that this doesn't necessarily detect every possible case. For /// example, when `pattern_len() == 0`, a match is impossible, but that /// case is so rare that it's fine to be handled by the regex engine /// itself. That is, it's not worth the cost of adding it here in order to /// make it a little faster. The reason is that this is called for every /// search. so there is some cost to adding checks here. Arguably, some of /// the checks that are here already probably shouldn't be here... #[cfg_attr(feature = "perf-inline", inline(always))] fn is_impossible(&self, input: &Input<'_>) -> bool { // The underlying regex is anchored, so if we don't start the search // at position 0, a match is impossible, because the anchor can only // match at position 0. 
if input.start() > 0 && self.is_always_anchored_start() { return true; } // Same idea, but for the end anchor. if input.end() < input.haystack().len() && self.is_always_anchored_end() { return true; } // If the haystack is smaller than the minimum length required, then // we know there can be no match. let minlen = match self.props_union().minimum_len() { None => return false, Some(minlen) => minlen, }; if input.get_span().len() < minlen { return true; } // Same idea as minimum, but for maximum. This is trickier. We can // only apply the maximum when we know the entire span that we're // searching *has* to match according to the regex (and possibly the // input configuration). If we know there is too much for the regex // to match, we can bail early. // // I don't think we can apply the maximum otherwise unfortunately. if self.is_anchored_start(input) && self.is_always_anchored_end() { let maxlen = match self.props_union().maximum_len() { None => return false, Some(maxlen) => maxlen, }; if input.get_span().len() > maxlen { return true; } } false } } /// An iterator over all non-overlapping matches. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the `Regex` that produced this iterator. /// * `'h` represents the lifetime of the haystack being searched. /// /// This iterator can be created with the [`Regex::find_iter`] method. #[derive(Debug)] pub struct FindMatches<'r, 'h> { re: &'r Regex, cache: CachePoolGuard<'r>, it: iter::Searcher<'h>, } impl<'r, 'h> FindMatches<'r, 'h> { /// Returns the `Regex` value that created this iterator. #[inline] pub fn regex(&self) -> &'r Regex { self.re } /// Returns the current `Input` associated with this iterator. /// /// The `start` position on the given `Input` may change during iteration, /// but all other values are guaranteed to remain invariant. 
#[inline]
    pub fn input<'s>(&'s self) -> &'s Input<'h> {
        self.it.input()
    }
}

impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
    type Item = Match;

    /// Advances the underlying searcher by one `search_with` call and returns
    /// whatever it reports.
    #[inline]
    fn next(&mut self) -> Option<Match> {
        // Split `self` into its fields so the closure can borrow the cache
        // mutably while the searcher is itself borrowed mutably.
        let FindMatches { re, ref mut cache, ref mut it } = *self;
        it.advance(|input| Ok(re.search_with(cache, input)))
    }

    /// Counts the remaining matches using half-match searches only.
    #[inline]
    fn count(self) -> usize {
        // If all we care about is a count of matches, then we only need to
        // find the end position of each match. This can give us a 2x perf
        // boost in some cases, because it avoids needing to do a reverse scan
        // to find the start of a match.
        let FindMatches { re, mut cache, it } = self;
        // This does the deref for PoolGuard once instead of every iter.
        let cache = &mut *cache;
        it.into_half_matches_iter(
            |input| Ok(re.search_half_with(cache, input)),
        )
        .count()
    }
}

impl<'r, 'h> core::iter::FusedIterator for FindMatches<'r, 'h> {}

/// An iterator over all non-overlapping leftmost matches with their capturing
/// groups.
///
/// The iterator yields a [`Captures`] value until no more matches could be
/// found.
///
/// The lifetime parameters are as follows:
///
/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
/// * `'h` represents the lifetime of the haystack being searched.
///
/// This iterator can be created with the [`Regex::captures_iter`] method.
#[derive(Debug)]
pub struct CapturesMatches<'r, 'h> {
    re: &'r Regex,
    cache: CachePoolGuard<'r>,
    caps: Captures,
    it: iter::Searcher<'h>,
}

impl<'r, 'h> CapturesMatches<'r, 'h> {
    /// Returns the `Regex` value that created this iterator.
    #[inline]
    pub fn regex(&self) -> &'r Regex {
        self.re
    }

    /// Returns the current `Input` associated with this iterator.
    ///
    /// The `start` position on the given `Input` may change during iteration,
    /// but all other values are guaranteed to remain invariant.
#[inline]
    pub fn input<'s>(&'s self) -> &'s Input<'h> {
        self.it.input()
    }
}

impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
    type Item = Captures;

    /// Runs one capture search into the iterator's own `Captures` buffer and
    /// returns a clone of it when it records a match.
    #[inline]
    fn next(&mut self) -> Option<Captures> {
        // Splitting 'self' apart seems necessary to appease borrowck.
        let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } =
            *self;
        // The value returned by `advance` is deliberately dropped: the match
        // status is read back from `caps` below.
        let _ = it.advance(|input| {
            re.search_captures_with(cache, input, caps);
            Ok(caps.get_match())
        });
        if caps.is_match() {
            Some(caps.clone())
        } else {
            None
        }
    }

    /// Counts the remaining matches using half-match searches only; the
    /// `Captures` buffer is not used.
    #[inline]
    fn count(self) -> usize {
        let CapturesMatches { re, mut cache, it, .. } = self;
        // This does the deref for PoolGuard once instead of every iter.
        let cache = &mut *cache;
        it.into_half_matches_iter(
            |input| Ok(re.search_half_with(cache, input)),
        )
        .count()
    }
}

impl<'r, 'h> core::iter::FusedIterator for CapturesMatches<'r, 'h> {}

/// Yields all substrings delimited by a regular expression match.
///
/// The spans correspond to the offsets between matches.
///
/// The lifetime parameters are as follows:
///
/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
/// * `'h` represents the lifetime of the haystack being searched.
///
/// This iterator can be created with the [`Regex::split`] method.
#[derive(Debug)]
pub struct Split<'r, 'h> {
    finder: FindMatches<'r, 'h>,
    last: usize,
}

impl<'r, 'h> Split<'r, 'h> {
    /// Returns the current `Input` associated with this iterator.
    ///
    /// The `start` position on the given `Input` may change during iteration,
    /// but all other values are guaranteed to remain invariant.
#[inline]
    pub fn input<'s>(&'s self) -> &'s Input<'h> {
        self.finder.input()
    }
}

impl<'r, 'h> Iterator for Split<'r, 'h> {
    type Item = Span;

    /// Returns the span between the end of the previous match (`self.last`)
    /// and the start of the next one, or the trailing remainder once the
    /// matches are exhausted.
    fn next(&mut self) -> Option<Span> {
        match self.finder.next() {
            None => {
                let len = self.finder.it.input().haystack().len();
                if self.last > len {
                    // The trailing span was already handed out.
                    None
                } else {
                    let span = Span::from(self.last..len);
                    self.last = len + 1; // Next call will return None
                    Some(span)
                }
            }
            Some(m) => {
                let span = Span::from(self.last..m.start());
                self.last = m.end();
                Some(span)
            }
        }
    }
}

impl<'r, 'h> core::iter::FusedIterator for Split<'r, 'h> {}

/// Yields at most `N` spans delimited by a regular expression match.
///
/// The spans correspond to the offsets between matches. The last span will be
/// whatever remains after splitting.
///
/// The lifetime parameters are as follows:
///
/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
/// * `'h` represents the lifetime of the haystack being searched.
///
/// This iterator can be created with the [`Regex::splitn`] method.
#[derive(Debug)]
pub struct SplitN<'r, 'h> {
    splits: Split<'r, 'h>,
    limit: usize,
}

impl<'r, 'h> SplitN<'r, 'h> {
    /// Returns the current `Input` associated with this iterator.
    ///
    /// The `start` position on the given `Input` may change during iteration,
    /// but all other values are guaranteed to remain invariant.
    #[inline]
    pub fn input<'s>(&'s self) -> &'s Input<'h> {
        self.splits.input()
    }
}

impl<'r, 'h> Iterator for SplitN<'r, 'h> {
    type Item = Span;

    /// Delegates to the inner `Split` while more than one span may still be
    /// yielded; the final permitted span covers everything from the end of
    /// the last match to the end of the haystack.
    fn next(&mut self) -> Option<Span> {
        if self.limit == 0 {
            return None;
        }

        self.limit -= 1;
        if self.limit > 0 {
            return self.splits.next();
        }

        let len = self.splits.finder.it.input().haystack().len();
        if self.splits.last > len {
            // We've already returned all substrings.
            None
        } else {
            // self.limit == 0, so future calls will return None immediately
            Some(Span::from(self.splits.last..len))
        }
    }

    /// At most `self.limit` more spans can be yielded.
    fn size_hint(&self) -> (usize, Option<usize>) {
        (0, Some(self.limit))
    }
}

impl<'r, 'h> core::iter::FusedIterator for SplitN<'r, 'h> {}

/// Represents mutable scratch space used by regex engines during a search.
///
/// Most of the regex engines in this crate require some kind of
/// mutable state in order to execute a search. This mutable state is
/// explicitly separated from the core regex object (such as a
/// [`thompson::NFA`](crate::nfa::thompson::NFA)) so that the read-only regex
/// object can be shared across multiple threads simultaneously without any
/// synchronization. Conversely, a `Cache` must either be duplicated if using
/// the same `Regex` from multiple threads, or else there must be some kind of
/// synchronization that guarantees exclusive access while it's in use by one
/// thread.
///
/// A `Regex` attempts to do this synchronization for you by using a pool of
/// `Cache` values internally. Its size scales roughly with the number of
/// simultaneous regex searches.
///
/// For cases where one does not want to rely on a `Regex`'s internal pool,
/// lower level routines such as [`Regex::search_with`] are provided
/// that permit callers to pass a `Cache` into the search routine explicitly.
///
/// General advice is that the internal pool is often more than good enough.
/// However, it may be possible to observe the effects of its latency,
/// especially when searching many small haystacks from many threads
/// simultaneously.
///
/// Caches can be created from their corresponding `Regex` via
/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
/// that created it, or the `Regex` that was most recently used to reset it
/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
/// panics or incorrect results.
/// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, Match}; /// /// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("crazy janey and her mission man"); /// assert_eq!( /// Some(Match::must(0, 20..31)), /// re.search_with(&mut cache, &input), /// ); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Debug, Clone)] pub struct Cache { pub(crate) capmatches: Captures, pub(crate) pikevm: wrappers::PikeVMCache, pub(crate) backtrack: wrappers::BoundedBacktrackerCache, pub(crate) onepass: wrappers::OnePassCache, pub(crate) hybrid: wrappers::HybridCache, pub(crate) revhybrid: wrappers::ReverseHybridCache, } impl Cache { /// Creates a new `Cache` for use with this regex. /// /// The cache returned should only be used for searches for the given /// `Regex`. If you want to reuse the cache for another `Regex`, then you /// must call [`Cache::reset`] with that `Regex`. pub fn new(re: &Regex) -> Cache { re.create_cache() } /// Reset this cache such that it can be used for searching with the given /// `Regex` (and only that `Regex`). /// /// A cache reset permits potentially reusing memory already allocated in /// this cache with a different `Regex`. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different `Regex`. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{meta::Regex, Match, Input}; /// /// let re1 = Regex::new(r"\w")?; /// let re2 = Regex::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( /// Some(Match::must(0, 0..2)), /// re1.search_with(&mut cache, &Input::new("Δ")), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or /// // incorrect results. In order to re-purpose the cache, we must reset /// // it with the Regex we'd like to use it with. /// // /// // Similarly, after this reset, using the cache with 're1' is also not /// // allowed. 
/// cache.reset(&re2); /// assert_eq!( /// Some(Match::must(0, 0..3)), /// re2.search_with(&mut cache, &Input::new("☃")), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn reset(&mut self, re: &Regex) { re.imp.strat.reset_cache(self) } /// Returns the heap memory usage, in bytes, of this cache. /// /// This does **not** include the stack size used up by this cache. To /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { let mut bytes = 0; bytes += self.pikevm.memory_usage(); bytes += self.backtrack.memory_usage(); bytes += self.onepass.memory_usage(); bytes += self.hybrid.memory_usage(); bytes += self.revhybrid.memory_usage(); bytes } } /// An object describing the configuration of a `Regex`. /// /// This configuration only includes options for the /// non-syntax behavior of a `Regex`, and can be applied via the /// [`Builder::configure`] method. For configuring the syntax options, see /// [`util::syntax::Config`](crate::util::syntax::Config). /// /// # Example: lower the NFA size limit /// /// In some cases, the default size limit might be too big. The size limit can /// be lowered, which will prevent large regex patterns from compiling. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::meta::Regex; /// /// let result = Regex::builder() /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) /// // Not even 20KB is enough to build a single large Unicode class! /// .build(r"\pL"); /// assert!(result.is_err()); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug, Default)] pub struct Config { // As with other configuration types in this crate, we put all our knobs // in options so that we can distinguish between "default" and "not set." // This makes it possible to easily combine multiple configurations // without default values overwriting explicitly specified values. See the // 'overwrite' method. 
// // For docs on the fields below, see the corresponding method setters. match_kind: Option, utf8_empty: Option, autopre: Option, pre: Option>, which_captures: Option, nfa_size_limit: Option>, onepass_size_limit: Option>, hybrid_cache_capacity: Option, hybrid: Option, dfa: Option, dfa_size_limit: Option>, dfa_state_limit: Option>, onepass: Option, backtrack: Option, byte_classes: Option, line_terminator: Option, } impl Config { /// Create a new configuration object for a `Regex`. pub fn new() -> Config { Config::default() } /// Set the match semantics for a `Regex`. /// /// The default value is [`MatchKind::LeftmostFirst`]. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Match, MatchKind}; /// /// // By default, leftmost-first semantics are used, which /// // disambiguates matches at the same position by selecting /// // the one that corresponds earlier in the pattern. /// let re = Regex::new("sam|samwise")?; /// assert_eq!(Some(Match::must(0, 0..3)), re.find("samwise")); /// /// // But with 'all' semantics, match priority is ignored /// // and all match states are included. When coupled with /// // a leftmost search, the search will report the last /// // possible match. /// let re = Regex::builder() /// .configure(Regex::config().match_kind(MatchKind::All)) /// .build("sam|samwise")?; /// assert_eq!(Some(Match::must(0, 0..7)), re.find("samwise")); /// // Beware that this can lead to skipping matches! /// // Usually 'all' is used for anchored reverse searches /// // only, or for overlapping searches. /// assert_eq!(Some(Match::must(0, 4..11)), re.find("sam samwise")); /// /// # Ok::<(), Box>(()) /// ``` pub fn match_kind(self, kind: MatchKind) -> Config { Config { match_kind: Some(kind), ..self } } /// Toggles whether empty matches are permitted to occur between the code /// units of a UTF-8 encoded codepoint. /// /// This should generally be enabled when search a `&str` or anything that /// you otherwise know is valid UTF-8. 
It should be disabled in all other /// cases. Namely, if the haystack is not valid UTF-8 and this is enabled, /// then behavior is unspecified. /// /// By default, this is enabled. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::new("")?; /// let got: Vec = re.find_iter("☃").collect(); /// // Matches only occur at the beginning and end of the snowman. /// assert_eq!(got, vec![ /// Match::must(0, 0..0), /// Match::must(0, 3..3), /// ]); /// /// let re = Regex::builder() /// .configure(Regex::config().utf8_empty(false)) /// .build("")?; /// let got: Vec = re.find_iter("☃").collect(); /// // Matches now occur at every position! /// assert_eq!(got, vec![ /// Match::must(0, 0..0), /// Match::must(0, 1..1), /// Match::must(0, 2..2), /// Match::must(0, 3..3), /// ]); /// /// Ok::<(), Box>(()) /// ``` pub fn utf8_empty(self, yes: bool) -> Config { Config { utf8_empty: Some(yes), ..self } } /// Toggles whether automatic prefilter support is enabled. /// /// If this is disabled and [`Config::prefilter`] is not set, then the /// meta regex engine will not use any prefilters. This can sometimes /// be beneficial in cases where you know (or have measured) that the /// prefilter leads to overall worse search performance. /// /// By default, this is enabled. /// /// # Example /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::builder() /// .configure(Regex::config().auto_prefilter(false)) /// .build(r"Bruce \w+")?; /// let hay = "Hello Bruce Springsteen!"; /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay)); /// /// Ok::<(), Box>(()) /// ``` pub fn auto_prefilter(self, yes: bool) -> Config { Config { autopre: Some(yes), ..self } } /// Overrides and sets the prefilter to use inside a `Regex`. 
///
    /// This permits one to forcefully set a prefilter in cases where the
    /// caller knows better than whatever the automatic prefilter logic is
    /// capable of.
    ///
    /// By default, this is set to `None` and an automatic prefilter will be
    /// used if one could be built. (Assuming [`Config::auto_prefilter`] is
    /// enabled, which it is by default.)
    ///
    /// # Example
    ///
    /// This example shows how to set your own prefilter. In the case of a
    /// pattern like `Bruce \w+`, the automatic prefilter is likely to be
    /// constructed in a way that it will look for occurrences of `Bruce `.
    /// In most cases, this is the best choice. But in some cases, it may be
    /// the case that running `memchr` on `B` is the best choice. One can
    /// achieve that behavior by overriding the automatic prefilter logic
    /// and providing a prefilter that just matches `B`.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     meta::Regex,
    ///     util::prefilter::Prefilter,
    ///     Match, MatchKind,
    /// };
    ///
    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["B"])
    ///     .expect("a prefilter");
    /// let re = Regex::builder()
    ///     .configure(Regex::config().prefilter(Some(pre)))
    ///     .build(r"Bruce \w+")?;
    /// let hay = "Hello Bruce Springsteen!";
    /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: incorrect prefilters can lead to incorrect results!
    ///
    /// Be warned that setting an incorrect prefilter can lead to missed
    /// matches. So if you use this option, ensure your prefilter can _never_
    /// report false negatives. (A false positive is, on the other hand, quite
    /// okay and generally unavoidable.)
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     meta::Regex,
    ///     util::prefilter::Prefilter,
    ///     Match, MatchKind,
    /// };
    ///
    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Z"])
    ///     .expect("a prefilter");
    /// let re = Regex::builder()
    ///     .configure(Regex::config().prefilter(Some(pre)))
    ///     .build(r"Bruce \w+")?;
    /// let hay = "Hello Bruce Springsteen!";
    /// // Oops! No match found, but there should be one!
    /// assert_eq!(None, re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn prefilter(self, pre: Option<Prefilter>) -> Config {
        // The outer `Some` marks the option as explicitly set, even when the
        // caller passes `None` to request "no manual prefilter".
        Config { pre: Some(pre), ..self }
    }

    /// Configures what kinds of groups are compiled as "capturing" in the
    /// underlying regex engine.
    ///
    /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
    /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
    /// overhead of capture states for explicit groups.
    ///
    /// Note that another approach to avoiding the overhead of capture groups
    /// is by using non-capturing groups in the regex pattern. That is,
    /// `(?:a)` instead of `(a)`. This option is useful when you can't control
    /// the concrete syntax but know that you don't need the underlying capture
    /// states. For example, using `WhichCaptures::Implicit` will behave as if
    /// all explicit capturing groups in the pattern were non-capturing.
    ///
    /// Setting this to `WhichCaptures::None` is usually not the right thing to
    /// do. When no capture states are compiled, some regex engines (such as
    /// the `PikeVM`) won't be able to report match offsets. This will manifest
    /// as no match being found.
    ///
    /// # Example
    ///
    /// This example demonstrates how the results of capture groups can change
    /// based on this option.
First we show the default (all capture groups in /// the pattern are capturing): /// /// ``` /// use regex_automata::{meta::Regex, Match, Span}; /// /// let re = Regex::new(r"foo([0-9]+)bar")?; /// let hay = "foo123bar"; /// /// let mut caps = re.create_captures(); /// re.captures(hay, &mut caps); /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1)); /// /// Ok::<(), Box>(()) /// ``` /// /// And now we show the behavior when we only include implicit capture /// groups. In this case, we can only find the overall match span, but the /// spans of any other explicit group don't exist because they are treated /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used, /// there is no real point in using [`Regex::captures`] since it will never /// be able to report more information than [`Regex::find`].) /// /// ``` /// use regex_automata::{ /// meta::Regex, /// nfa::thompson::WhichCaptures, /// Match, /// Span, /// }; /// /// let re = Regex::builder() /// .configure(Regex::config().which_captures(WhichCaptures::Implicit)) /// .build(r"foo([0-9]+)bar")?; /// let hay = "foo123bar"; /// /// let mut caps = re.create_captures(); /// re.captures(hay, &mut caps); /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0)); /// assert_eq!(None, caps.get_group(1)); /// /// Ok::<(), Box>(()) /// ``` pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config { self.which_captures = Some(which_captures); self } /// Sets the size limit, in bytes, to enforce on the construction of every /// NFA build by the meta regex engine. /// /// Setting it to `None` disables the limit. This is not recommended if /// you're compiling untrusted patterns. /// /// Note that this limit is applied to _each_ NFA built, and if any of /// them exceed the limit, then construction will fail. This limit does /// _not_ correspond to the total memory used by all NFAs in the meta regex /// engine. 
///
    /// This defaults to some reasonable number that permits most reasonable
    /// patterns.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::meta::Regex;
    ///
    /// let result = Regex::builder()
    ///     .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10))))
    ///     // Not even 20KB is enough to build a single large Unicode class!
    ///     .build(r"\pL");
    /// assert!(result.is_err());
    ///
    /// // But notice that building such a regex with the exact same limit
    /// // can succeed depending on other aspects of the configuration. For
    /// // example, a single *forward* NFA will (at time of writing) fit into
    /// // the 20KB limit, but a *reverse* NFA of the same pattern will not.
    /// // So if one configures a meta regex such that a reverse NFA is never
    /// // needed and thus never built, then the 20KB limit will be enough for
    /// // a pattern like \pL!
    /// let result = Regex::builder()
    ///     .configure(Regex::config()
    ///         .nfa_size_limit(Some(20 * (1<<10)))
    ///         // The DFAs are the only thing that (currently) need a reverse
    ///         // NFA. So if both are disabled, the meta regex engine will
    ///         // skip building the reverse NFA. Note that this isn't an API
    ///         // guarantee. A future semver compatible version may introduce
    ///         // new use cases for a reverse NFA.
    ///         .hybrid(false)
    ///         .dfa(false)
    ///     )
    ///     // With no reverse NFA to build, 20KB is now enough for this class.
    ///     .build(r"\pL");
    /// assert!(result.is_ok());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn nfa_size_limit(self, limit: Option<usize>) -> Config {
        // Outer `Some` = explicitly configured; inner `None` = no limit.
        Config { nfa_size_limit: Some(limit), ..self }
    }

    /// Sets the size limit, in bytes, for the one-pass DFA.
    ///
    /// Setting it to `None` disables the limit. Disabling the limit is
    /// strongly discouraged when compiling untrusted patterns. Even if the
    /// patterns are trusted, it still may not be a good idea, since a one-pass
    /// DFA can use a lot of memory.
With that said, as the size of a regex /// increases, the likelihood of it being one-pass likely decreases. /// /// This defaults to some reasonable number that permits most reasonable /// one-pass patterns. /// /// # Example /// /// This shows how to set the one-pass DFA size limit. Note that since /// a one-pass DFA is an optional component of the meta regex engine, /// this size limit only impacts what is built internally and will never /// determine whether a `Regex` itself fails to build. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::meta::Regex; /// /// let result = Regex::builder() /// .configure(Regex::config().onepass_size_limit(Some(2 * (1<<20)))) /// .build(r"\pL{5}"); /// assert!(result.is_ok()); /// # Ok::<(), Box>(()) /// ``` pub fn onepass_size_limit(self, limit: Option) -> Config { Config { onepass_size_limit: Some(limit), ..self } } /// Set the cache capacity, in bytes, for the lazy DFA. /// /// The cache capacity of the lazy DFA determines approximately how much /// heap memory it is allowed to use to store its state transitions. The /// state transitions are computed at search time, and if the cache fills /// up it, it is cleared. At this point, any previously generated state /// transitions are lost and are re-generated if they're needed again. /// /// This sort of cache filling and clearing works quite well _so long as /// cache clearing happens infrequently_. If it happens too often, then the /// meta regex engine will stop using the lazy DFA and switch over to a /// different regex engine. /// /// In cases where the cache is cleared too often, it may be possible to /// give the cache more space and reduce (or eliminate) how often it is /// cleared. Similarly, sometimes a regex is so big that the lazy DFA isn't /// used at all if its cache capacity isn't big enough. /// /// The capacity set here is a _limit_ on how much memory is used. 
The /// actual memory used is only allocated as it's needed. /// /// Determining the right value for this is a little tricky and will likely /// required some profiling. Enabling the `logging` feature and setting the /// log level to `trace` will also tell you how often the cache is being /// cleared. /// /// # Example /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::meta::Regex; /// /// let result = Regex::builder() /// .configure(Regex::config().hybrid_cache_capacity(20 * (1<<20))) /// .build(r"\pL{5}"); /// assert!(result.is_ok()); /// # Ok::<(), Box>(()) /// ``` pub fn hybrid_cache_capacity(self, limit: usize) -> Config { Config { hybrid_cache_capacity: Some(limit), ..self } } /// Sets the size limit, in bytes, for heap memory used for a fully /// compiled DFA. /// /// **NOTE:** If you increase this, you'll likely also need to increase /// [`Config::dfa_state_limit`]. /// /// In contrast to the lazy DFA, building a full DFA requires computing /// all of its state transitions up front. This can be a very expensive /// process, and runs in worst case `2^n` time and space (where `n` is /// proportional to the size of the regex). However, a full DFA unlocks /// some additional optimization opportunities. /// /// Because full DFAs can be so expensive, the default limits for them are /// incredibly small. Generally speaking, if your regex is moderately big /// or if you're using Unicode features (`\w` is Unicode-aware by default /// for example), then you can expect that the meta regex engine won't even /// attempt to build a DFA for it. /// /// If this and [`Config::dfa_state_limit`] are set to `None`, then the /// meta regex will not use any sort of limits when deciding whether to /// build a DFA. This in turn makes construction of a `Regex` take /// worst case exponential time and space. Even short patterns can result /// in huge space blow ups. So it is strongly recommended to keep some kind /// of limit set! 
///
    /// The default is set to a small number that permits some simple regexes
    /// to get compiled into DFAs in reasonable time.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::meta::Regex;
    ///
    /// let result = Regex::builder()
    ///     // 100MB is much bigger than the default.
    ///     .configure(Regex::config()
    ///         .dfa_size_limit(Some(100 * (1<<20)))
    ///         // We don't care about size too much here, so just
    ///         // remove the NFA state limit altogether.
    ///         .dfa_state_limit(None))
    ///     .build(r"\pL{5}");
    /// assert!(result.is_ok());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn dfa_size_limit(self, limit: Option<usize>) -> Config {
        // Outer `Some` = explicitly configured; inner `None` = no limit.
        Config { dfa_size_limit: Some(limit), ..self }
    }

    /// Sets a limit on the total number of NFA states, beyond which, a full
    /// DFA is not attempted to be compiled.
    ///
    /// This limit works in concert with [`Config::dfa_size_limit`]. Namely,
    /// whereas `Config::dfa_size_limit` is applied by attempting to construct
    /// a DFA, this limit is used to avoid the attempt in the first place. This
    /// is useful to avoid hefty initialization costs associated with building
    /// a DFA for cases where it is obvious the DFA will ultimately be too big.
    ///
    /// By default, this is set to a very small number.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::meta::Regex;
    ///
    /// let result = Regex::builder()
    ///     .configure(Regex::config()
    ///         // Sometimes the default state limit rejects DFAs even
    ///         // if they would fit in the size limit. Here, we disable
    ///         // the check on the number of NFA states and just rely on
    ///         // the size limit.
    ///         .dfa_state_limit(None))
    ///     .build(r"(?-u)\w{30}");
    /// assert!(result.is_ok());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn dfa_state_limit(self, limit: Option<usize>) -> Config {
        // Outer `Some` = explicitly configured; inner `None` = no limit.
        Config { dfa_state_limit: Some(limit), ..self }
    }

    /// Whether to attempt to shrink the size of the alphabet for the regex
    /// pattern or not.
When enabled, the alphabet is shrunk into a set of /// equivalence classes, where every byte in the same equivalence class /// cannot discriminate between a match or non-match. /// /// **WARNING:** This is only useful for debugging DFAs. Disabling this /// does not yield any speed advantages. Indeed, disabling it can result /// in much higher memory usage. Disabling byte classes is useful for /// debugging the actual generated transitions because it lets one see the /// transitions defined on actual bytes instead of the equivalence classes. /// /// This option is enabled by default and should never be disabled unless /// one is debugging the meta regex engine's internals. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::builder() /// .configure(Regex::config().byte_classes(false)) /// .build(r"[a-z]+")?; /// let hay = "!!quux!!"; /// assert_eq!(Some(Match::must(0, 2..6)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` pub fn byte_classes(self, yes: bool) -> Config { Config { byte_classes: Some(yes), ..self } } /// Set the line terminator to be used by the `^` and `$` anchors in /// multi-line mode. /// /// This option has no effect when CRLF mode is enabled. That is, /// regardless of this setting, `(?Rm:^)` and `(?Rm:$)` will always treat /// `\r` and `\n` as line terminators (and will never match between a `\r` /// and a `\n`). /// /// By default, `\n` is the line terminator. /// /// **Warning**: This does not change the behavior of `.`. To do that, /// you'll need to configure the syntax option /// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator) /// in addition to this. Otherwise, `.` will continue to match any /// character other than `\n`. 
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, util::syntax, Match};
    ///
    /// let re = Regex::builder()
    ///     .syntax(syntax::Config::new().multi_line(true))
    ///     .configure(Regex::config().line_terminator(b'\x00'))
    ///     .build(r"^foo$")?;
    /// let hay = "\x00foo\x00";
    /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn line_terminator(self, byte: u8) -> Config {
        Config { line_terminator: Some(byte), ..self }
    }

    /// Toggle whether the hybrid NFA/DFA (also known as the "lazy DFA") should
    /// be available for use by the meta regex engine.
    ///
    /// Enabling this does not necessarily mean that the lazy DFA will
    /// definitely be used. It just means that it will be _available_ for use
    /// if the meta regex engine thinks it will be useful.
    ///
    /// When the `hybrid` crate feature is enabled, then this is enabled by
    /// default. Otherwise, if the crate feature is disabled, then this is
    /// always disabled, regardless of its setting by the caller.
    pub fn hybrid(self, yes: bool) -> Config {
        Config { hybrid: Some(yes), ..self }
    }

    /// Toggle whether a fully compiled DFA should be available for use by the
    /// meta regex engine.
    ///
    /// Enabling this does not necessarily mean that a DFA will definitely be
    /// used. It just means that it will be _available_ for use if the meta
    /// regex engine thinks it will be useful.
    ///
    /// When the `dfa-build` crate feature is enabled, then this is enabled by
    /// default. Otherwise, if the crate feature is disabled, then this is
    /// always disabled, regardless of its setting by the caller.
    pub fn dfa(self, yes: bool) -> Config {
        Config { dfa: Some(yes), ..self }
    }

    /// Toggle whether a one-pass DFA should be available for use by the meta
    /// regex engine.
    ///
    /// Enabling this does not necessarily mean that a one-pass DFA will
    /// definitely be used. It just means that it will be _available_ for
    /// use if the meta regex engine thinks it will be useful.
/// (Indeed, a
    /// one-pass DFA can only be used when the regex is one-pass. See the
    /// [`dfa::onepass`](crate::dfa::onepass) module for more details.)
    ///
    /// When the `dfa-onepass` crate feature is enabled, then this is enabled
    /// by default. Otherwise, if the crate feature is disabled, then this is
    /// always disabled, regardless of its setting by the caller.
    pub fn onepass(self, yes: bool) -> Config {
        Config { onepass: Some(yes), ..self }
    }

    /// Toggle whether a bounded backtracking regex engine should be available
    /// for use by the meta regex engine.
    ///
    /// Enabling this does not necessarily mean that a bounded backtracker will
    /// definitely be used. It just means that it will be _available_ for use
    /// if the meta regex engine thinks it will be useful.
    ///
    /// When the `nfa-backtrack` crate feature is enabled, then this is enabled
    /// by default. Otherwise, if the crate feature is disabled, then this is
    /// always disabled, regardless of its setting by the caller.
    pub fn backtrack(self, yes: bool) -> Config {
        Config { backtrack: Some(yes), ..self }
    }

    /// Returns the match kind on this configuration, as set by
    /// [`Config::match_kind`].
    ///
    /// If it was not explicitly set, then the default
    /// (`MatchKind::LeftmostFirst`) is returned.
    pub fn get_match_kind(&self) -> MatchKind {
        self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
    }

    /// Returns whether empty matches must fall on valid UTF-8 boundaries, as
    /// set by [`Config::utf8_empty`].
    ///
    /// If it was not explicitly set, then the default (`true`) is returned.
    pub fn get_utf8_empty(&self) -> bool {
        self.utf8_empty.unwrap_or(true)
    }

    /// Returns whether automatic prefilters are enabled, as set by
    /// [`Config::auto_prefilter`].
    ///
    /// If it was not explicitly set, then the default (`true`) is returned.
    pub fn get_auto_prefilter(&self) -> bool {
        self.autopre.unwrap_or(true)
    }

    /// Returns a manually set prefilter, if one was set by
    /// [`Config::prefilter`].
    ///
    /// If it was not explicitly set, then a default value is returned.
pub fn get_prefilter(&self) -> Option<&Prefilter> { self.pre.as_ref().unwrap_or(&None).as_ref() } /// Returns the capture configuration, as set by /// [`Config::which_captures`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_which_captures(&self) -> WhichCaptures { self.which_captures.unwrap_or(WhichCaptures::All) } /// Returns NFA size limit, as set by [`Config::nfa_size_limit`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_nfa_size_limit(&self) -> Option { self.nfa_size_limit.unwrap_or(Some(10 * (1 << 20))) } /// Returns one-pass DFA size limit, as set by /// [`Config::onepass_size_limit`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_onepass_size_limit(&self) -> Option { self.onepass_size_limit.unwrap_or(Some(1 * (1 << 20))) } /// Returns hybrid NFA/DFA cache capacity, as set by /// [`Config::hybrid_cache_capacity`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_hybrid_cache_capacity(&self) -> usize { self.hybrid_cache_capacity.unwrap_or(2 * (1 << 20)) } /// Returns DFA size limit, as set by [`Config::dfa_size_limit`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_dfa_size_limit(&self) -> Option { // The default for this is VERY small because building a full DFA is // ridiculously costly. But for regexes that are very small, it can be // beneficial to use a full DFA. In particular, a full DFA can enable // additional optimizations via something called "accelerated" states. // Namely, when there's a state with only a few outgoing transitions, // we can temporary suspend walking the transition table and use memchr // for just those outgoing transitions to skip ahead very quickly. // // Generally speaking, if Unicode is enabled in your regex and you're // using some kind of Unicode feature, then it's going to blow this // size limit. 
Moreover, Unicode tends to defeat the "accelerated" // state optimization too, so it's a double whammy. // // We also use a limit on the number of NFA states to avoid even // starting the DFA construction process. Namely, DFA construction // itself could make lots of initial allocs proportional to the size // of the NFA, and if the NFA is large, it doesn't make sense to pay // that cost if we know it's likely to be blown by a large margin. self.dfa_size_limit.unwrap_or(Some(40 * (1 << 10))) } /// Returns DFA size limit in terms of the number of states in the NFA, as /// set by [`Config::dfa_state_limit`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_dfa_state_limit(&self) -> Option { // Again, as with the size limit, we keep this very small. self.dfa_state_limit.unwrap_or(Some(30)) } /// Returns whether byte classes are enabled, as set by /// [`Config::byte_classes`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_byte_classes(&self) -> bool { self.byte_classes.unwrap_or(true) } /// Returns the line terminator for this configuration, as set by /// [`Config::line_terminator`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_line_terminator(&self) -> u8 { self.line_terminator.unwrap_or(b'\n') } /// Returns whether the hybrid NFA/DFA regex engine may be used, as set by /// [`Config::hybrid`]. /// /// If it was not explicitly set, then a default value is returned. pub fn get_hybrid(&self) -> bool { #[cfg(feature = "hybrid")] { self.hybrid.unwrap_or(true) } #[cfg(not(feature = "hybrid"))] { false } } /// Returns whether the DFA regex engine may be used, as set by /// [`Config::dfa`]. /// /// If it was not explicitly set, then a default value is returned. 
pub fn get_dfa(&self) -> bool {
        // Without the `dfa-build` feature the setting is ignored entirely.
        #[cfg(feature = "dfa-build")]
        {
            self.dfa.unwrap_or(true)
        }
        #[cfg(not(feature = "dfa-build"))]
        {
            false
        }
    }

    /// Returns whether the one-pass DFA regex engine may be used, as set by
    /// [`Config::onepass`].
    ///
    /// If it was not explicitly set, then a default value is returned. When
    /// the `dfa-onepass` crate feature is disabled, this is always `false`.
    pub fn get_onepass(&self) -> bool {
        #[cfg(feature = "dfa-onepass")]
        {
            self.onepass.unwrap_or(true)
        }
        #[cfg(not(feature = "dfa-onepass"))]
        {
            false
        }
    }

    /// Returns whether the bounded backtracking regex engine may be used, as
    /// set by [`Config::backtrack`].
    ///
    /// If it was not explicitly set, then a default value is returned. When
    /// the `nfa-backtrack` crate feature is disabled, this is always `false`.
    pub fn get_backtrack(&self) -> bool {
        #[cfg(feature = "nfa-backtrack")]
        {
            self.backtrack.unwrap_or(true)
        }
        #[cfg(not(feature = "nfa-backtrack"))]
        {
            false
        }
    }

    /// Overwrite the default configuration such that the options in `o` are
    /// always used. If an option in `o` is not set, then the corresponding
    /// option in `self` is used. If it's not set in `self` either, then it
    /// remains not set.
    pub(crate) fn overwrite(&self, o: Config) -> Config {
        Config {
            match_kind: o.match_kind.or(self.match_kind),
            utf8_empty: o.utf8_empty.or(self.utf8_empty),
            autopre: o.autopre.or(self.autopre),
            // The prefilter is cloned lazily, i.e. only when `o` did not
            // set one.
            pre: o.pre.or_else(|| self.pre.clone()),
            which_captures: o.which_captures.or(self.which_captures),
            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
            onepass_size_limit: o
                .onepass_size_limit
                .or(self.onepass_size_limit),
            hybrid_cache_capacity: o
                .hybrid_cache_capacity
                .or(self.hybrid_cache_capacity),
            hybrid: o.hybrid.or(self.hybrid),
            dfa: o.dfa.or(self.dfa),
            dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
            dfa_state_limit: o.dfa_state_limit.or(self.dfa_state_limit),
            onepass: o.onepass.or(self.onepass),
            backtrack: o.backtrack.or(self.backtrack),
            byte_classes: o.byte_classes.or(self.byte_classes),
            line_terminator: o.line_terminator.or(self.line_terminator),
        }
    }
}

/// A builder for configuring and constructing a `Regex`.
/// /// The builder permits configuring two different aspects of a `Regex`: /// /// * [`Builder::configure`] will set high-level configuration options as /// described by a [`Config`]. /// * [`Builder::syntax`] will set the syntax level configuration options /// as described by a [`util::syntax::Config`](crate::util::syntax::Config). /// This only applies when building a `Regex` from pattern strings. /// /// Once configured, the builder can then be used to construct a `Regex` from /// one of 4 different inputs: /// /// * [`Builder::build`] creates a regex from a single pattern string. /// * [`Builder::build_many`] creates a regex from many pattern strings. /// * [`Builder::build_from_hir`] creates a regex from a /// [`regex-syntax::Hir`](Hir) expression. /// * [`Builder::build_many_from_hir`] creates a regex from many /// [`regex-syntax::Hir`](Hir) expressions. /// /// The latter two methods in particular provide a way to construct a fully /// feature regular expression matcher directly from an `Hir` expression /// without having to first convert it to a string. (This is in contrast to the /// top-level `regex` crate which intentionally provides no such API in order /// to avoid making `regex-syntax` a public dependency.) /// /// As a convenience, this builder may be created via [`Regex::builder`], which /// may help avoid an extra import. /// /// # Example: change the line terminator /// /// This example shows how to enable multi-line mode by default and change the /// line terminator to the NUL byte: /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().multi_line(true)) /// .configure(Regex::config().line_terminator(b'\x00')) /// .build(r"^foo$")?; /// let hay = "\x00foo\x00"; /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: disable UTF-8 requirement /// /// By default, regex patterns are required to match UTF-8. 
This includes /// regex patterns that can produce matches of length zero. In the case of an /// empty match, by default, matches will not appear between the code units of /// a UTF-8 encoded codepoint. /// /// However, it can be useful to disable this requirement, particularly if /// you're searching things like `&[u8]` that are not known to be valid UTF-8. /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let mut builder = Regex::builder(); /// // Disables the requirement that non-empty matches match UTF-8. /// builder.syntax(syntax::Config::new().utf8(false)); /// // Disables the requirement that empty matches match UTF-8 boundaries. /// builder.configure(Regex::config().utf8_empty(false)); /// /// // We can match raw bytes via \xZZ syntax, but we need to disable /// // Unicode mode to do that. We could disable it everywhere, or just /// // selectively, as shown here. /// let re = builder.build(r"(?-u:\xFF)foo(?-u:\xFF)")?; /// let hay = b"\xFFfoo\xFF"; /// assert_eq!(Some(Match::must(0, 0..5)), re.find(hay)); /// /// // We can also match between code units. /// let re = builder.build(r"")?; /// let hay = "☃"; /// assert_eq!(re.find_iter(hay).collect::>(), vec![ /// Match::must(0, 0..0), /// Match::must(0, 1..1), /// Match::must(0, 2..2), /// Match::must(0, 3..3), /// ]); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { config: Config, ast: ast::parse::ParserBuilder, hir: hir::translate::TranslatorBuilder, } impl Builder { /// Creates a new builder for configuring and constructing a [`Regex`]. pub fn new() -> Builder { Builder { config: Config::default(), ast: ast::parse::ParserBuilder::new(), hir: hir::translate::TranslatorBuilder::new(), } } /// Builds a `Regex` from a single pattern string. /// /// If there was a problem parsing the pattern or a problem turning it into /// a regex matcher, then an error is returned. /// /// # Example /// /// This example shows how to configure syntax options. 
/// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().crlf(true).multi_line(true)) /// .build(r"^foo$")?; /// let hay = "\r\nfoo\r\n"; /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Builds a `Regex` from many pattern strings. /// /// If there was a problem parsing any of the patterns or a problem turning /// them into a regex matcher, then an error is returned. /// /// # Example: finding the pattern that caused an error /// /// When a syntax error occurs, it is possible to ask which pattern /// caused the syntax error. /// /// ``` /// use regex_automata::{meta::Regex, PatternID}; /// /// let err = Regex::builder() /// .build_many(&["a", "b", r"\p{Foo}", "c"]) /// .unwrap_err(); /// assert_eq!(Some(PatternID::must(2)), err.pattern()); /// ``` /// /// # Example: zero patterns is valid /// /// Building a regex with zero patterns results in a regex that never /// matches anything. Because this routine is generic, passing an empty /// slice usually requires a turbo-fish (or something else to help type /// inference). /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::builder() /// .build_many::<&str>(&[])?; /// assert_eq!(None, re.find("")); /// /// # Ok::<(), Box>(()) /// ``` pub fn build_many>( &self, patterns: &[P], ) -> Result { use crate::util::primitives::IteratorIndexExt; log! { debug!("building meta regex with {} patterns:", patterns.len()); for (pid, p) in patterns.iter().with_pattern_ids() { let p = p.as_ref(); // We might split a grapheme with this truncation logic, but // that's fine. We at least avoid splitting a codepoint. let maxoff = p .char_indices() .map(|(i, ch)| i + ch.len_utf8()) .take(1000) .last() .unwrap_or(0); if maxoff < p.len() { debug!("{:?}: {}[... 
snip ...]", pid, &p[..maxoff]); } else { debug!("{:?}: {}", pid, p); } } } let (mut asts, mut hirs) = (vec![], vec![]); for (pid, p) in patterns.iter().with_pattern_ids() { let ast = self .ast .build() .parse(p.as_ref()) .map_err(|err| BuildError::ast(pid, err))?; asts.push(ast); } for ((pid, p), ast) in patterns.iter().with_pattern_ids().zip(asts.iter()) { let hir = self .hir .build() .translate(p.as_ref(), ast) .map_err(|err| BuildError::hir(pid, err))?; hirs.push(hir); } self.build_many_from_hir(&hirs) } /// Builds a `Regex` directly from an `Hir` expression. /// /// This is useful if you needed to parse a pattern string into an `Hir` /// for other reasons (such as analysis or transformations). This routine /// permits building a `Regex` directly from the `Hir` expression instead /// of first converting the `Hir` back to a pattern string. /// /// When using this method, any options set via [`Builder::syntax`] are /// ignored. Namely, the syntax options only apply when parsing a pattern /// string, which isn't relevant here. /// /// If there was a problem building the underlying regex matcher for the /// given `Hir`, then an error is returned. /// /// # Example /// /// This example shows how one can hand-construct an `Hir` expression and /// build a regex from it without doing any parsing at all. /// /// ``` /// use { /// regex_automata::{meta::Regex, Match}, /// regex_syntax::hir::{Hir, Look}, /// }; /// /// // (?Rm)^foo$ /// let hir = Hir::concat(vec![ /// Hir::look(Look::StartCRLF), /// Hir::literal("foo".as_bytes()), /// Hir::look(Look::EndCRLF), /// ]); /// let re = Regex::builder() /// .build_from_hir(&hir)?; /// let hay = "\r\nfoo\r\n"; /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); /// /// Ok::<(), Box>(()) /// ``` pub fn build_from_hir(&self, hir: &Hir) -> Result { self.build_many_from_hir(&[hir]) } /// Builds a `Regex` directly from many `Hir` expressions. 
/// /// This is useful if you needed to parse pattern strings into `Hir` /// expressions for other reasons (such as analysis or transformations). /// This routine permits building a `Regex` directly from the `Hir` /// expressions instead of first converting the `Hir` expressions back to /// pattern strings. /// /// When using this method, any options set via [`Builder::syntax`] are /// ignored. Namely, the syntax options only apply when parsing a pattern /// string, which isn't relevant here. /// /// If there was a problem building the underlying regex matcher for the /// given `Hir` expressions, then an error is returned. /// /// Note that unlike [`Builder::build_many`], this can only fail as a /// result of building the underlying matcher. In that case, there is /// no single `Hir` expression that can be isolated as a reason for the /// failure. So if this routine fails, it's not possible to determine which /// `Hir` expression caused the failure. /// /// # Example /// /// This example shows how one can hand-construct multiple `Hir` /// expressions and build a single regex from them without doing any /// parsing at all. 
/// /// ``` /// use { /// regex_automata::{meta::Regex, Match}, /// regex_syntax::hir::{Hir, Look}, /// }; /// /// // (?Rm)^foo$ /// let hir1 = Hir::concat(vec![ /// Hir::look(Look::StartCRLF), /// Hir::literal("foo".as_bytes()), /// Hir::look(Look::EndCRLF), /// ]); /// // (?Rm)^bar$ /// let hir2 = Hir::concat(vec![ /// Hir::look(Look::StartCRLF), /// Hir::literal("bar".as_bytes()), /// Hir::look(Look::EndCRLF), /// ]); /// let re = Regex::builder() /// .build_many_from_hir(&[&hir1, &hir2])?; /// let hay = "\r\nfoo\r\nbar"; /// let got: Vec = re.find_iter(hay).collect(); /// let expected = vec![ /// Match::must(0, 2..5), /// Match::must(1, 7..10), /// ]; /// assert_eq!(expected, got); /// /// Ok::<(), Box>(()) /// ``` pub fn build_many_from_hir>( &self, hirs: &[H], ) -> Result { let config = self.config.clone(); // We collect the HIRs into a vec so we can write internal routines // with '&[&Hir]'. i.e., Don't use generics everywhere to keep code // bloat down.. let hirs: Vec<&Hir> = hirs.iter().map(|hir| hir.borrow()).collect(); let info = RegexInfo::new(config, &hirs); let strat = strategy::new(&info, &hirs)?; let pool = { let strat = Arc::clone(&strat); let create: CachePoolFn = Box::new(move || strat.create_cache()); Pool::new(create) }; Ok(Regex { imp: Arc::new(RegexI { strat, info }), pool }) } /// Configure the behavior of a `Regex`. /// /// This configuration controls non-syntax options related to the behavior /// of a `Regex`. This includes things like whether empty matches can split /// a codepoint, prefilters, line terminators and a long list of options /// for configuring which regex engines the meta regex engine will be able /// to use internally. /// /// # Example /// /// This example shows how to disable UTF-8 empty mode. This will permit /// empty matches to occur between the UTF-8 encoding of a codepoint. 
/// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::new("")?; /// let got: Vec = re.find_iter("☃").collect(); /// // Matches only occur at the beginning and end of the snowman. /// assert_eq!(got, vec![ /// Match::must(0, 0..0), /// Match::must(0, 3..3), /// ]); /// /// let re = Regex::builder() /// .configure(Regex::config().utf8_empty(false)) /// .build("")?; /// let got: Vec = re.find_iter("☃").collect(); /// // Matches now occur at every position! /// assert_eq!(got, vec![ /// Match::must(0, 0..0), /// Match::must(0, 1..1), /// Match::must(0, 2..2), /// Match::must(0, 3..3), /// ]); /// /// Ok::<(), Box>(()) /// ``` pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Configure the syntax options when parsing a pattern string while /// building a `Regex`. /// /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`] /// are used. The other build methods accept `Hir` values, which have /// already been parsed. /// /// # Example /// /// This example shows how to enable case insensitive mode. /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().case_insensitive(true)) /// .build(r"δ")?; /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ")); /// /// Ok::<(), Box>(()) /// ``` pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { config.apply_ast(&mut self.ast); config.apply_hir(&mut self.hir); self } } #[cfg(test)] mod tests { use super::*; // I found this in the course of building out the benchmark suite for // rebar. #[test] fn regression_suffix_literal_count() { let _ = env_logger::try_init(); let re = Regex::new(r"[a-zA-Z]+ing").unwrap(); assert_eq!(1, re.find_iter("tingling").count()); } } regex-automata-0.4.9/src/meta/reverse_inner.rs000064400000000000000000000232761046102023000175240ustar 00000000000000/*! 
A module dedicated to plucking inner literals out of a regex pattern, and then constructing a prefilter for them. We also include a regex pattern "prefix" that corresponds to the bits of the regex that need to match before the literals do. The reverse inner optimization then proceeds by looking for matches of the inner literal(s), and then doing a reverse search of the prefix from the start of the literal match to find the overall start position of the match. The essential invariant we want to uphold here is that the literals we return reflect a set where *at least* one of them must match in order for the overall regex to match. We also need to maintain the invariant that the regex prefix returned corresponds to the entirety of the regex up until the literals we return. This somewhat limits what we can do. That is, if we have a regex like `\w+(@!|%%)\w+`, then we can pluck the `{@!, %%}` out and build a prefilter from it. Then we just need to compile `\w+` in reverse. No fuss no muss. But if we have a regex like `\d+@!|\w+%%`, then we get kind of stymied. Technically, we could still extract `{@!, %%}`, and it is true that at least one of them must match. But then, what is our regex prefix? Again, in theory, that could be `\d+|\w+`, but that's not quite right, because the `\d+` only matches when `@!` matches, and `\w+` only matches when `%%` matches. All of that is technically possible to do, but it seemingly requires a lot of sophistication and machinery. Probably the way to tackle that is with some kind of formalism and approach this problem more generally. For now, the code below basically just looks for a top-level concatenation. And if it can find one, it looks for literals in each of the direct child sub-expressions of that concatenation. If some good ones are found, we return those and a concatenation of the Hir expressions seen up to that point. 
*/ use alloc::vec::Vec; use regex_syntax::hir::{self, literal, Hir, HirKind}; use crate::{util::prefilter::Prefilter, MatchKind}; /// Attempts to extract an "inner" prefilter from the given HIR expressions. If /// one was found, then a concatenation of the HIR expressions that precede it /// is returned. /// /// The idea here is that the prefilter returned can be used to find candidate /// matches. And then the HIR returned can be used to build a reverse regex /// matcher, which will find the start of the candidate match. Finally, the /// match still has to be confirmed with a normal anchored forward scan to find /// the end position of the match. /// /// Note that this assumes leftmost-first match semantics, so callers must /// not call this otherwise. pub(crate) fn extract(hirs: &[&Hir]) -> Option<(Hir, Prefilter)> { if hirs.len() != 1 { debug!( "skipping reverse inner optimization since it only \ supports 1 pattern, {} were given", hirs.len(), ); return None; } let mut concat = match top_concat(hirs[0]) { Some(concat) => concat, None => { debug!( "skipping reverse inner optimization because a top-level \ concatenation could not found", ); return None; } }; // We skip the first HIR because if it did have a prefix prefilter in it, // we probably wouldn't be here looking for an inner prefilter. for i in 1..concat.len() { let hir = &concat[i]; let pre = match prefilter(hir) { None => continue, Some(pre) => pre, }; // Even if we got a prefilter, if it isn't consider "fast," then we // probably don't want to bother with it. Namely, since the reverse // inner optimization requires some overhead, it likely only makes // sense if the prefilter scan itself is (believed) to be much faster // than the regex engine. if !pre.is_fast() { debug!( "skipping extracted inner prefilter because \ it probably isn't fast" ); continue; } let concat_suffix = Hir::concat(concat.split_off(i)); let concat_prefix = Hir::concat(concat); // Look for a prefilter again. Why? 
Because above we only looked for // a prefilter on the individual 'hir', but we might be able to find // something better and more discriminatory by looking at the entire // suffix. We don't do this above to avoid making this loop worst case // quadratic in the length of 'concat'. let pre2 = match prefilter(&concat_suffix) { None => pre, Some(pre2) => { if pre2.is_fast() { pre2 } else { pre } } }; return Some((concat_prefix, pre2)); } debug!( "skipping reverse inner optimization because a top-level \ sub-expression with a fast prefilter could not be found" ); None } /// Attempt to extract a prefilter from an HIR expression. /// /// We do a little massaging here to do our best that the prefilter we get out /// of this is *probably* fast. Basically, the false positive rate has a much /// higher impact for things like the reverse inner optimization because more /// work needs to potentially be done for each candidate match. /// /// Note that this assumes leftmost-first match semantics, so callers must /// not call this otherwise. fn prefilter(hir: &Hir) -> Option { let mut extractor = literal::Extractor::new(); extractor.kind(literal::ExtractKind::Prefix); let mut prefixes = extractor.extract(hir); debug!( "inner prefixes (len={:?}) extracted before optimization: {:?}", prefixes.len(), prefixes ); // Since these are inner literals, we know they cannot be exact. But the // extractor doesn't know this. We mark them as inexact because this might // impact literal optimization. Namely, optimization weights "all literals // are exact" as very high, because it presumes that any match results in // an overall match. But of course, that is not the case here. // // In practice, this avoids plucking out a ASCII-only \s as an alternation // of single-byte whitespace characters. 
prefixes.make_inexact(); prefixes.optimize_for_prefix_by_preference(); debug!( "inner prefixes (len={:?}) extracted after optimization: {:?}", prefixes.len(), prefixes ); prefixes .literals() .and_then(|lits| Prefilter::new(MatchKind::LeftmostFirst, lits)) } /// Looks for a "top level" HirKind::Concat item in the given HIR. This will /// try to return one even if it's embedded in a capturing group, but is /// otherwise pretty conservative in what is returned. /// /// The HIR returned is a complete copy of the concat with all capturing /// groups removed. In effect, the concat returned is "flattened" with respect /// to capturing groups. This makes the detection logic above for prefixes /// a bit simpler, and it works because 1) capturing groups never influence /// whether a match occurs or not and 2) capturing groups are not used when /// doing the reverse inner search to find the start of the match. fn top_concat(mut hir: &Hir) -> Option> { loop { hir = match hir.kind() { HirKind::Empty | HirKind::Literal(_) | HirKind::Class(_) | HirKind::Look(_) | HirKind::Repetition(_) | HirKind::Alternation(_) => return None, HirKind::Capture(hir::Capture { ref sub, .. }) => sub, HirKind::Concat(ref subs) => { // We are careful to only do the flattening/copy when we know // we have a "top level" concat we can inspect. This avoids // doing extra work in cases where we definitely won't use it. // (This might still be wasted work if we can't go on to find // some literals to extract.) let concat = Hir::concat(subs.iter().map(|h| flatten(h)).collect()); return match concat.into_kind() { HirKind::Concat(xs) => Some(xs), // It is actually possible for this case to occur, because // 'Hir::concat' might simplify the expression to the point // that concatenations are actually removed. 
One wonders // whether this leads to other cases where we should be // extracting literals, but in theory, I believe if we do // get here, then it means that a "real" prefilter failed // to be extracted and we should probably leave well enough // alone. (A "real" prefilter is unbothered by "top-level // concats" and "capturing groups.") _ => return None, }; } }; } } /// Returns a copy of the given HIR but with all capturing groups removed. fn flatten(hir: &Hir) -> Hir { match hir.kind() { HirKind::Empty => Hir::empty(), HirKind::Literal(hir::Literal(ref x)) => Hir::literal(x.clone()), HirKind::Class(ref x) => Hir::class(x.clone()), HirKind::Look(ref x) => Hir::look(x.clone()), HirKind::Repetition(ref x) => Hir::repetition(x.with(flatten(&x.sub))), // This is the interesting case. We just drop the group information // entirely and use the child HIR itself. HirKind::Capture(hir::Capture { ref sub, .. }) => flatten(sub), HirKind::Alternation(ref xs) => { Hir::alternation(xs.iter().map(|x| flatten(x)).collect()) } HirKind::Concat(ref xs) => { Hir::concat(xs.iter().map(|x| flatten(x)).collect()) } } } regex-automata-0.4.9/src/meta/stopat.rs000064400000000000000000000212711046102023000161610ustar 00000000000000/*! This module defines two bespoke forward DFA search routines. One for the lazy DFA and one for the fully compiled DFA. These routines differ from the normal ones by reporting the position at which the search terminates when a match *isn't* found. This position at which a search terminates is useful in contexts where the meta regex engine runs optimizations that could go quadratic if we aren't careful. Namely, a regex search *could* scan to the end of the haystack only to report a non-match. If the caller doesn't know that the search scanned to the end of the haystack, it might restart the search at the next literal candidate it finds and repeat the process. 
Providing the caller with the position at which the search stopped provides a way for the caller to determine the point at which subsequent scans should not pass. This is principally used in the "reverse inner" optimization, which works like this: 1. Look for a match of an inner literal. Say, 'Z' in '\w+Z\d+'. 2. At the spot where 'Z' matches, do a reverse anchored search from there for '\w+'. 3. If the reverse search matches, it corresponds to the start position of a (possible) match. At this point, do a forward anchored search to find the end position. If an end position is found, then we have a match and we know its bounds. If the forward anchored search in (3) searches the entire rest of the haystack but reports a non-match, then a naive implementation of the above will continue back at step 1 looking for more candidates. There might still be a match to be found! It's possible. But we already scanned the whole haystack. So if we keep repeating the process, then we might wind up taking quadratic time in the size of the haystack, which is not great. So if the forward anchored search in (3) reports the position at which it stops, then we can detect whether quadratic behavior might be occurring in steps (1) and (2). For (1), it occurs if the literal candidate found occurs *before* the end of the previous search in (3), since that means we're now going to look for another match in a place where the forward search has already scanned. It is *correct* to do so, but our technique has become inefficient. For (2), quadratic behavior occurs similarly when its reverse search extends past the point where the previous forward search in (3) terminated. Indeed, to implement (2), we use the sibling 'limited' module for ensuring our reverse scan doesn't go further than we want. See the 'opt/reverse-inner' benchmarks in rebar for a real demonstration of how quadratic behavior is mitigated. 
*/ use crate::{meta::error::RetryFailError, HalfMatch, Input, MatchError}; #[cfg(feature = "dfa-build")] pub(crate) fn dfa_try_search_half_fwd( dfa: &crate::dfa::dense::DFA>, input: &Input<'_>, ) -> Result, RetryFailError> { use crate::dfa::{accel, Automaton}; let mut mat = None; let mut sid = dfa.start_state_forward(input)?; let mut at = input.start(); while at < input.end() { sid = dfa.next_state(sid, input.haystack()[at]); if dfa.is_special_state(sid) { if dfa.is_match_state(sid) { let pattern = dfa.match_pattern(sid, 0); mat = Some(HalfMatch::new(pattern, at)); if input.get_earliest() { return Ok(mat.ok_or(at)); } if dfa.is_accel_state(sid) { let needs = dfa.accelerator(sid); at = accel::find_fwd(needs, input.haystack(), at) .unwrap_or(input.end()); continue; } } else if dfa.is_accel_state(sid) { let needs = dfa.accelerator(sid); at = accel::find_fwd(needs, input.haystack(), at) .unwrap_or(input.end()); continue; } else if dfa.is_dead_state(sid) { return Ok(mat.ok_or(at)); } else if dfa.is_quit_state(sid) { return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // Ideally we wouldn't use a DFA that specialized start states // and thus 'is_start_state()' could never be true here, but in // practice we reuse the DFA created for the full regex which // will specialize start states whenever there is a prefilter. 
debug_assert!(dfa.is_start_state(sid)); } } at += 1; } dfa_eoi_fwd(dfa, input, &mut sid, &mut mat)?; Ok(mat.ok_or(at)) } #[cfg(feature = "hybrid")] pub(crate) fn hybrid_try_search_half_fwd( dfa: &crate::hybrid::dfa::DFA, cache: &mut crate::hybrid::dfa::Cache, input: &Input<'_>, ) -> Result, RetryFailError> { let mut mat = None; let mut sid = dfa.start_state_forward(cache, input)?; let mut at = input.start(); while at < input.end() { sid = dfa .next_state(cache, sid, input.haystack()[at]) .map_err(|_| MatchError::gave_up(at))?; if sid.is_tagged() { if sid.is_match() { let pattern = dfa.match_pattern(cache, sid, 0); mat = Some(HalfMatch::new(pattern, at)); if input.get_earliest() { return Ok(mat.ok_or(at)); } } else if sid.is_dead() { return Ok(mat.ok_or(at)); } else if sid.is_quit() { return Err(MatchError::quit(input.haystack()[at], at).into()); } else { // We should NEVER get an unknown state ID back from // dfa.next_state(). debug_assert!(!sid.is_unknown()); // Ideally we wouldn't use a lazy DFA that specialized start // states and thus 'sid.is_start()' could never be true here, // but in practice we reuse the lazy DFA created for the full // regex which will specialize start states whenever there is // a prefilter. 
debug_assert!(sid.is_start()); } } at += 1; } hybrid_eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?; Ok(mat.ok_or(at)) } #[cfg(feature = "dfa-build")] #[cfg_attr(feature = "perf-inline", inline(always))] fn dfa_eoi_fwd( dfa: &crate::dfa::dense::DFA>, input: &Input<'_>, sid: &mut crate::util::primitives::StateID, mat: &mut Option, ) -> Result<(), MatchError> { use crate::dfa::Automaton; let sp = input.get_span(); match input.haystack().get(sp.end) { Some(&b) => { *sid = dfa.next_state(*sid, b); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { return Err(MatchError::quit(b, sp.end)); } } None => { *sid = dfa.next_eoi_state(*sid); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } // N.B. We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state. debug_assert!(!dfa.is_quit_state(*sid)); } } Ok(()) } #[cfg(feature = "hybrid")] #[cfg_attr(feature = "perf-inline", inline(always))] fn hybrid_eoi_fwd( dfa: &crate::hybrid::dfa::DFA, cache: &mut crate::hybrid::dfa::Cache, input: &Input<'_>, sid: &mut crate::hybrid::LazyStateID, mat: &mut Option, ) -> Result<(), MatchError> { let sp = input.get_span(); match input.haystack().get(sp.end) { Some(&b) => { *sid = dfa .next_state(cache, *sid, b) .map_err(|_| MatchError::gave_up(sp.end))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if sid.is_quit() { return Err(MatchError::quit(b, sp.end)); } } None => { *sid = dfa .next_eoi_state(cache, *sid) .map_err(|_| MatchError::gave_up(input.haystack().len()))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, input.haystack().len())); } // N.B. 
We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state. debug_assert!(!sid.is_quit()); } } Ok(()) } regex-automata-0.4.9/src/meta/strategy.rs000064400000000000000000002237041046102023000165160ustar 00000000000000use core::{ fmt::Debug, panic::{RefUnwindSafe, UnwindSafe}, }; use alloc::sync::Arc; use regex_syntax::hir::{literal, Hir}; use crate::{ meta::{ error::{BuildError, RetryError, RetryFailError, RetryQuadraticError}, regex::{Cache, RegexInfo}, reverse_inner, wrappers, }, nfa::thompson::{self, WhichCaptures, NFA}, util::{ captures::{Captures, GroupInfo}, look::LookMatcher, prefilter::{self, Prefilter, PrefilterI}, primitives::{NonMaxUsize, PatternID}, search::{Anchored, HalfMatch, Input, Match, MatchKind, PatternSet}, }, }; /// A trait that represents a single meta strategy. Its main utility is in /// providing a way to do dynamic dispatch over a few choices. /// /// Why dynamic dispatch? I actually don't have a super compelling reason, and /// importantly, I have not benchmarked it with the main alternative: an enum. /// I went with dynamic dispatch initially because the regex engine search code /// really can't be inlined into caller code in most cases because it's just /// too big. In other words, it is already expected that every regex search /// will entail at least the cost of a function call. /// /// I do wonder whether using enums would result in better codegen overall /// though. It's a worthwhile experiment to try. Probably the most interesting /// benchmark to run in such a case would be one with a high match count. That /// is, a benchmark to test the overall latency of a search call. 
pub(super) trait Strategy: Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static { fn group_info(&self) -> &GroupInfo; fn create_cache(&self) -> Cache; fn reset_cache(&self, cache: &mut Cache); fn is_accelerated(&self) -> bool; fn memory_usage(&self) -> usize; fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option; fn search_half( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option; fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool; fn search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option; fn which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ); } pub(super) fn new( info: &RegexInfo, hirs: &[&Hir], ) -> Result, BuildError> { // At this point, we're committed to a regex engine of some kind. So pull // out a prefilter if we can, which will feed to each of the constituent // regex engines. let pre = if info.is_always_anchored_start() { // PERF: I'm not sure we necessarily want to do this... We may want to // run a prefilter for quickly rejecting in some cases. The problem // is that anchored searches overlap quite a bit with the use case // of "run a regex on every line to extract data." In that case, the // regex always matches, so running a prefilter doesn't really help us // there. The main place where a prefilter helps in an anchored search // is if the anchored search is not expected to match frequently. That // is, the prefilter gives us a way to possibly reject a haystack very // quickly. // // Maybe we should do use a prefilter, but only for longer haystacks? // Or maybe we should only use a prefilter when we think it's "fast"? // // Interestingly, I think we currently lack the infrastructure for // disabling a prefilter based on haystack length. That would probably // need to be a new 'Input' option. (Interestingly, an 'Input' used to // carry a 'Prefilter' with it, but I moved away from that.) 
debug!("skipping literal extraction since regex is anchored"); None } else if let Some(pre) = info.config().get_prefilter() { debug!( "skipping literal extraction since the caller provided a prefilter" ); Some(pre.clone()) } else if info.config().get_auto_prefilter() { let kind = info.config().get_match_kind(); let prefixes = crate::util::prefilter::prefixes(kind, hirs); // If we can build a full `Strategy` from just the extracted prefixes, // then we can short-circuit and avoid building a regex engine at all. if let Some(pre) = Pre::from_prefixes(info, &prefixes) { debug!( "found that the regex can be broken down to a literal \ search, avoiding the regex engine entirely", ); return Ok(pre); } // This now attempts another short-circuit of the regex engine: if we // have a huge alternation of just plain literals, then we can just use // Aho-Corasick for that and avoid the regex engine entirely. // // You might think this case would just be handled by // `Pre::from_prefixes`, but that technique relies on heuristic literal // extraction from the corresponding `Hir`. That works, but part of // heuristics limit the size and number of literals returned. This case // will specifically handle patterns with very large alternations. // // One wonders if we should just roll this our heuristic literal // extraction, and then I think this case could disappear entirely. 
if let Some(pre) = Pre::from_alternation_literals(info, hirs) { debug!( "found plain alternation of literals, \ avoiding regex engine entirely and using Aho-Corasick" ); return Ok(pre); } prefixes.literals().and_then(|strings| { debug!( "creating prefilter from {} literals: {:?}", strings.len(), strings, ); Prefilter::new(kind, strings) }) } else { debug!("skipping literal extraction since prefilters were disabled"); None }; let mut core = Core::new(info.clone(), pre.clone(), hirs)?; // Now that we have our core regex engines built, there are a few cases // where we can do a little bit better than just a normal "search forward // and maybe use a prefilter when in a start state." However, these cases // may not always work or otherwise build on top of the Core searcher. // For example, the reverse anchored optimization seems like it might // always work, but only the DFAs support reverse searching and the DFAs // might give up or quit for reasons. If we had, e.g., a PikeVM that // supported reverse searching, then we could avoid building a full Core // engine for this case. core = match ReverseAnchored::new(core) { Err(core) => core, Ok(ra) => { debug!("using reverse anchored strategy"); return Ok(Arc::new(ra)); } }; core = match ReverseSuffix::new(core, hirs) { Err(core) => core, Ok(rs) => { debug!("using reverse suffix strategy"); return Ok(Arc::new(rs)); } }; core = match ReverseInner::new(core, hirs) { Err(core) => core, Ok(ri) => { debug!("using reverse inner strategy"); return Ok(Arc::new(ri)); } }; debug!("using core strategy"); Ok(Arc::new(core)) } #[derive(Clone, Debug)] struct Pre

{ pre: P, group_info: GroupInfo, } impl Pre

{ fn new(pre: P) -> Arc { // The only thing we support when we use prefilters directly as a // strategy is the start and end of the overall match for a single // pattern. In other words, exactly one implicit capturing group. Which // is exactly what we use here for a GroupInfo. let group_info = GroupInfo::new([[None::<&str>]]).unwrap(); Arc::new(Pre { pre, group_info }) } } // This is a little weird, but we don't actually care about the type parameter // here because we're selecting which underlying prefilter to use. So we just // define it on an arbitrary type. impl Pre<()> { /// Given a sequence of prefixes, attempt to return a full `Strategy` using /// just the prefixes. /// /// Basically, this occurs when the prefixes given not just prefixes, /// but an enumeration of the entire language matched by the regular /// expression. /// /// A number of other conditions need to be true too. For example, there /// can be only one pattern, the number of explicit capture groups is 0, no /// look-around assertions and so on. /// /// Note that this ignores `Config::get_auto_prefilter` because if this /// returns something, then it isn't a prefilter but a matcher itself. /// Therefore, it shouldn't suffer from the problems typical to prefilters /// (such as a high false positive rate). fn from_prefixes( info: &RegexInfo, prefixes: &literal::Seq, ) -> Option> { let kind = info.config().get_match_kind(); // Check to see if our prefixes are exact, which means we might be // able to bypass the regex engine entirely and just rely on literal // searches. if !prefixes.is_exact() { return None; } // We also require that we have a single regex pattern. Namely, // we reuse the prefilter infrastructure to implement search and // prefilters only report spans. Prefilters don't know about pattern // IDs. The multi-regex case isn't a lost cause, we might still use // Aho-Corasick and we might still just use a regular prefilter, but // that's done below. 
if info.pattern_len() != 1 { return None; } // We can't have any capture groups either. The literal engines don't // know how to deal with things like '(foo)(bar)'. In that case, a // prefilter will just be used and then the regex engine will resolve // the capture groups. if info.props()[0].explicit_captures_len() != 0 { return None; } // We also require that it has zero look-around assertions. Namely, // literal extraction treats look-around assertions as if they match // *every* empty string. But of course, that isn't true. So for // example, 'foo\bquux' never matches anything, but 'fooquux' is // extracted from that as an exact literal. Such cases should just run // the regex engine. 'fooquux' will be used as a normal prefilter, and // then the regex engine will try to look for an actual match. if !info.props()[0].look_set().is_empty() { return None; } // Finally, currently, our prefilters are all oriented around // leftmost-first match semantics, so don't try to use them if the // caller asked for anything else. if kind != MatchKind::LeftmostFirst { return None; } // The above seems like a lot of requirements to meet, but it applies // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet // the above criteria, for example. // // Note that this is effectively a latency optimization. If we didn't // do this, then the extracted literals would still get bundled into // a prefilter, and every regex engine capable of running unanchored // searches supports prefilters. So this optimization merely sidesteps // having to run the regex engine at all to confirm the match. Thus, it // decreases the latency of a match. // OK because we know the set is exact and thus finite. 
let prefixes = prefixes.literals().unwrap(); debug!( "trying to bypass regex engine by creating \ prefilter from {} literals: {:?}", prefixes.len(), prefixes, ); let choice = match prefilter::Choice::new(kind, prefixes) { Some(choice) => choice, None => { debug!( "regex bypass failed because no prefilter could be built" ); return None; } }; let strat: Arc = match choice { prefilter::Choice::Memchr(pre) => Pre::new(pre), prefilter::Choice::Memchr2(pre) => Pre::new(pre), prefilter::Choice::Memchr3(pre) => Pre::new(pre), prefilter::Choice::Memmem(pre) => Pre::new(pre), prefilter::Choice::Teddy(pre) => Pre::new(pre), prefilter::Choice::ByteSet(pre) => Pre::new(pre), prefilter::Choice::AhoCorasick(pre) => Pre::new(pre), }; Some(strat) } /// Attempts to extract an alternation of literals, and if it's deemed /// worth doing, returns an Aho-Corasick prefilter as a strategy. /// /// And currently, this only returns something when 'hirs.len() == 1'. This /// could in theory do something if there are multiple HIRs where all of /// them are alternation of literals, but I haven't had the time to go down /// that path yet. fn from_alternation_literals( info: &RegexInfo, hirs: &[&Hir], ) -> Option> { use crate::util::prefilter::AhoCorasick; let lits = crate::meta::literal::alternation_literals(info, hirs)?; let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?; Some(Pre::new(ac)) } } // This implements Strategy for anything that implements PrefilterI. // // Note that this must only be used for regexes of length 1. Multi-regexes // don't work here. The prefilter interface only provides the span of a match // and not the pattern ID. (I did consider making it more expressive, but I // couldn't figure out how to tie everything together elegantly.) Thus, so long // as the regex only contains one pattern, we can simply assume that a match // corresponds to PatternID::ZERO. And indeed, that's what we do here. 
// // In practice, since this impl is used to report matches directly and thus // completely bypasses the regex engine, we only wind up using this under the // following restrictions: // // * There must be only one pattern. As explained above. // * The literal sequence must be finite and only contain exact literals. // * There must not be any look-around assertions. If there are, the literals // extracted might be exact, but a match doesn't necessarily imply an overall // match. As a trivial example, 'foo\bbar' does not match 'foobar'. // * The pattern must not have any explicit capturing groups. If it does, the // caller might expect them to be resolved. e.g., 'foo(bar)'. // // So when all of those things are true, we use a prefilter directly as a // strategy. // // In the case where the number of patterns is more than 1, we don't use this // but do use a special Aho-Corasick strategy if all of the regexes are just // simple literals or alternations of literals. (We also use the Aho-Corasick // strategy when len(patterns)==1 if the number of literals is large. In that // case, literal extraction gives up and will return an infinite set.) impl Strategy for Pre

{ #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { &self.group_info } fn create_cache(&self) -> Cache { Cache { capmatches: Captures::all(self.group_info().clone()), pikevm: wrappers::PikeVMCache::none(), backtrack: wrappers::BoundedBacktrackerCache::none(), onepass: wrappers::OnePassCache::none(), hybrid: wrappers::HybridCache::none(), revhybrid: wrappers::ReverseHybridCache::none(), } } fn reset_cache(&self, _cache: &mut Cache) {} fn is_accelerated(&self) -> bool { self.pre.is_fast() } fn memory_usage(&self) -> usize { self.pre.memory_usage() } #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, _cache: &mut Cache, input: &Input<'_>) -> Option { if input.is_done() { return None; } if input.get_anchored().is_anchored() { return self .pre .prefix(input.haystack(), input.get_span()) .map(|sp| Match::new(PatternID::ZERO, sp)); } self.pre .find(input.haystack(), input.get_span()) .map(|sp| Match::new(PatternID::ZERO, sp)) } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option { self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end())) } #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { self.search(cache, input).is_some() } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { let m = self.search(cache, input)?; if let Some(slot) = slots.get_mut(0) { *slot = NonMaxUsize::new(m.start()); } if let Some(slot) = slots.get_mut(1) { *slot = NonMaxUsize::new(m.end()); } Some(m.pattern()) } #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { if self.search(cache, input).is_some() { patset.insert(PatternID::ZERO); } } } #[derive(Debug)] struct Core { info: RegexInfo, 
pre: Option, nfa: NFA, nfarev: Option, pikevm: wrappers::PikeVM, backtrack: wrappers::BoundedBacktracker, onepass: wrappers::OnePass, hybrid: wrappers::Hybrid, dfa: wrappers::DFA, } impl Core { fn new( info: RegexInfo, pre: Option, hirs: &[&Hir], ) -> Result { let mut lookm = LookMatcher::new(); lookm.set_line_terminator(info.config().get_line_terminator()); let thompson_config = thompson::Config::new() .utf8(info.config().get_utf8_empty()) .nfa_size_limit(info.config().get_nfa_size_limit()) .shrink(false) .which_captures(info.config().get_which_captures()) .look_matcher(lookm); let nfa = thompson::Compiler::new() .configure(thompson_config.clone()) .build_many_from_hir(hirs) .map_err(BuildError::nfa)?; // It's possible for the PikeVM or the BB to fail to build, even though // at this point, we already have a full NFA in hand. They can fail // when a Unicode word boundary is used but where Unicode word boundary // support is disabled at compile time, thus making it impossible to // match. (Construction can also fail if the NFA was compiled without // captures, but we always enable that above.) let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?; let backtrack = wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?; // The onepass engine can of course fail to build, but we expect it to // fail in many cases because it is an optimization that doesn't apply // to all regexes. The 'OnePass' wrapper encapsulates this failure (and // logs a message if it occurs). let onepass = wrappers::OnePass::new(&info, &nfa); // We try to encapsulate whether a particular regex engine should be // used within each respective wrapper, but the DFAs need a reverse NFA // to build itself, and we really do not want to build a reverse NFA if // we know we aren't going to use the lazy DFA. So we do a config check // up front, which is in practice the only way we won't try to use the // DFA. 
let (nfarev, hybrid, dfa) = if !info.config().get_hybrid() && !info.config().get_dfa() { (None, wrappers::Hybrid::none(), wrappers::DFA::none()) } else { // FIXME: Technically, we don't quite yet KNOW that we need // a reverse NFA. It's possible for the DFAs below to both // fail to build just based on the forward NFA. In which case, // building the reverse NFA was totally wasted work. But... // fixing this requires breaking DFA construction apart into // two pieces: one for the forward part and another for the // reverse part. Quite annoying. Making it worse, when building // both DFAs fails, it's quite likely that the NFA is large and // that it will take quite some time to build the reverse NFA // too. So... it's really probably worth it to do this! let nfarev = thompson::Compiler::new() // Currently, reverse NFAs don't support capturing groups, // so we MUST disable them. But even if we didn't have to, // we would, because nothing in this crate does anything // useful with capturing groups in reverse. And of course, // the lazy DFA ignores capturing groups in all cases. 
.configure( thompson_config .clone() .which_captures(WhichCaptures::None) .reverse(true), ) .build_many_from_hir(hirs) .map_err(BuildError::nfa)?; let dfa = if !info.config().get_dfa() { wrappers::DFA::none() } else { wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev) }; let hybrid = if !info.config().get_hybrid() { wrappers::Hybrid::none() } else if dfa.is_some() { debug!("skipping lazy DFA because we have a full DFA"); wrappers::Hybrid::none() } else { wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev) }; (Some(nfarev), hybrid, dfa) }; Ok(Core { info, pre, nfa, nfarev, pikevm, backtrack, onepass, hybrid, dfa, }) } #[cfg_attr(feature = "perf-inline", inline(always))] fn try_search_mayfail( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option, RetryFailError>> { if let Some(e) = self.dfa.get(input) { trace!("using full DFA for search at {:?}", input.get_span()); Some(e.try_search(input)) } else if let Some(e) = self.hybrid.get(input) { trace!("using lazy DFA for search at {:?}", input.get_span()); Some(e.try_search(&mut cache.hybrid, input)) } else { None } } fn search_nofail( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option { let caps = &mut cache.capmatches; caps.set_pattern(None); // We manually inline 'try_search_slots_nofail' here because we need to // borrow from 'cache.capmatches' in this method, but if we do, then // we can't pass 'cache' wholesale to to 'try_slots_no_hybrid'. It's a // classic example of how the borrow checker inhibits decomposition. // There are of course work-arounds (more types and/or interior // mutability), but that's more annoying than this IMO. 
let pid = if let Some(ref e) = self.onepass.get(input) { trace!("using OnePass for search at {:?}", input.get_span()); e.search_slots(&mut cache.onepass, input, caps.slots_mut()) } else if let Some(ref e) = self.backtrack.get(input) { trace!( "using BoundedBacktracker for search at {:?}", input.get_span() ); e.search_slots(&mut cache.backtrack, input, caps.slots_mut()) } else { trace!("using PikeVM for search at {:?}", input.get_span()); let e = self.pikevm.get(); e.search_slots(&mut cache.pikevm, input, caps.slots_mut()) }; caps.set_pattern(pid); caps.get_match() } fn search_half_nofail( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option { // Only the lazy/full DFA returns half-matches, since the DFA requires // a reverse scan to find the start position. These fallback regex // engines can find the start and end in a single pass, so we just do // that and throw away the start offset to conform to the API. let m = self.search_nofail(cache, input)?; Some(HalfMatch::new(m.pattern(), m.end())) } fn search_slots_nofail( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { if let Some(ref e) = self.onepass.get(input) { trace!( "using OnePass for capture search at {:?}", input.get_span() ); e.search_slots(&mut cache.onepass, input, slots) } else if let Some(ref e) = self.backtrack.get(input) { trace!( "using BoundedBacktracker for capture search at {:?}", input.get_span() ); e.search_slots(&mut cache.backtrack, input, slots) } else { trace!( "using PikeVM for capture search at {:?}", input.get_span() ); let e = self.pikevm.get(); e.search_slots(&mut cache.pikevm, input, slots) } } fn is_match_nofail(&self, cache: &mut Cache, input: &Input<'_>) -> bool { if let Some(ref e) = self.onepass.get(input) { trace!( "using OnePass for is-match search at {:?}", input.get_span() ); e.search_slots(&mut cache.onepass, input, &mut []).is_some() } else if let Some(ref e) = self.backtrack.get(input) { trace!( "using BoundedBacktracker for is-match 
search at {:?}", input.get_span() ); e.is_match(&mut cache.backtrack, input) } else { trace!( "using PikeVM for is-match search at {:?}", input.get_span() ); let e = self.pikevm.get(); e.is_match(&mut cache.pikevm, input) } } fn is_capture_search_needed(&self, slots_len: usize) -> bool { slots_len > self.nfa.group_info().implicit_slot_len() } } impl Strategy for Core { #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { self.nfa.group_info() } #[cfg_attr(feature = "perf-inline", inline(always))] fn create_cache(&self) -> Cache { Cache { capmatches: Captures::all(self.group_info().clone()), pikevm: self.pikevm.create_cache(), backtrack: self.backtrack.create_cache(), onepass: self.onepass.create_cache(), hybrid: self.hybrid.create_cache(), revhybrid: wrappers::ReverseHybridCache::none(), } } #[cfg_attr(feature = "perf-inline", inline(always))] fn reset_cache(&self, cache: &mut Cache) { cache.pikevm.reset(&self.pikevm); cache.backtrack.reset(&self.backtrack); cache.onepass.reset(&self.onepass); cache.hybrid.reset(&self.hybrid); } fn is_accelerated(&self) -> bool { self.pre.as_ref().map_or(false, |pre| pre.is_fast()) } fn memory_usage(&self) -> usize { self.info.memory_usage() + self.pre.as_ref().map_or(0, |pre| pre.memory_usage()) + self.nfa.memory_usage() + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage()) + self.onepass.memory_usage() + self.dfa.memory_usage() } #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { // We manually inline try_search_mayfail here because letting the // compiler do it seems to produce pretty crappy codegen. 
return if let Some(e) = self.dfa.get(input) { trace!("using full DFA for full search at {:?}", input.get_span()); match e.try_search(input) { Ok(x) => x, Err(_err) => { trace!("full DFA search failed: {}", _err); self.search_nofail(cache, input) } } } else if let Some(e) = self.hybrid.get(input) { trace!("using lazy DFA for full search at {:?}", input.get_span()); match e.try_search(&mut cache.hybrid, input) { Ok(x) => x, Err(_err) => { trace!("lazy DFA search failed: {}", _err); self.search_nofail(cache, input) } } } else { self.search_nofail(cache, input) }; } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option { // The main difference with 'search' is that if we're using a DFA, we // can use a single forward scan without needing to run the reverse // DFA. if let Some(e) = self.dfa.get(input) { trace!("using full DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(input) { Ok(x) => x, Err(_err) => { trace!("full DFA half search failed: {}", _err); self.search_half_nofail(cache, input) } } } else if let Some(e) = self.hybrid.get(input) { trace!("using lazy DFA for half search at {:?}", input.get_span()); match e.try_search_half_fwd(&mut cache.hybrid, input) { Ok(x) => x, Err(_err) => { trace!("lazy DFA half search failed: {}", _err); self.search_half_nofail(cache, input) } } } else { self.search_half_nofail(cache, input) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { if let Some(e) = self.dfa.get(input) { trace!( "using full DFA for is-match search at {:?}", input.get_span() ); match e.try_search_half_fwd(input) { Ok(x) => x.is_some(), Err(_err) => { trace!("full DFA half search failed: {}", _err); self.is_match_nofail(cache, input) } } } else if let Some(e) = self.hybrid.get(input) { trace!( "using lazy DFA for is-match search at {:?}", input.get_span() ); match e.try_search_half_fwd(&mut 
cache.hybrid, input) { Ok(x) => x.is_some(), Err(_err) => { trace!("lazy DFA half search failed: {}", _err); self.is_match_nofail(cache, input) } } } else { self.is_match_nofail(cache, input) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { // Even if the regex has explicit capture groups, if the caller didn't // provide any explicit slots, then it doesn't make sense to try and do // extra work to get offsets for those slots. Ideally the caller should // realize this and not call this routine in the first place, but alas, // we try to save the caller from themselves if they do. if !self.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; copy_match_to_slots(m, slots); return Some(m.pattern()); } // If the onepass DFA is available for this search (which only happens // when it's anchored), then skip running a fallible DFA. The onepass // DFA isn't as fast as a full or lazy DFA, but it is typically quite // a bit faster than the backtracker or the PikeVM. So it isn't as // advantageous to try and do a full/lazy DFA scan first. // // We still theorize that it's better to do a full/lazy DFA scan, even // when it's anchored, because it's usually much faster and permits us // to say "no match" much more quickly. This does hurt the case of, // say, parsing each line in a log file into capture groups, because // in that case, the line always matches. So the lazy DFA scan is // usually just wasted work. But, the lazy DFA is usually quite fast // and doesn't cost too much here. 
if self.onepass.get(&input).is_some() { return self.search_slots_nofail(cache, &input, slots); } let m = match self.try_search_mayfail(cache, input) { Some(Ok(Some(m))) => m, Some(Ok(None)) => return None, Some(Err(_err)) => { trace!("fast capture search failed: {}", _err); return self.search_slots_nofail(cache, input, slots); } None => { return self.search_slots_nofail(cache, input, slots); } }; // At this point, now that we've found the bounds of the // match, we need to re-run something that can resolve // capturing groups. But we only need to run on it on the // match bounds and not the entire haystack. trace!( "match found at {}..{} in capture search, \ using another engine to find captures", m.start(), m.end(), ); let input = input .clone() .span(m.start()..m.end()) .anchored(Anchored::Pattern(m.pattern())); Some( self.search_slots_nofail(cache, &input, slots) .expect("should find a match"), ) } #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { if let Some(e) = self.dfa.get(input) { trace!( "using full DFA for overlapping search at {:?}", input.get_span() ); let _err = match e.try_which_overlapping_matches(input, patset) { Ok(()) => return, Err(err) => err, }; trace!("fast overlapping search failed: {}", _err); } else if let Some(e) = self.hybrid.get(input) { trace!( "using lazy DFA for overlapping search at {:?}", input.get_span() ); let _err = match e.try_which_overlapping_matches( &mut cache.hybrid, input, patset, ) { Ok(()) => { return; } Err(err) => err, }; trace!("fast overlapping search failed: {}", _err); } trace!( "using PikeVM for overlapping search at {:?}", input.get_span() ); let e = self.pikevm.get(); e.which_overlapping_matches(&mut cache.pikevm, input, patset) } } #[derive(Debug)] struct ReverseAnchored { core: Core, } impl ReverseAnchored { fn new(core: Core) -> Result { if !core.info.is_always_anchored_end() { debug!( "skipping reverse 
anchored optimization because \ the regex is not always anchored at the end" ); return Err(core); } // Note that the caller can still request an anchored search even when // the regex isn't anchored at the start. We detect that case in the // search routines below and just fallback to the core engine. This // is fine because both searches are anchored. It's just a matter of // picking one. Falling back to the core engine is a little simpler, // since if we used the reverse anchored approach, we'd have to add an // extra check to ensure the match reported starts at the place where // the caller requested the search to start. if core.info.is_always_anchored_start() { debug!( "skipping reverse anchored optimization because \ the regex is also anchored at the start" ); return Err(core); } // Only DFAs can do reverse searches (currently), so we need one of // them in order to do this optimization. It's possible (although // pretty unlikely) that we have neither and need to give up. if !core.hybrid.is_some() && !core.dfa.is_some() { debug!( "skipping reverse anchored optimization because \ we don't have a lazy DFA or a full DFA" ); return Err(core); } Ok(ReverseAnchored { core }) } #[cfg_attr(feature = "perf-inline", inline(always))] fn try_search_half_anchored_rev( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result, RetryFailError> { // We of course always want an anchored search. In theory, the // underlying regex engines should automatically enable anchored // searches since the regex is itself anchored, but this more clearly // expresses intent and is always correct. 
let input = input.clone().anchored(Anchored::Yes); if let Some(e) = self.core.dfa.get(&input) { trace!( "using full DFA for reverse anchored search at {:?}", input.get_span() ); e.try_search_half_rev(&input) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( "using lazy DFA for reverse anchored search at {:?}", input.get_span() ); e.try_search_half_rev(&mut cache.hybrid, &input) } else { unreachable!("ReverseAnchored always has a DFA") } } } // Note that in this impl, we don't check that 'input.end() == // input.haystack().len()'. In particular, when that condition is false, a // match is always impossible because we know that the regex is always anchored // at the end (or else 'ReverseAnchored' won't be built). We don't check that // here because the 'Regex' wrapper actually does that for us in all cases. // Thus, in this impl, we can actually assume that the end position in 'input' // is equivalent to the length of the haystack. impl Strategy for ReverseAnchored { #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { self.core.group_info() } #[cfg_attr(feature = "perf-inline", inline(always))] fn create_cache(&self) -> Cache { self.core.create_cache() } #[cfg_attr(feature = "perf-inline", inline(always))] fn reset_cache(&self, cache: &mut Cache) { self.core.reset_cache(cache); } fn is_accelerated(&self) -> bool { // Since this is anchored at the end, a reverse anchored search is // almost certainly guaranteed to result in a much faster search than // a standard forward search. 
true } fn memory_usage(&self) -> usize { self.core.memory_usage() } #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { if input.get_anchored().is_anchored() { return self.core.search(cache, input); } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); self.core.search_nofail(cache, input) } Ok(None) => None, Ok(Some(hm)) => { Some(Match::new(hm.pattern(), hm.offset()..input.end())) } } } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option { if input.get_anchored().is_anchored() { return self.core.search_half(cache, input); } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); self.core.search_half_nofail(cache, input) } Ok(None) => None, Ok(Some(hm)) => { // Careful here! 'try_search_half' is a *forward* search that // only cares about the *end* position of a match. But // 'hm.offset()' is actually the start of the match. So we // actually just throw that away here and, since we know we // have a match, return the only possible position at which a // match can occur: input.end(). 
Some(HalfMatch::new(hm.pattern(), input.end())) } } } #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { if input.get_anchored().is_anchored() { return self.core.is_match(cache, input); } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); self.core.is_match_nofail(cache, input) } Ok(None) => false, Ok(Some(_)) => true, } } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { if input.get_anchored().is_anchored() { return self.core.search_slots(cache, input, slots); } match self.try_search_half_anchored_rev(cache, input) { Err(_err) => { trace!("fast reverse anchored search failed: {}", _err); self.core.search_slots_nofail(cache, input, slots) } Ok(None) => None, Ok(Some(hm)) => { if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, skipping captures"); let m = Match::new(hm.pattern(), hm.offset()..input.end()); copy_match_to_slots(m, slots); return Some(m.pattern()); } let start = hm.offset(); let input = input .clone() .span(start..input.end()) .anchored(Anchored::Pattern(hm.pattern())); self.core.search_slots_nofail(cache, &input, slots) } } } #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { // It seems like this could probably benefit from a reverse anchored // optimization, perhaps by doing an overlapping reverse search (which // the DFAs do support). I haven't given it much thought though, and // I'm currently focused more on the single pattern case.
self.core.which_overlapping_matches(cache, input, patset) } } #[derive(Debug)] struct ReverseSuffix { core: Core, pre: Prefilter, } impl ReverseSuffix { fn new(core: Core, hirs: &[&Hir]) -> Result { if !core.info.config().get_auto_prefilter() { debug!( "skipping reverse suffix optimization because \ automatic prefilters are disabled" ); return Err(core); } // Like the reverse inner optimization, we don't do this for regexes // that are always anchored. It could lead to scanning too much, but // could say "no match" much more quickly than running the regex // engine if the initial literal scan doesn't match. With that said, // the reverse suffix optimization has lower overhead, since it only // requires a reverse scan after a literal match to confirm or reject // the match. (Although, in the case of confirmation, it then needs to // do another forward scan to find the end position.) // // Note that the caller can still request an anchored search even // when the regex isn't anchored. We detect that case in the search // routines below and just fallback to the core engine. Currently this // optimization assumes all searches are unanchored, so if we do want // to enable this optimization for anchored searches, it will need a // little work to support it. if core.info.is_always_anchored_start() { debug!( "skipping reverse suffix optimization because \ the regex is always anchored at the start", ); return Err(core); } // Only DFAs can do reverse searches (currently), so we need one of // them in order to do this optimization. It's possible (although // pretty unlikely) that we have neither and need to give up. 
if !core.hybrid.is_some() && !core.dfa.is_some() { debug!( "skipping reverse suffix optimization because \ we don't have a lazy DFA or a full DFA" ); return Err(core); } if core.pre.as_ref().map_or(false, |p| p.is_fast()) { debug!( "skipping reverse suffix optimization because \ we already have a prefilter that we think is fast" ); return Err(core); } let kind = core.info.config().get_match_kind(); let suffixes = crate::util::prefilter::suffixes(kind, hirs); let lcs = match suffixes.longest_common_suffix() { None => { debug!( "skipping reverse suffix optimization because \ a longest common suffix could not be found", ); return Err(core); } Some(lcs) if lcs.is_empty() => { debug!( "skipping reverse suffix optimization because \ the longest common suffix is the empty string", ); return Err(core); } Some(lcs) => lcs, }; let pre = match Prefilter::new(kind, &[lcs]) { Some(pre) => pre, None => { debug!( "skipping reverse suffix optimization because \ a prefilter could not be constructed from the \ longest common suffix", ); return Err(core); } }; if !pre.is_fast() { debug!( "skipping reverse suffix optimization because \ while we have a suffix prefilter, it is not \ believed to be 'fast'" ); return Err(core); } Ok(ReverseSuffix { core, pre }) } #[cfg_attr(feature = "perf-inline", inline(always))] fn try_search_half_start( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result, RetryError> { let mut span = input.get_span(); let mut min_start = 0; loop { let litmatch = match self.pre.find(input.haystack(), span) { None => return Ok(None), Some(span) => span, }; trace!("reverse suffix scan found suffix match at {:?}", litmatch); let revinput = input .clone() .anchored(Anchored::Yes) .span(input.start()..litmatch.end); match self .try_search_half_rev_limited(cache, &revinput, min_start)? 
{ None => { if span.start >= span.end { break; } span.start = litmatch.start.checked_add(1).unwrap(); } Some(hm) => return Ok(Some(hm)), } min_start = litmatch.end; } Ok(None) } #[cfg_attr(feature = "perf-inline", inline(always))] fn try_search_half_fwd( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result, RetryFailError> { if let Some(e) = self.core.dfa.get(&input) { trace!( "using full DFA for forward reverse suffix search at {:?}", input.get_span() ); e.try_search_half_fwd(&input) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( "using lazy DFA for forward reverse suffix search at {:?}", input.get_span() ); e.try_search_half_fwd(&mut cache.hybrid, &input) } else { unreachable!("ReverseSuffix always has a DFA") } } #[cfg_attr(feature = "perf-inline", inline(always))] fn try_search_half_rev_limited( &self, cache: &mut Cache, input: &Input<'_>, min_start: usize, ) -> Result, RetryError> { if let Some(e) = self.core.dfa.get(&input) { trace!( "using full DFA for reverse suffix search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, ); e.try_search_half_rev_limited(&input, min_start) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( "using lazy DFA for reverse suffix search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, ); e.try_search_half_rev_limited(&mut cache.hybrid, &input, min_start) } else { unreachable!("ReverseSuffix always has a DFA") } } } impl Strategy for ReverseSuffix { #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { self.core.group_info() } #[cfg_attr(feature = "perf-inline", inline(always))] fn create_cache(&self) -> Cache { self.core.create_cache() } #[cfg_attr(feature = "perf-inline", inline(always))] fn reset_cache(&self, cache: &mut Cache) { self.core.reset_cache(cache); } fn is_accelerated(&self) -> bool { self.pre.is_fast() } fn memory_usage(&self) -> usize { 
self.core.memory_usage() + self.pre.memory_usage() } #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { if input.get_anchored().is_anchored() { return self.core.search(cache, input); } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix optimization failed: {}", _err); self.core.search(cache, input) } Err(RetryError::Fail(_err)) => { trace!("reverse suffix reverse fast search failed: {}", _err); self.core.search_nofail(cache, input) } Ok(None) => None, Ok(Some(hm_start)) => { let fwdinput = input .clone() .anchored(Anchored::Pattern(hm_start.pattern())) .span(hm_start.offset()..input.end()); match self.try_search_half_fwd(cache, &fwdinput) { Err(_err) => { trace!( "reverse suffix forward fast search failed: {}", _err ); self.core.search_nofail(cache, input) } Ok(None) => { unreachable!( "suffix match plus reverse match implies \ there must be a match", ) } Ok(Some(hm_end)) => Some(Match::new( hm_start.pattern(), hm_start.offset()..hm_end.offset(), )), } } } } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option { if input.get_anchored().is_anchored() { return self.core.search_half(cache, input); } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix half optimization failed: {}", _err); self.core.search_half(cache, input) } Err(RetryError::Fail(_err)) => { trace!( "reverse suffix reverse fast half search failed: {}", _err ); self.core.search_half_nofail(cache, input) } Ok(None) => None, Ok(Some(hm_start)) => { // This is a bit subtle. It is tempting to just stop searching // at this point and return a half-match with an offset // corresponding to where the suffix was found. But the suffix // match does not necessarily correspond to the end of the // proper leftmost-first match. 
Consider /[a-z]+ing/ against // 'tingling'. The first suffix match is the first 'ing', and // the /[a-z]+/ matches the 't'. So if we stopped here, then // we'd report 'ting' as the match. But 'tingling' is the // correct match because of greediness. let fwdinput = input .clone() .anchored(Anchored::Pattern(hm_start.pattern())) .span(hm_start.offset()..input.end()); match self.try_search_half_fwd(cache, &fwdinput) { Err(_err) => { trace!( "reverse suffix forward fast search failed: {}", _err ); self.core.search_half_nofail(cache, input) } Ok(None) => { unreachable!( "suffix match plus reverse match implies \ there must be a match", ) } Ok(Some(hm_end)) => Some(hm_end), } } } } #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { if input.get_anchored().is_anchored() { return self.core.is_match(cache, input); } match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse suffix half optimization failed: {}", _err); self.core.is_match_nofail(cache, input) } Err(RetryError::Fail(_err)) => { trace!( "reverse suffix reverse fast half search failed: {}", _err ); self.core.is_match_nofail(cache, input) } Ok(None) => false, Ok(Some(_)) => true, } } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { if input.get_anchored().is_anchored() { return self.core.search_slots(cache, input, slots); } if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; copy_match_to_slots(m, slots); return Some(m.pattern()); } let hm_start = match self.try_search_half_start(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!( "reverse suffix captures optimization failed: {}", _err ); return self.core.search_slots(cache, input, slots); } Err(RetryError::Fail(_err)) => { trace!( "reverse suffix 
reverse fast captures search failed: {}", _err ); return self.core.search_slots_nofail(cache, input, slots); } Ok(None) => return None, Ok(Some(hm_start)) => hm_start, }; trace!( "match found at {}..{} in capture search, \ using another engine to find captures", hm_start.offset(), input.end(), ); let start = hm_start.offset(); let input = input .clone() .span(start..input.end()) .anchored(Anchored::Pattern(hm_start.pattern())); self.core.search_slots_nofail(cache, &input, slots) } #[cfg_attr(feature = "perf-inline", inline(always))] fn which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { self.core.which_overlapping_matches(cache, input, patset) } } #[derive(Debug)] struct ReverseInner { core: Core, preinner: Prefilter, nfarev: NFA, hybrid: wrappers::ReverseHybrid, dfa: wrappers::ReverseDFA, } impl ReverseInner { fn new(core: Core, hirs: &[&Hir]) -> Result { if !core.info.config().get_auto_prefilter() { debug!( "skipping reverse inner optimization because \ automatic prefilters are disabled" ); return Err(core); } // Currently we hard-code the assumption of leftmost-first match // semantics. This isn't a huge deal because 'all' semantics tend to // only be used for forward overlapping searches with multiple regexes, // and this optimization only supports a single pattern at the moment. if core.info.config().get_match_kind() != MatchKind::LeftmostFirst { debug!( "skipping reverse inner optimization because \ match kind is {:?} but this only supports leftmost-first", core.info.config().get_match_kind(), ); return Err(core); } // It's likely that a reverse inner scan has too much overhead for it // to be worth it when the regex is anchored at the start. It is // possible for it to be quite a bit faster if the initial literal // scan fails to detect a match, in which case, we can say "no match" // very quickly. But this could be undesirable, e.g., scanning too far // or when the literal scan matches. 
If it matches, then confirming the // match requires a reverse scan followed by a forward scan to confirm // or reject, which is a fair bit of work. // // Note that the caller can still request an anchored search even // when the regex isn't anchored. We detect that case in the search // routines below and just fallback to the core engine. Currently this // optimization assumes all searches are unanchored, so if we do want // to enable this optimization for anchored searches, it will need a // little work to support it. if core.info.is_always_anchored_start() { debug!( "skipping reverse inner optimization because \ the regex is always anchored at the start", ); return Err(core); } // Only DFAs can do reverse searches (currently), so we need one of // them in order to do this optimization. It's possible (although // pretty unlikely) that we have neither and need to give up. if !core.hybrid.is_some() && !core.dfa.is_some() { debug!( "skipping reverse inner optimization because \ we don't have a lazy DFA or a full DFA" ); return Err(core); } if core.pre.as_ref().map_or(false, |p| p.is_fast()) { debug!( "skipping reverse inner optimization because \ we already have a prefilter that we think is fast" ); return Err(core); } else if core.pre.is_some() { debug!( "core engine has a prefix prefilter, but it is \ probably not fast, so continuing with attempt to \ use reverse inner prefilter" ); } let (concat_prefix, preinner) = match reverse_inner::extract(hirs) { Some(x) => x, // N.B. the 'extract' function emits debug messages explaining // why we bailed out here. 
None => return Err(core), }; debug!("building reverse NFA for prefix before inner literal"); let mut lookm = LookMatcher::new(); lookm.set_line_terminator(core.info.config().get_line_terminator()); let thompson_config = thompson::Config::new() .reverse(true) .utf8(core.info.config().get_utf8_empty()) .nfa_size_limit(core.info.config().get_nfa_size_limit()) .shrink(false) .which_captures(WhichCaptures::None) .look_matcher(lookm); let result = thompson::Compiler::new() .configure(thompson_config) .build_from_hir(&concat_prefix); let nfarev = match result { Ok(nfarev) => nfarev, Err(_err) => { debug!( "skipping reverse inner optimization because the \ reverse NFA failed to build: {}", _err, ); return Err(core); } }; debug!("building reverse DFA for prefix before inner literal"); let dfa = if !core.info.config().get_dfa() { wrappers::ReverseDFA::none() } else { wrappers::ReverseDFA::new(&core.info, &nfarev) }; let hybrid = if !core.info.config().get_hybrid() { wrappers::ReverseHybrid::none() } else if dfa.is_some() { debug!( "skipping lazy DFA for reverse inner optimization \ because we have a full DFA" ); wrappers::ReverseHybrid::none() } else { wrappers::ReverseHybrid::new(&core.info, &nfarev) }; Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa }) } #[cfg_attr(feature = "perf-inline", inline(always))] fn try_search_full( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result, RetryError> { let mut span = input.get_span(); let mut min_match_start = 0; let mut min_pre_start = 0; loop { let litmatch = match self.preinner.find(input.haystack(), span) { None => return Ok(None), Some(span) => span, }; if litmatch.start < min_pre_start { trace!( "found inner prefilter match at {:?}, which starts \ before the end of the last forward scan at {}, \ quitting to avoid quadratic behavior", litmatch, min_pre_start, ); return Err(RetryError::Quadratic(RetryQuadraticError::new())); } trace!("reverse inner scan found inner match at {:?}", litmatch); let revinput = input 
.clone() .anchored(Anchored::Yes) .span(input.start()..litmatch.start); // Note that in addition to the literal search above scanning past // our minimum start point, this routine can also return an error // as a result of detecting possible quadratic behavior if the // reverse scan goes past the minimum start point. That is, the // literal search might not, but the reverse regex search for the // prefix might! match self.try_search_half_rev_limited( cache, &revinput, min_match_start, )? { None => { if span.start >= span.end { break; } span.start = litmatch.start.checked_add(1).unwrap(); } Some(hm_start) => { let fwdinput = input .clone() .anchored(Anchored::Pattern(hm_start.pattern())) .span(hm_start.offset()..input.end()); match self.try_search_half_fwd_stopat(cache, &fwdinput)? { Err(stopat) => { min_pre_start = stopat; span.start = litmatch.start.checked_add(1).unwrap(); } Ok(hm_end) => { return Ok(Some(Match::new( hm_start.pattern(), hm_start.offset()..hm_end.offset(), ))) } } } } min_match_start = litmatch.end; } Ok(None) } #[cfg_attr(feature = "perf-inline", inline(always))] fn try_search_half_fwd_stopat( &self, cache: &mut Cache, input: &Input<'_>, ) -> Result, RetryFailError> { if let Some(e) = self.core.dfa.get(&input) { trace!( "using full DFA for forward reverse inner search at {:?}", input.get_span() ); e.try_search_half_fwd_stopat(&input) } else if let Some(e) = self.core.hybrid.get(&input) { trace!( "using lazy DFA for forward reverse inner search at {:?}", input.get_span() ); e.try_search_half_fwd_stopat(&mut cache.hybrid, &input) } else { unreachable!("ReverseInner always has a DFA") } } #[cfg_attr(feature = "perf-inline", inline(always))] fn try_search_half_rev_limited( &self, cache: &mut Cache, input: &Input<'_>, min_start: usize, ) -> Result, RetryError> { if let Some(e) = self.dfa.get(&input) { trace!( "using full DFA for reverse inner search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, ); 
e.try_search_half_rev_limited(&input, min_start) } else if let Some(e) = self.hybrid.get(&input) { trace!( "using lazy DFA for reverse inner search at {:?}, \ but will be stopped at {} to avoid quadratic behavior", input.get_span(), min_start, ); e.try_search_half_rev_limited( &mut cache.revhybrid, &input, min_start, ) } else { unreachable!("ReverseInner always has a DFA") } } } impl Strategy for ReverseInner { #[cfg_attr(feature = "perf-inline", inline(always))] fn group_info(&self) -> &GroupInfo { self.core.group_info() } #[cfg_attr(feature = "perf-inline", inline(always))] fn create_cache(&self) -> Cache { let mut cache = self.core.create_cache(); cache.revhybrid = self.hybrid.create_cache(); cache } #[cfg_attr(feature = "perf-inline", inline(always))] fn reset_cache(&self, cache: &mut Cache) { self.core.reset_cache(cache); cache.revhybrid.reset(&self.hybrid); } fn is_accelerated(&self) -> bool { self.preinner.is_fast() } fn memory_usage(&self) -> usize { self.core.memory_usage() + self.preinner.memory_usage() + self.nfarev.memory_usage() + self.dfa.memory_usage() } #[cfg_attr(feature = "perf-inline", inline(always))] fn search(&self, cache: &mut Cache, input: &Input<'_>) -> Option { if input.get_anchored().is_anchored() { return self.core.search(cache, input); } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner optimization failed: {}", _err); self.core.search(cache, input) } Err(RetryError::Fail(_err)) => { trace!("reverse inner fast search failed: {}", _err); self.core.search_nofail(cache, input) } Ok(matornot) => matornot, } } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_half( &self, cache: &mut Cache, input: &Input<'_>, ) -> Option { if input.get_anchored().is_anchored() { return self.core.search_half(cache, input); } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner half optimization failed: {}", _err); self.core.search_half(cache, 
input) } Err(RetryError::Fail(_err)) => { trace!("reverse inner fast half search failed: {}", _err); self.core.search_half_nofail(cache, input) } Ok(None) => None, Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())), } } #[cfg_attr(feature = "perf-inline", inline(always))] fn is_match(&self, cache: &mut Cache, input: &Input<'_>) -> bool { if input.get_anchored().is_anchored() { return self.core.is_match(cache, input); } match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner half optimization failed: {}", _err); self.core.is_match_nofail(cache, input) } Err(RetryError::Fail(_err)) => { trace!("reverse inner fast half search failed: {}", _err); self.core.is_match_nofail(cache, input) } Ok(None) => false, Ok(Some(_)) => true, } } #[cfg_attr(feature = "perf-inline", inline(always))] fn search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { if input.get_anchored().is_anchored() { return self.core.search_slots(cache, input, slots); } if !self.core.is_capture_search_needed(slots.len()) { trace!("asked for slots unnecessarily, trying fast path"); let m = self.search(cache, input)?; copy_match_to_slots(m, slots); return Some(m.pattern()); } let m = match self.try_search_full(cache, input) { Err(RetryError::Quadratic(_err)) => { trace!("reverse inner captures optimization failed: {}", _err); return self.core.search_slots(cache, input, slots); } Err(RetryError::Fail(_err)) => { trace!("reverse inner fast captures search failed: {}", _err); return self.core.search_slots_nofail(cache, input, slots); } Ok(None) => return None, Ok(Some(m)) => m, }; trace!( "match found at {}..{} in capture search, \ using another engine to find captures", m.start(), m.end(), ); let input = input .clone() .span(m.start()..m.end()) .anchored(Anchored::Pattern(m.pattern())); self.core.search_slots_nofail(cache, &input, slots) } #[cfg_attr(feature = "perf-inline", inline(always))] fn 
which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { self.core.which_overlapping_matches(cache, input, patset) } } /// Copies the offsets in the given match to the corresponding positions in /// `slots`. /// /// In effect, this sets the slots corresponding to the implicit group for the /// pattern in the given match. If the indices for the corresponding slots do /// not exist, then no slots are set. /// /// This is useful when the caller provides slots (or captures), but you use a /// regex engine that doesn't operate on slots (like a lazy DFA). This function /// lets you map the match you get back to the slots provided by the caller. #[cfg_attr(feature = "perf-inline", inline(always))] fn copy_match_to_slots(m: Match, slots: &mut [Option]) { let slot_start = m.pattern().as_usize() * 2; let slot_end = slot_start + 1; if let Some(slot) = slots.get_mut(slot_start) { *slot = NonMaxUsize::new(m.start()); } if let Some(slot) = slots.get_mut(slot_end) { *slot = NonMaxUsize::new(m.end()); } } regex-automata-0.4.9/src/meta/wrappers.rs000064400000000000000000001264301046102023000165150ustar 00000000000000/*! This module contains a boat load of wrappers around each of our internal regex engines. They encapsulate a few things: 1. The wrappers manage the conditional existence of the regex engine. Namely, the PikeVM is the only required regex engine. The rest are optional. These wrappers present a uniform API regardless of which engines are available. And availability might be determined by compile time features or by dynamic configuration via `meta::Config`. Encapsulating the conditional compilation features is in particular a huge simplification for the higher level code that composes these engines. 2. The wrappers manage construction of each engine, including skipping it if the engine is unavailable or configured to not be used. 3. The wrappers manage whether an engine *can* be used for a particular search configuration. 
For example, `BoundedBacktracker::get` only returns a backtracking engine when the haystack is no bigger than the maximum supported length. The wrappers also sometimes take a position on when an engine *ought* to be used, but only in cases where the logic is extremely local to the engine itself. Otherwise, things like "choose between the backtracker and the one-pass DFA" are managed by the higher level meta strategy code. There are also corresponding wrappers for the various `Cache` types for each regex engine that needs them. If an engine is unavailable or not used, then a cache for it will *not* actually be allocated. */ use alloc::vec::Vec; use crate::{ meta::{ error::{BuildError, RetryError, RetryFailError}, regex::RegexInfo, }, nfa::thompson::{pikevm, NFA}, util::{prefilter::Prefilter, primitives::NonMaxUsize}, HalfMatch, Input, Match, MatchKind, PatternID, PatternSet, }; #[cfg(feature = "dfa-build")] use crate::dfa; #[cfg(feature = "dfa-onepass")] use crate::dfa::onepass; #[cfg(feature = "hybrid")] use crate::hybrid; #[cfg(feature = "nfa-backtrack")] use crate::nfa::thompson::backtrack; #[derive(Debug)] pub(crate) struct PikeVM(PikeVMEngine); impl PikeVM { pub(crate) fn new( info: &RegexInfo, pre: Option, nfa: &NFA, ) -> Result { PikeVMEngine::new(info, pre, nfa).map(PikeVM) } pub(crate) fn create_cache(&self) -> PikeVMCache { PikeVMCache::new(self) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn get(&self) -> &PikeVMEngine { &self.0 } } #[derive(Debug)] pub(crate) struct PikeVMEngine(pikevm::PikeVM); impl PikeVMEngine { pub(crate) fn new( info: &RegexInfo, pre: Option, nfa: &NFA, ) -> Result { let pikevm_config = pikevm::Config::new() .match_kind(info.config().get_match_kind()) .prefilter(pre); let engine = pikevm::Builder::new() .configure(pikevm_config) .build_from_nfa(nfa.clone()) .map_err(BuildError::nfa)?; debug!("PikeVM built"); Ok(PikeVMEngine(engine)) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_match(
&self, cache: &mut PikeVMCache, input: &Input<'_>, ) -> bool { self.0.is_match(cache.0.as_mut().unwrap(), input.clone()) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, cache: &mut PikeVMCache, input: &Input<'_>, slots: &mut [Option], ) -> Option { self.0.search_slots(cache.0.as_mut().unwrap(), input, slots) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn which_overlapping_matches( &self, cache: &mut PikeVMCache, input: &Input<'_>, patset: &mut PatternSet, ) { self.0.which_overlapping_matches( cache.0.as_mut().unwrap(), input, patset, ) } } #[derive(Clone, Debug)] pub(crate) struct PikeVMCache(Option); impl PikeVMCache { pub(crate) fn none() -> PikeVMCache { PikeVMCache(None) } pub(crate) fn new(builder: &PikeVM) -> PikeVMCache { PikeVMCache(Some(builder.get().0.create_cache())) } pub(crate) fn reset(&mut self, builder: &PikeVM) { self.0.as_mut().unwrap().reset(&builder.get().0); } pub(crate) fn memory_usage(&self) -> usize { self.0.as_ref().map_or(0, |c| c.memory_usage()) } } #[derive(Debug)] pub(crate) struct BoundedBacktracker(Option); impl BoundedBacktracker { pub(crate) fn new( info: &RegexInfo, pre: Option, nfa: &NFA, ) -> Result { BoundedBacktrackerEngine::new(info, pre, nfa).map(BoundedBacktracker) } pub(crate) fn create_cache(&self) -> BoundedBacktrackerCache { BoundedBacktrackerCache::new(self) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn get( &self, input: &Input<'_>, ) -> Option<&BoundedBacktrackerEngine> { let engine = self.0.as_ref()?; // It is difficult to make the backtracker give up early if it is // guaranteed to eventually wind up in a match state. This is because // of the greedy nature of a backtracker: it just blindly mushes // forward. 
Every other regex engine is able to give up more quickly, // so even if the backtracker might be able to zip through faster than // (say) the PikeVM, we prefer the theoretical benefit that some other // engine might be able to scan much less of the haystack than the // backtracker. // // Now, if the haystack is really short already, then we allow the // backtracker to run. (This hasn't been litigated quantitatively with // benchmarks. Just a hunch.) if input.get_earliest() && input.haystack().len() > 128 { return None; } // If the backtracker is just going to return an error because the // haystack is too long, then obviously do not use it. if input.get_span().len() > engine.max_haystack_len() { return None; } Some(engine) } } #[derive(Debug)] pub(crate) struct BoundedBacktrackerEngine( #[cfg(feature = "nfa-backtrack")] backtrack::BoundedBacktracker, #[cfg(not(feature = "nfa-backtrack"))] (), ); impl BoundedBacktrackerEngine { pub(crate) fn new( info: &RegexInfo, pre: Option, nfa: &NFA, ) -> Result, BuildError> { #[cfg(feature = "nfa-backtrack")] { if !info.config().get_backtrack() || info.config().get_match_kind() != MatchKind::LeftmostFirst { return Ok(None); } let backtrack_config = backtrack::Config::new().prefilter(pre); let engine = backtrack::Builder::new() .configure(backtrack_config) .build_from_nfa(nfa.clone()) .map_err(BuildError::nfa)?; debug!( "BoundedBacktracker built (max haystack length: {:?})", engine.max_haystack_len() ); Ok(Some(BoundedBacktrackerEngine(engine))) } #[cfg(not(feature = "nfa-backtrack"))] { Ok(None) } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_match( &self, cache: &mut BoundedBacktrackerCache, input: &Input<'_>, ) -> bool { #[cfg(feature = "nfa-backtrack")] { // OK because we only permit access to this engine when we know // the haystack is short enough for the backtracker to run without // reporting an error. 
self.0 .try_is_match(cache.0.as_mut().unwrap(), input.clone()) .unwrap() } #[cfg(not(feature = "nfa-backtrack"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, cache: &mut BoundedBacktrackerCache, input: &Input<'_>, slots: &mut [Option], ) -> Option { #[cfg(feature = "nfa-backtrack")] { // OK because we only permit access to this engine when we know // the haystack is short enough for the backtracker to run without // reporting an error. self.0 .try_search_slots(cache.0.as_mut().unwrap(), input, slots) .unwrap() } #[cfg(not(feature = "nfa-backtrack"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] fn max_haystack_len(&self) -> usize { #[cfg(feature = "nfa-backtrack")] { self.0.max_haystack_len() } #[cfg(not(feature = "nfa-backtrack"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. 
unreachable!() } } } #[derive(Clone, Debug)] pub(crate) struct BoundedBacktrackerCache( #[cfg(feature = "nfa-backtrack")] Option, #[cfg(not(feature = "nfa-backtrack"))] (), ); impl BoundedBacktrackerCache { pub(crate) fn none() -> BoundedBacktrackerCache { #[cfg(feature = "nfa-backtrack")] { BoundedBacktrackerCache(None) } #[cfg(not(feature = "nfa-backtrack"))] { BoundedBacktrackerCache(()) } } pub(crate) fn new( builder: &BoundedBacktracker, ) -> BoundedBacktrackerCache { #[cfg(feature = "nfa-backtrack")] { BoundedBacktrackerCache( builder.0.as_ref().map(|e| e.0.create_cache()), ) } #[cfg(not(feature = "nfa-backtrack"))] { BoundedBacktrackerCache(()) } } pub(crate) fn reset(&mut self, builder: &BoundedBacktracker) { #[cfg(feature = "nfa-backtrack")] if let Some(ref e) = builder.0 { self.0.as_mut().unwrap().reset(&e.0); } } pub(crate) fn memory_usage(&self) -> usize { #[cfg(feature = "nfa-backtrack")] { self.0.as_ref().map_or(0, |c| c.memory_usage()) } #[cfg(not(feature = "nfa-backtrack"))] { 0 } } } #[derive(Debug)] pub(crate) struct OnePass(Option); impl OnePass { pub(crate) fn new(info: &RegexInfo, nfa: &NFA) -> OnePass { OnePass(OnePassEngine::new(info, nfa)) } pub(crate) fn create_cache(&self) -> OnePassCache { OnePassCache::new(self) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn get(&self, input: &Input<'_>) -> Option<&OnePassEngine> { let engine = self.0.as_ref()?; if !input.get_anchored().is_anchored() && !engine.get_nfa().is_always_start_anchored() { return None; } Some(engine) } pub(crate) fn memory_usage(&self) -> usize { self.0.as_ref().map_or(0, |e| e.memory_usage()) } } #[derive(Debug)] pub(crate) struct OnePassEngine( #[cfg(feature = "dfa-onepass")] onepass::DFA, #[cfg(not(feature = "dfa-onepass"))] (), ); impl OnePassEngine { pub(crate) fn new(info: &RegexInfo, nfa: &NFA) -> Option { #[cfg(feature = "dfa-onepass")] { if !info.config().get_onepass() { return None; } // In order to even attempt building a one-pass DFA, we 
require // that we either have at least one explicit capturing group or // there's a Unicode word boundary somewhere. If we don't have // either of these things, then the lazy DFA will almost certainly // be useable and be much faster. The only case where it might // not is if the lazy DFA isn't utilizing its cache effectively, // but in those cases, the underlying regex is almost certainly // not one-pass or is too big to fit within the current one-pass // implementation limits. if info.props_union().explicit_captures_len() == 0 && !info.props_union().look_set().contains_word_unicode() { debug!("not building OnePass because it isn't worth it"); return None; } let onepass_config = onepass::Config::new() .match_kind(info.config().get_match_kind()) // Like for the lazy DFA, we unconditionally enable this // because it doesn't cost much and makes the API more // flexible. .starts_for_each_pattern(true) .byte_classes(info.config().get_byte_classes()) .size_limit(info.config().get_onepass_size_limit()); let result = onepass::Builder::new() .configure(onepass_config) .build_from_nfa(nfa.clone()); let engine = match result { Ok(engine) => engine, Err(_err) => { debug!("OnePass failed to build: {}", _err); return None; } }; debug!("OnePass built, {} bytes", engine.memory_usage()); Some(OnePassEngine(engine)) } #[cfg(not(feature = "dfa-onepass"))] { None } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn search_slots( &self, cache: &mut OnePassCache, input: &Input<'_>, slots: &mut [Option], ) -> Option { #[cfg(feature = "dfa-onepass")] { // OK because we only permit getting a OnePassEngine when we know // the search is anchored and thus an error cannot occur. self.0 .try_search_slots(cache.0.as_mut().unwrap(), input, slots) .unwrap() } #[cfg(not(feature = "dfa-onepass"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. 
unreachable!() } } pub(crate) fn memory_usage(&self) -> usize { #[cfg(feature = "dfa-onepass")] { self.0.memory_usage() } #[cfg(not(feature = "dfa-onepass"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] fn get_nfa(&self) -> &NFA { #[cfg(feature = "dfa-onepass")] { self.0.get_nfa() } #[cfg(not(feature = "dfa-onepass"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } } #[derive(Clone, Debug)] pub(crate) struct OnePassCache( #[cfg(feature = "dfa-onepass")] Option, #[cfg(not(feature = "dfa-onepass"))] (), ); impl OnePassCache { pub(crate) fn none() -> OnePassCache { #[cfg(feature = "dfa-onepass")] { OnePassCache(None) } #[cfg(not(feature = "dfa-onepass"))] { OnePassCache(()) } } pub(crate) fn new(builder: &OnePass) -> OnePassCache { #[cfg(feature = "dfa-onepass")] { OnePassCache(builder.0.as_ref().map(|e| e.0.create_cache())) } #[cfg(not(feature = "dfa-onepass"))] { OnePassCache(()) } } pub(crate) fn reset(&mut self, builder: &OnePass) { #[cfg(feature = "dfa-onepass")] if let Some(ref e) = builder.0 { self.0.as_mut().unwrap().reset(&e.0); } } pub(crate) fn memory_usage(&self) -> usize { #[cfg(feature = "dfa-onepass")] { self.0.as_ref().map_or(0, |c| c.memory_usage()) } #[cfg(not(feature = "dfa-onepass"))] { 0 } } } #[derive(Debug)] pub(crate) struct Hybrid(Option); impl Hybrid { pub(crate) fn none() -> Hybrid { Hybrid(None) } pub(crate) fn new( info: &RegexInfo, pre: Option, nfa: &NFA, nfarev: &NFA, ) -> Hybrid { Hybrid(HybridEngine::new(info, pre, nfa, nfarev)) } pub(crate) fn create_cache(&self) -> HybridCache { HybridCache::new(self) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&HybridEngine> { let engine = self.0.as_ref()?; Some(engine) } pub(crate) fn is_some(&self) -> bool { 
self.0.is_some() } } #[derive(Debug)] pub(crate) struct HybridEngine( #[cfg(feature = "hybrid")] hybrid::regex::Regex, #[cfg(not(feature = "hybrid"))] (), ); impl HybridEngine { pub(crate) fn new( info: &RegexInfo, pre: Option, nfa: &NFA, nfarev: &NFA, ) -> Option { #[cfg(feature = "hybrid")] { if !info.config().get_hybrid() { return None; } let dfa_config = hybrid::dfa::Config::new() .match_kind(info.config().get_match_kind()) .prefilter(pre.clone()) // Enabling this is necessary for ensuring we can service any // kind of 'Input' search without error. For the lazy DFA, // this is not particularly costly, since the start states are // generated lazily. .starts_for_each_pattern(true) .byte_classes(info.config().get_byte_classes()) .unicode_word_boundary(true) .specialize_start_states(pre.is_some()) .cache_capacity(info.config().get_hybrid_cache_capacity()) // This makes it possible for building a lazy DFA to // fail even though the NFA has already been built. Namely, // if the cache capacity is too small to fit some minimum // number of states (which is small, like 4 or 5), then the // DFA will refuse to build. // // We shouldn't enable this to make building always work, since // this could cause the allocation of a cache bigger than the // provided capacity amount. // // This is effectively the only reason why building a lazy DFA // could fail. If it does, then we simply suppress the error // and return None. .skip_cache_capacity_check(false) // This and enabling heuristic Unicode word boundary support // above make it so the lazy DFA can quit at match time. 
.minimum_cache_clear_count(Some(3)) .minimum_bytes_per_state(Some(10)); let result = hybrid::dfa::Builder::new() .configure(dfa_config.clone()) .build_from_nfa(nfa.clone()); let fwd = match result { Ok(fwd) => fwd, Err(_err) => { debug!("forward lazy DFA failed to build: {}", _err); return None; } }; let result = hybrid::dfa::Builder::new() .configure( dfa_config .clone() .match_kind(MatchKind::All) .prefilter(None) .specialize_start_states(false), ) .build_from_nfa(nfarev.clone()); let rev = match result { Ok(rev) => rev, Err(_err) => { debug!("reverse lazy DFA failed to build: {}", _err); return None; } }; let engine = hybrid::regex::Builder::new().build_from_dfas(fwd, rev); debug!("lazy DFA built"); Some(HybridEngine(engine)) } #[cfg(not(feature = "hybrid"))] { None } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search( &self, cache: &mut HybridCache, input: &Input<'_>, ) -> Result, RetryFailError> { #[cfg(feature = "hybrid")] { let cache = cache.0.as_mut().unwrap(); self.0.try_search(cache, input).map_err(|e| e.into()) } #[cfg(not(feature = "hybrid"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_fwd( &self, cache: &mut HybridCache, input: &Input<'_>, ) -> Result, RetryFailError> { #[cfg(feature = "hybrid")] { let fwd = self.0.forward(); let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0; fwd.try_search_fwd(&mut fwdcache, input).map_err(|e| e.into()) } #[cfg(not(feature = "hybrid"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. 
unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_fwd_stopat( &self, cache: &mut HybridCache, input: &Input<'_>, ) -> Result, RetryFailError> { #[cfg(feature = "hybrid")] { let dfa = self.0.forward(); let mut cache = cache.0.as_mut().unwrap().as_parts_mut().0; crate::meta::stopat::hybrid_try_search_half_fwd( dfa, &mut cache, input, ) } #[cfg(not(feature = "hybrid"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_rev( &self, cache: &mut HybridCache, input: &Input<'_>, ) -> Result, RetryFailError> { #[cfg(feature = "hybrid")] { let rev = self.0.reverse(); let mut revcache = cache.0.as_mut().unwrap().as_parts_mut().1; rev.try_search_rev(&mut revcache, input).map_err(|e| e.into()) } #[cfg(not(feature = "hybrid"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_rev_limited( &self, cache: &mut HybridCache, input: &Input<'_>, min_start: usize, ) -> Result, RetryError> { #[cfg(feature = "hybrid")] { let dfa = self.0.reverse(); let mut cache = cache.0.as_mut().unwrap().as_parts_mut().1; crate::meta::limited::hybrid_try_search_half_rev( dfa, &mut cache, input, min_start, ) } #[cfg(not(feature = "hybrid"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. 
unreachable!() } } #[inline] pub(crate) fn try_which_overlapping_matches( &self, cache: &mut HybridCache, input: &Input<'_>, patset: &mut PatternSet, ) -> Result<(), RetryFailError> { #[cfg(feature = "hybrid")] { let fwd = self.0.forward(); let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0; fwd.try_which_overlapping_matches(&mut fwdcache, input, patset) .map_err(|e| e.into()) } #[cfg(not(feature = "hybrid"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } } #[derive(Clone, Debug)] pub(crate) struct HybridCache( #[cfg(feature = "hybrid")] Option, #[cfg(not(feature = "hybrid"))] (), ); impl HybridCache { pub(crate) fn none() -> HybridCache { #[cfg(feature = "hybrid")] { HybridCache(None) } #[cfg(not(feature = "hybrid"))] { HybridCache(()) } } pub(crate) fn new(builder: &Hybrid) -> HybridCache { #[cfg(feature = "hybrid")] { HybridCache(builder.0.as_ref().map(|e| e.0.create_cache())) } #[cfg(not(feature = "hybrid"))] { HybridCache(()) } } pub(crate) fn reset(&mut self, builder: &Hybrid) { #[cfg(feature = "hybrid")] if let Some(ref e) = builder.0 { self.0.as_mut().unwrap().reset(&e.0); } } pub(crate) fn memory_usage(&self) -> usize { #[cfg(feature = "hybrid")] { self.0.as_ref().map_or(0, |c| c.memory_usage()) } #[cfg(not(feature = "hybrid"))] { 0 } } } #[derive(Debug)] pub(crate) struct DFA(Option); impl DFA { pub(crate) fn none() -> DFA { DFA(None) } pub(crate) fn new( info: &RegexInfo, pre: Option, nfa: &NFA, nfarev: &NFA, ) -> DFA { DFA(DFAEngine::new(info, pre, nfa, nfarev)) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&DFAEngine> { let engine = self.0.as_ref()?; Some(engine) } pub(crate) fn is_some(&self) -> bool { self.0.is_some() } pub(crate) fn memory_usage(&self) -> usize { self.0.as_ref().map_or(0, |e| e.memory_usage()) } } #[derive(Debug)] pub(crate) struct DFAEngine( #[cfg(feature = "dfa-build")] 
dfa::regex::Regex, #[cfg(not(feature = "dfa-build"))] (), ); impl DFAEngine { pub(crate) fn new( info: &RegexInfo, pre: Option, nfa: &NFA, nfarev: &NFA, ) -> Option { #[cfg(feature = "dfa-build")] { if !info.config().get_dfa() { return None; } // If our NFA is anything but small, don't even bother with a DFA. if let Some(state_limit) = info.config().get_dfa_state_limit() { if nfa.states().len() > state_limit { debug!( "skipping full DFA because NFA has {} states, \ which exceeds the heuristic limit of {}", nfa.states().len(), state_limit, ); return None; } } // We cut the size limit in four because the total heap used by // DFA construction is determinization aux memory and the DFA // itself, and those things are configured independently in the // lower level DFA builder API. And then split that in two because // of forward and reverse DFAs. let size_limit = info.config().get_dfa_size_limit().map(|n| n / 4); let dfa_config = dfa::dense::Config::new() .match_kind(info.config().get_match_kind()) .prefilter(pre.clone()) // Enabling this is necessary for ensuring we can service any // kind of 'Input' search without error. For the full DFA, this // can be quite costly. But since we have such a small bound // on the size of the DFA, in practice, any multi-regexes are // probably going to blow the limit anyway. .starts_for_each_pattern(true) .byte_classes(info.config().get_byte_classes()) .unicode_word_boundary(true) .specialize_start_states(pre.is_some()) .determinize_size_limit(size_limit) .dfa_size_limit(size_limit); let result = dfa::dense::Builder::new() .configure(dfa_config.clone()) .build_from_nfa(&nfa); let fwd = match result { Ok(fwd) => fwd, Err(_err) => { debug!("forward full DFA failed to build: {}", _err); return None; } }; let result = dfa::dense::Builder::new() .configure( dfa_config .clone() // We never need unanchored reverse searches, so // there's no point in building it into the DFA, which // WILL take more space.
(This isn't done for the lazy // DFA because the DFA is, well, lazy. It doesn't pay // the cost for supporting unanchored searches unless // you actually do an unanchored search, which we // don't.) .start_kind(dfa::StartKind::Anchored) .match_kind(MatchKind::All) .prefilter(None) .specialize_start_states(false), ) .build_from_nfa(&nfarev); let rev = match result { Ok(rev) => rev, Err(_err) => { debug!("reverse full DFA failed to build: {}", _err); return None; } }; let engine = dfa::regex::Builder::new().build_from_dfas(fwd, rev); debug!( "fully compiled forward and reverse DFAs built, {} bytes", engine.forward().memory_usage() + engine.reverse().memory_usage(), ); Some(DFAEngine(engine)) } #[cfg(not(feature = "dfa-build"))] { None } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search( &self, input: &Input<'_>, ) -> Result, RetryFailError> { #[cfg(feature = "dfa-build")] { self.0.try_search(input).map_err(|e| e.into()) } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_fwd( &self, input: &Input<'_>, ) -> Result, RetryFailError> { #[cfg(feature = "dfa-build")] { use crate::dfa::Automaton; self.0.forward().try_search_fwd(input).map_err(|e| e.into()) } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_fwd_stopat( &self, input: &Input<'_>, ) -> Result, RetryFailError> { #[cfg(feature = "dfa-build")] { let dfa = self.0.forward(); crate::meta::stopat::dfa_try_search_half_fwd(dfa, input) } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. 
unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_rev( &self, input: &Input<'_>, ) -> Result, RetryFailError> { #[cfg(feature = "dfa-build")] { use crate::dfa::Automaton; self.0.reverse().try_search_rev(&input).map_err(|e| e.into()) } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_rev_limited( &self, input: &Input<'_>, min_start: usize, ) -> Result, RetryError> { #[cfg(feature = "dfa-build")] { let dfa = self.0.reverse(); crate::meta::limited::dfa_try_search_half_rev( dfa, input, min_start, ) } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } #[inline] pub(crate) fn try_which_overlapping_matches( &self, input: &Input<'_>, patset: &mut PatternSet, ) -> Result<(), RetryFailError> { #[cfg(feature = "dfa-build")] { use crate::dfa::Automaton; self.0 .forward() .try_which_overlapping_matches(input, patset) .map_err(|e| e.into()) } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } pub(crate) fn memory_usage(&self) -> usize { #[cfg(feature = "dfa-build")] { self.0.forward().memory_usage() + self.0.reverse().memory_usage() } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. 
unreachable!() } } } #[derive(Debug)] pub(crate) struct ReverseHybrid(Option); impl ReverseHybrid { pub(crate) fn none() -> ReverseHybrid { ReverseHybrid(None) } pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseHybrid { ReverseHybrid(ReverseHybridEngine::new(info, nfarev)) } pub(crate) fn create_cache(&self) -> ReverseHybridCache { ReverseHybridCache::new(self) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn get( &self, _input: &Input<'_>, ) -> Option<&ReverseHybridEngine> { let engine = self.0.as_ref()?; Some(engine) } } #[derive(Debug)] pub(crate) struct ReverseHybridEngine( #[cfg(feature = "hybrid")] hybrid::dfa::DFA, #[cfg(not(feature = "hybrid"))] (), ); impl ReverseHybridEngine { pub(crate) fn new( info: &RegexInfo, nfarev: &NFA, ) -> Option { #[cfg(feature = "hybrid")] { if !info.config().get_hybrid() { return None; } // Since we only use this for reverse searches, we can hard-code // a number of things like match semantics, prefilters, starts // for each pattern and so on. 
let dfa_config = hybrid::dfa::Config::new() .match_kind(MatchKind::All) .prefilter(None) .starts_for_each_pattern(false) .byte_classes(info.config().get_byte_classes()) .unicode_word_boundary(true) .specialize_start_states(false) .cache_capacity(info.config().get_hybrid_cache_capacity()) .skip_cache_capacity_check(false) .minimum_cache_clear_count(Some(3)) .minimum_bytes_per_state(Some(10)); let result = hybrid::dfa::Builder::new() .configure(dfa_config) .build_from_nfa(nfarev.clone()); let rev = match result { Ok(rev) => rev, Err(_err) => { debug!("lazy reverse DFA failed to build: {}", _err); return None; } }; debug!("lazy reverse DFA built"); Some(ReverseHybridEngine(rev)) } #[cfg(not(feature = "hybrid"))] { None } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_rev_limited( &self, cache: &mut ReverseHybridCache, input: &Input<'_>, min_start: usize, ) -> Result, RetryError> { #[cfg(feature = "hybrid")] { let dfa = &self.0; let mut cache = cache.0.as_mut().unwrap(); crate::meta::limited::hybrid_try_search_half_rev( dfa, &mut cache, input, min_start, ) } #[cfg(not(feature = "hybrid"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. 
unreachable!() } } } #[derive(Clone, Debug)] pub(crate) struct ReverseHybridCache( #[cfg(feature = "hybrid")] Option, #[cfg(not(feature = "hybrid"))] (), ); impl ReverseHybridCache { pub(crate) fn none() -> ReverseHybridCache { #[cfg(feature = "hybrid")] { ReverseHybridCache(None) } #[cfg(not(feature = "hybrid"))] { ReverseHybridCache(()) } } pub(crate) fn new(builder: &ReverseHybrid) -> ReverseHybridCache { #[cfg(feature = "hybrid")] { ReverseHybridCache(builder.0.as_ref().map(|e| e.0.create_cache())) } #[cfg(not(feature = "hybrid"))] { ReverseHybridCache(()) } } pub(crate) fn reset(&mut self, builder: &ReverseHybrid) { #[cfg(feature = "hybrid")] if let Some(ref e) = builder.0 { self.0.as_mut().unwrap().reset(&e.0); } } pub(crate) fn memory_usage(&self) -> usize { #[cfg(feature = "hybrid")] { self.0.as_ref().map_or(0, |c| c.memory_usage()) } #[cfg(not(feature = "hybrid"))] { 0 } } } #[derive(Debug)] pub(crate) struct ReverseDFA(Option); impl ReverseDFA { pub(crate) fn none() -> ReverseDFA { ReverseDFA(None) } pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseDFA { ReverseDFA(ReverseDFAEngine::new(info, nfarev)) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn get(&self, _input: &Input<'_>) -> Option<&ReverseDFAEngine> { let engine = self.0.as_ref()?; Some(engine) } pub(crate) fn is_some(&self) -> bool { self.0.is_some() } pub(crate) fn memory_usage(&self) -> usize { self.0.as_ref().map_or(0, |e| e.memory_usage()) } } #[derive(Debug)] pub(crate) struct ReverseDFAEngine( #[cfg(feature = "dfa-build")] dfa::dense::DFA>, #[cfg(not(feature = "dfa-build"))] (), ); impl ReverseDFAEngine { pub(crate) fn new( info: &RegexInfo, nfarev: &NFA, ) -> Option { #[cfg(feature = "dfa-build")] { if !info.config().get_dfa() { return None; } // If our NFA is anything but small, don't even bother with a DFA. 
if let Some(state_limit) = info.config().get_dfa_state_limit() { if nfarev.states().len() > state_limit { debug!( "skipping full reverse DFA because NFA has {} states, \ which exceeds the heuristic limit of {}", nfarev.states().len(), state_limit, ); return None; } } // We cut the size limit in two because the total heap used by DFA // construction is determinization aux memory and the DFA itself, // and those things are configured independently in the lower level // DFA builder API. let size_limit = info.config().get_dfa_size_limit().map(|n| n / 2); // Since we only use this for reverse searches, we can hard-code // a number of things like match semantics, prefilters, starts // for each pattern and so on. We also disable acceleration since // it's incompatible with limited searches (which is the only // operation we support for this kind of engine at the moment). let dfa_config = dfa::dense::Config::new() .match_kind(MatchKind::All) .prefilter(None) .accelerate(false) .start_kind(dfa::StartKind::Anchored) .starts_for_each_pattern(false) .byte_classes(info.config().get_byte_classes()) .unicode_word_boundary(true) .specialize_start_states(false) .determinize_size_limit(size_limit) .dfa_size_limit(size_limit); let result = dfa::dense::Builder::new() .configure(dfa_config) .build_from_nfa(&nfarev); let rev = match result { Ok(rev) => rev, Err(_err) => { debug!("full reverse DFA failed to build: {}", _err); return None; } }; debug!( "fully compiled reverse DFA built, {} bytes", rev.memory_usage() ); Some(ReverseDFAEngine(rev)) } #[cfg(not(feature = "dfa-build"))] { None } } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn try_search_half_rev_limited( &self, input: &Input<'_>, min_start: usize, ) -> Result, RetryError> { #[cfg(feature = "dfa-build")] { let dfa = &self.0; crate::meta::limited::dfa_try_search_half_rev( dfa, input, min_start, ) } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the 
requisite features aren't enabled. unreachable!() } } pub(crate) fn memory_usage(&self) -> usize { #[cfg(feature = "dfa-build")] { self.0.memory_usage() } #[cfg(not(feature = "dfa-build"))] { // Impossible to reach because this engine is never constructed // if the requisite features aren't enabled. unreachable!() } } } regex-automata-0.4.9/src/nfa/mod.rs000064400000000000000000000060071046102023000152440ustar 00000000000000/*! Provides non-deterministic finite automata (NFA) and regex engines that use them. While NFAs and DFAs (deterministic finite automata) have equivalent *theoretical* power, their usage in practice tends to result in different engineering trade offs. While this isn't meant to be a comprehensive treatment of the topic, here are a few key trade offs that are, at minimum, true for this crate: * NFAs tend to be represented sparsely whereas DFAs are represented densely. Sparse representations use less memory, but are slower to traverse. Conversely, dense representations use more memory, but are faster to traverse. (Sometimes these lines are blurred. For example, an `NFA` might choose to represent a particular state in a dense fashion, and a DFA can be built using a sparse representation via [`sparse::DFA`](crate::dfa::sparse::DFA).) * NFAs have epsilon transitions and DFAs don't. In practice, this means that handling a single byte in a haystack with an NFA at search time may require visiting multiple NFA states. In a DFA, each byte only requires visiting a single state. Stated differently, NFAs require a variable number of CPU instructions to process one byte in a haystack whereas a DFA uses a constant number of CPU instructions to process one byte. * NFAs are generally easier to amend with secondary storage. For example, the [`thompson::pikevm::PikeVM`] uses an NFA to match, but also uses additional memory beyond the model of a finite state machine to track offsets for matching capturing groups.
Conversely, the most a DFA can do is report the offset (and pattern ID) at which a match occurred. This is generally why we also compile DFAs in reverse, so that we can run them after finding the end of a match to also find the start of a match. * NFAs take worst case linear time to build, but DFAs take worst case exponential time to build. The [hybrid NFA/DFA](crate::hybrid) mitigates this challenge for DFAs in many practical cases. There are likely other differences, but the bottom line is that NFAs tend to be more memory efficient and give easier opportunities for increasing expressive power, whereas DFAs are faster to search with. # Why only a Thompson NFA? Currently, the only kind of NFA we support in this crate is a [Thompson NFA](https://en.wikipedia.org/wiki/Thompson%27s_construction). This refers to a specific construction algorithm that takes the syntax of a regex pattern and converts it to an NFA. Specifically, it makes gratuitous use of epsilon transitions in order to keep its structure simple. In exchange, its construction time is linear in the size of the regex. A Thompson NFA also makes the guarantee that given any state and a character in a haystack, there is at most one transition defined for it. (Although there may be many epsilon transitions.) It is possible that other types of NFAs will be added in the future, such as a [Glushkov NFA](https://en.wikipedia.org/wiki/Glushkov%27s_construction_algorithm). But currently, this crate only provides a Thompson NFA. */ #[cfg(feature = "nfa-thompson")] pub mod thompson; regex-automata-0.4.9/src/nfa/thompson/backtrack.rs000064400000000000000000002216501046102023000202640ustar 00000000000000/*! An NFA backed bounded backtracker for executing regex searches with capturing groups. This module provides a [`BoundedBacktracker`] that works by simulating an NFA using the classical backtracking algorithm with a twist: it avoids redoing work that it has done before and thereby avoids worst case exponential time.
In exchange, it can only be used on "short" haystacks. Its advantage is that it can be faster than the [`PikeVM`](thompson::pikevm::PikeVM) in many cases because it does less book-keeping. */ use alloc::{vec, vec::Vec}; use crate::{ nfa::thompson::{self, BuildError, State, NFA}, util::{ captures::Captures, empty, iter, prefilter::Prefilter, primitives::{NonMaxUsize, PatternID, SmallIndex, StateID}, search::{Anchored, HalfMatch, Input, Match, MatchError, Span}, }, }; /// Returns the minimum visited capacity for the given haystack. /// /// This function can be used as the argument to [`Config::visited_capacity`] /// in order to guarantee that a backtracking search for the given `input` /// won't return an error when using a [`BoundedBacktracker`] built from the /// given `NFA`. /// /// This routine exists primarily as a way to test that the bounded backtracker /// works correctly when its capacity is set to the smallest possible amount. /// Still, it may be useful in cases where you know you want to use the bounded /// backtracker for a specific input, and just need to know what visited /// capacity to provide to make it work. /// /// Be warned that this number could be quite large as it is multiplicative in /// the size of the given NFA and haystack. pub fn min_visited_capacity(nfa: &NFA, input: &Input<'_>) -> usize { div_ceil(nfa.states().len() * (input.get_span().len() + 1), 8) } /// The configuration used for building a bounded backtracker. /// /// A bounded backtracker configuration is a simple data object that is /// typically used with [`Builder::configure`]. #[derive(Clone, Debug, Default)] pub struct Config { pre: Option>, visited_capacity: Option, } impl Config { /// Return a new default regex configuration. pub fn new() -> Config { Config::default() } /// Set a prefilter to be used whenever a start state is entered.
/// /// A [`Prefilter`] in this context is meant to accelerate searches by /// looking for literal prefixes that every match for the corresponding /// pattern (or patterns) must start with. Once a prefilter produces a /// match, the underlying search routine continues on to try and confirm /// the match. /// /// Be warned that setting a prefilter does not guarantee that the search /// will be faster. While it's usually a good bet, if the prefilter /// produces a lot of false positive candidates (i.e., positions matched /// by the prefilter but not by the regex), then the overall result can /// be slower than if you had just executed the regex engine without any /// prefilters. /// /// By default no prefilter is set. /// /// # Example /// /// ``` /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// util::prefilter::Prefilter, /// Input, Match, MatchKind, /// }; /// /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); /// let re = BoundedBacktracker::builder() /// .configure(BoundedBacktracker::config().prefilter(pre)) /// .build(r"(foo|bar)[a-z]+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("foo1 barfox bar"); /// assert_eq!( /// Some(Match::must(0, 5..11)), /// re.try_find(&mut cache, input)?, /// ); /// /// # Ok::<(), Box>(()) /// ``` /// /// Be warned though that an incorrect prefilter can lead to incorrect /// results! /// /// ``` /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// util::prefilter::Prefilter, /// Input, HalfMatch, MatchKind, /// }; /// /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); /// let re = BoundedBacktracker::builder() /// .configure(BoundedBacktracker::config().prefilter(pre)) /// .build(r"(foo|bar)[a-z]+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("foo1 barfox bar"); /// // No match reported even though there clearly is one! 
/// assert_eq!(None, re.try_find(&mut cache, input)?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn prefilter(mut self, pre: Option<Prefilter>) -> Config {
        // Wrapping in `Some` records that the caller explicitly chose a
        // value, even when that value is `None`.
        self.pre = Some(pre);
        self
    }

    /// Set the visited capacity used to bound backtracking.
    ///
    /// The visited capacity represents the amount of heap memory (in bytes) to
    /// allocate toward tracking which parts of the backtracking search have
    /// been done before. The heap memory needed for any particular search is
    /// proportional to `haystack.len() * nfa.states().len()`, which can be
    /// quite large. Therefore, the bounded backtracker is typically only able
    /// to run on shorter haystacks.
    ///
    /// For a given regex, increasing the visited capacity means that the
    /// maximum haystack length that can be searched is increased. The
    /// [`BoundedBacktracker::max_haystack_len`] method returns that maximum.
    ///
    /// The default capacity is a reasonable but empirically chosen size.
    ///
    /// # Example
    ///
    /// As with other regex engines, Unicode is what tends to make the bounded
    /// backtracker less useful by making the maximum haystack length quite
    /// small. If necessary, increasing the visited capacity using this routine
    /// will increase the maximum haystack length at the cost of using more
    /// memory.
    ///
    /// Note though that the specific maximum values here are not an API
    /// guarantee. The default visited capacity is subject to change and not
    /// covered by semver.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
    ///
    /// // Unicode inflates the size of the underlying NFA quite a bit, and
    /// // thus means that the backtracker can only handle smaller haystacks,
    /// // assuming that the visited capacity remains unchanged.
    /// let re = BoundedBacktracker::new(r"\w+")?;
    /// assert!(re.max_haystack_len() <= 7_000);
    /// // But we can increase the visited capacity to handle bigger haystacks!
/// let re = BoundedBacktracker::builder() /// .configure(BoundedBacktracker::config().visited_capacity(1<<20)) /// .build(r"\w+")?; /// assert!(re.max_haystack_len() >= 25_000); /// assert!(re.max_haystack_len() <= 28_000); /// # Ok::<(), Box>(()) /// ``` pub fn visited_capacity(mut self, capacity: usize) -> Config { self.visited_capacity = Some(capacity); self } /// Returns the prefilter set in this configuration, if one at all. pub fn get_prefilter(&self) -> Option<&Prefilter> { self.pre.as_ref().unwrap_or(&None).as_ref() } /// Returns the configured visited capacity. /// /// Note that the actual capacity used may be slightly bigger than the /// configured capacity. pub fn get_visited_capacity(&self) -> usize { const DEFAULT: usize = 256 * (1 << 10); // 256 KB self.visited_capacity.unwrap_or(DEFAULT) } /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. pub(crate) fn overwrite(&self, o: Config) -> Config { Config { pre: o.pre.or_else(|| self.pre.clone()), visited_capacity: o.visited_capacity.or(self.visited_capacity), } } } /// A builder for a bounded backtracker. /// /// This builder permits configuring options for the syntax of a pattern, the /// NFA construction and the `BoundedBacktracker` construction. This builder /// is different from a general purpose regex builder in that it permits fine /// grain configuration of the construction process. The trade off for this is /// complexity, and the possibility of setting a configuration that might not /// make sense. For example, there are two different UTF-8 modes: /// /// * [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) controls /// whether the pattern itself can contain sub-expressions that match invalid /// UTF-8. 
/// * [`thompson::Config::utf8`] controls how the regex iterators themselves /// advance the starting position of the next search when a match with zero /// length is found. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. /// /// # Example /// /// This example shows how to disable UTF-8 mode in the syntax and the regex /// itself. This is generally what you want for matching on arbitrary bytes. /// /// ``` /// use regex_automata::{ /// nfa::thompson::{self, backtrack::BoundedBacktracker}, /// util::syntax, /// Match, /// }; /// /// let re = BoundedBacktracker::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Ok(Match::must(0, 1..9))); /// let got = re.try_find_iter(&mut cache, haystack).next(); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 /// // on the syntax permits this. /// // /// // N.B. This example does not show the impact of /// // disabling UTF-8 mode on a BoundedBacktracker Config, since that /// // only impacts regexes that can produce matches of /// // length 0. /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap()?.range()]); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { config: Config, #[cfg(feature = "syntax")] thompson: thompson::Compiler, } impl Builder { /// Create a new BoundedBacktracker builder with its default configuration. pub fn new() -> Builder { Builder { config: Config::default(), #[cfg(feature = "syntax")] thompson: thompson::Compiler::new(), } } /// Build a `BoundedBacktracker` from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. 
#[cfg(feature = "syntax")] pub fn build( &self, pattern: &str, ) -> Result { self.build_many(&[pattern]) } /// Build a `BoundedBacktracker` from the given patterns. #[cfg(feature = "syntax")] pub fn build_many>( &self, patterns: &[P], ) -> Result { let nfa = self.thompson.build_many(patterns)?; self.build_from_nfa(nfa) } /// Build a `BoundedBacktracker` directly from its NFA. /// /// Note that when using this method, any configuration that applies to the /// construction of the NFA itself will of course be ignored, since the NFA /// given here is already built. pub fn build_from_nfa( &self, nfa: NFA, ) -> Result { nfa.look_set_any().available().map_err(BuildError::word)?; Ok(BoundedBacktracker { config: self.config.clone(), nfa }) } /// Apply the given `BoundedBacktracker` configuration options to this /// builder. pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a `BoundedBacktracker` /// directly from a pattern. #[cfg(feature = "syntax")] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). /// /// This permits setting things like if additional time should be spent /// shrinking the size of the NFA. /// /// These settings only apply when constructing a `BoundedBacktracker` /// directly from a pattern. #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } /// A backtracking regex engine that bounds its execution to avoid exponential /// blow-up. 
/// /// This regex engine only implements leftmost-first match semantics and /// only supports leftmost searches. It effectively does the same thing as a /// [`PikeVM`](thompson::pikevm::PikeVM), but typically does it faster because /// it doesn't have to worry about copying capturing group spans for most NFA /// states. Instead, the backtracker can maintain one set of captures (provided /// by the caller) and never needs to copy them. In exchange, the backtracker /// bounds itself to ensure it doesn't exhibit worst case exponential time. /// This results in the backtracker only being able to handle short haystacks /// given reasonable memory usage. /// /// # Searches may return an error! /// /// By design, this backtracking regex engine is bounded. This bound is /// implemented by not visiting any combination of NFA state ID and position /// in a haystack more than once. Thus, the total memory required to bound /// backtracking is proportional to `haystack.len() * nfa.states().len()`. /// This can obviously get quite large, since large haystacks aren't terribly /// uncommon. To avoid using exorbitant memory, the capacity is bounded by /// a fixed limit set via [`Config::visited_capacity`]. Thus, if the total /// capacity required for a particular regex and a haystack exceeds this /// capacity, then the search routine will return an error. /// /// Unlike other regex engines that may return an error at search time (like /// the DFA or the hybrid NFA/DFA), there is no way to guarantee that a bounded /// backtracker will work for every haystack. Therefore, this regex engine /// _only_ exposes fallible search routines to avoid the footgun of panicking /// when running a search on a haystack that is too big. /// /// If one wants to use the fallible search APIs without handling the /// error, the only way to guarantee an error won't occur from the /// haystack length is to ensure the haystack length does not exceed /// [`BoundedBacktracker::max_haystack_len`]. 
///
/// # Example: Unicode word boundaries
///
/// This example shows that the bounded backtracker implements Unicode word
/// boundaries correctly by default.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match};
///
/// let re = BoundedBacktracker::new(r"\b\w+\b")?;
/// let mut cache = re.create_cache();
///
/// let mut it = re.try_find_iter(&mut cache, "Шерлок Холмс");
/// assert_eq!(Some(Ok(Match::must(0, 0..12))), it.next());
/// assert_eq!(Some(Ok(Match::must(0, 13..23))), it.next());
/// assert_eq!(None, it.next());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: multiple regex patterns
///
/// The bounded backtracker supports searching for multiple patterns
/// simultaneously, just like other regex engines. Note though that because it
/// uses a backtracking strategy, this regex engine is unlikely to scale well
/// as more patterns are added. But then again, as more patterns are added, the
/// maximum haystack length allowed will also shorten (assuming the visited
/// capacity remains invariant).
/// /// ``` /// use regex_automata::{nfa::thompson::backtrack::BoundedBacktracker, Match}; /// /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?; /// let mut cache = re.create_cache(); /// /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux"); /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next()); /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next()); /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next()); /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next()); /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next()); /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct BoundedBacktracker { config: Config, nfa: NFA, } impl BoundedBacktracker { /// Parse the given regular expression using the default configuration and /// return the corresponding `BoundedBacktracker`. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// Match, /// }; /// /// let re = BoundedBacktracker::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Ok(Match::must(0, 3..14))), /// re.try_find_iter(&mut cache, "zzzfoo12345barzzz").next(), /// ); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result { BoundedBacktracker::builder().build(pattern) } /// Like `new`, but parses multiple patterns into a single "multi regex." /// This similarly uses the default regex configuration. 
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::backtrack::BoundedBacktracker,
    ///     Match,
    /// };
    ///
    /// let re = BoundedBacktracker::new_many(&["[a-z]+", "[0-9]+"])?;
    /// let mut cache = re.create_cache();
    ///
    /// let mut it = re.try_find_iter(&mut cache, "abc 1 foo 4567 0 quux");
    /// assert_eq!(Some(Ok(Match::must(0, 0..3))), it.next());
    /// assert_eq!(Some(Ok(Match::must(1, 4..5))), it.next());
    /// assert_eq!(Some(Ok(Match::must(0, 6..9))), it.next());
    /// assert_eq!(Some(Ok(Match::must(1, 10..14))), it.next());
    /// assert_eq!(Some(Ok(Match::must(1, 15..16))), it.next());
    /// assert_eq!(Some(Ok(Match::must(0, 17..21))), it.next());
    /// assert_eq!(None, it.next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[cfg(feature = "syntax")]
    pub fn new_many<P: AsRef<str>>(
        patterns: &[P],
    ) -> Result<BoundedBacktracker, BuildError> {
        BoundedBacktracker::builder().build_many(patterns)
    }

    /// Create a new `BoundedBacktracker` from an already built NFA, using
    /// the default configuration.
    ///
    /// # Example
    ///
    /// This shows how to hand assemble a regular expression via its HIR,
    /// compile an NFA from it and build a BoundedBacktracker from the NFA.
///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::{NFA, backtrack::BoundedBacktracker},
    ///     Match,
    /// };
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///     ClassBytesRange::new(b'0', b'9'),
    ///     ClassBytesRange::new(b'A', b'Z'),
    ///     ClassBytesRange::new(b'_', b'_'),
    ///     ClassBytesRange::new(b'a', b'z'),
    /// ])));
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
    ///
    /// let re = BoundedBacktracker::new_from_nfa(nfa)?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let expected = Some(Match::must(0, 3..4));
    /// re.try_captures(&mut cache, "!@#A#@!", &mut caps)?;
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn new_from_nfa(nfa: NFA) -> Result<BoundedBacktracker, BuildError> {
        BoundedBacktracker::builder().build_from_nfa(nfa)
    }

    /// Create a new `BoundedBacktracker` that matches every input.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::backtrack::BoundedBacktracker,
    ///     Match,
    /// };
    ///
    /// let re = BoundedBacktracker::always_match()?;
    /// let mut cache = re.create_cache();
    ///
    /// let expected = Some(Ok(Match::must(0, 0..0)));
    /// assert_eq!(expected, re.try_find_iter(&mut cache, "").next());
    /// assert_eq!(expected, re.try_find_iter(&mut cache, "foo").next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn always_match() -> Result<BoundedBacktracker, BuildError> {
        let nfa = thompson::NFA::always_match();
        BoundedBacktracker::new_from_nfa(nfa)
    }

    /// Create a new `BoundedBacktracker` that never matches any input.
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
    ///
    /// let re = BoundedBacktracker::never_match()?;
    /// let mut cache = re.create_cache();
    ///
    /// assert_eq!(None, re.try_find_iter(&mut cache, "").next());
    /// assert_eq!(None, re.try_find_iter(&mut cache, "foo").next());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn never_match() -> Result<BoundedBacktracker, BuildError> {
        let nfa = thompson::NFA::never_match();
        BoundedBacktracker::new_from_nfa(nfa)
    }

    /// Return a default configuration for a `BoundedBacktracker`.
    ///
    /// This is a convenience routine to avoid needing to import the `Config`
    /// type when customizing the construction of a `BoundedBacktracker`.
    ///
    /// # Example
    ///
    /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
    /// disabled, zero-width matches that split a codepoint are allowed.
    /// Otherwise they are never reported.
    ///
    /// In the code below, notice that `""` is permitted to match positions
    /// that split the encoding of a codepoint.
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
    ///     Match,
    /// };
    ///
    /// let re = BoundedBacktracker::builder()
    ///     .thompson(thompson::Config::new().utf8(false))
    ///     .build(r"")?;
    /// let mut cache = re.create_cache();
    ///
    /// let haystack = "a☃z";
    /// let mut it = re.try_find_iter(&mut cache, haystack);
    /// assert_eq!(Some(Ok(Match::must(0, 0..0))), it.next());
    /// assert_eq!(Some(Ok(Match::must(0, 1..1))), it.next());
    /// assert_eq!(Some(Ok(Match::must(0, 2..2))), it.next());
    /// assert_eq!(Some(Ok(Match::must(0, 3..3))), it.next());
    /// assert_eq!(Some(Ok(Match::must(0, 4..4))), it.next());
    /// assert_eq!(Some(Ok(Match::must(0, 5..5))), it.next());
    /// assert_eq!(None, it.next());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn config() -> Config {
        Config::new()
    }

    /// Return a builder for configuring the construction of a
    /// `BoundedBacktracker`.
///
    /// This is a convenience routine to avoid needing to import the
    /// [`Builder`] type in common cases.
    ///
    /// # Example
    ///
    /// This example shows how to use the builder to disable UTF-8 mode
    /// everywhere.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     nfa::thompson::{self, backtrack::BoundedBacktracker},
    ///     util::syntax,
    ///     Match,
    /// };
    ///
    /// let re = BoundedBacktracker::builder()
    ///     .syntax(syntax::Config::new().utf8(false))
    ///     .thompson(thompson::Config::new().utf8(false))
    ///     .build(r"foo(?-u:[^b])ar.*")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    ///
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
    /// let expected = Some(Match::must(0, 1..9));
    /// re.try_captures(&mut cache, haystack, &mut caps)?;
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn builder() -> Builder {
        Builder::new()
    }

    /// Create a new cache for this regex.
    ///
    /// The cache returned should only be used for searches for this
    /// regex. If you want to reuse the cache for another regex, then you
    /// must call [`Cache::reset`] with that regex (or, equivalently,
    /// [`BoundedBacktracker::reset_cache`]).
    pub fn create_cache(&self) -> Cache {
        Cache::new(self)
    }

    /// Create a new empty set of capturing groups that is guaranteed to be
    /// valid for the search APIs on this `BoundedBacktracker`.
    ///
    /// A `Captures` value created for a specific `BoundedBacktracker` cannot
    /// be used with any other `BoundedBacktracker`.
    ///
    /// This is a convenience function for [`Captures::all`]. See the
    /// [`Captures`] documentation for an explanation of its alternative
    /// constructors that permit the `BoundedBacktracker` to do less work
    /// during a search, and thus might make it faster.
pub fn create_captures(&self) -> Captures {
        Captures::all(self.get_nfa().group_info().clone())
    }

    /// Reset the given cache such that it can be used for searching with
    /// this `BoundedBacktracker` (and only this `BoundedBacktracker`).
    ///
    /// A cache reset permits reusing memory already allocated in this cache
    /// with a different `BoundedBacktracker`.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different
    /// `BoundedBacktracker`.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     nfa::thompson::backtrack::BoundedBacktracker,
    ///     Match,
    /// };
    ///
    /// let re1 = BoundedBacktracker::new(r"\w")?;
    /// let re2 = BoundedBacktracker::new(r"\W")?;
    ///
    /// let mut cache = re1.create_cache();
    /// assert_eq!(
    ///     Some(Ok(Match::must(0, 0..2))),
    ///     re1.try_find_iter(&mut cache, "Δ").next(),
    /// );
    ///
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the BoundedBacktracker we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 're1' is also not
    /// // allowed.
    /// cache.reset(&re2);
    /// assert_eq!(
    ///     Some(Ok(Match::must(0, 0..3))),
    ///     re2.try_find_iter(&mut cache, "☃").next(),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset_cache(&self, cache: &mut Cache) {
        cache.reset(self);
    }

    /// Returns the total number of patterns compiled into this
    /// `BoundedBacktracker`.
    ///
    /// In the case of a `BoundedBacktracker` that contains no patterns, this
    /// returns `0`.
///
    /// # Example
    ///
    /// This example shows the pattern length for a `BoundedBacktracker` that
    /// never matches:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
    ///
    /// let re = BoundedBacktracker::never_match()?;
    /// assert_eq!(re.pattern_len(), 0);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And another example for a `BoundedBacktracker` that matches at every
    /// position:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
    ///
    /// let re = BoundedBacktracker::always_match()?;
    /// assert_eq!(re.pattern_len(), 1);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And finally, a `BoundedBacktracker` that was constructed from multiple
    /// patterns:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
    ///
    /// let re = BoundedBacktracker::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
    /// assert_eq!(re.pattern_len(), 3);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn pattern_len(&self) -> usize {
        self.nfa.pattern_len()
    }

    /// Return the config for this `BoundedBacktracker`.
    #[inline]
    pub fn get_config(&self) -> &Config {
        &self.config
    }

    /// Returns a reference to the underlying NFA.
    #[inline]
    pub fn get_nfa(&self) -> &NFA {
        &self.nfa
    }

    /// Returns the maximum haystack length supported by this backtracker.
    ///
    /// This routine is a function of both [`Config::visited_capacity`] and the
    /// internal size of the backtracker's NFA.
    ///
    /// # Example
    ///
    /// This example shows how the maximum haystack length can vary depending
    /// on the size of the regex itself. Note though that the specific maximum
    /// values here are not an API guarantee. The default visited capacity is
    /// subject to change and not covered by semver.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     nfa::thompson::backtrack::BoundedBacktracker,
    ///     Match, MatchError,
    /// };
    ///
    /// // If you're only using ASCII, you get a big budget.
/// let re = BoundedBacktracker::new(r"(?-u)\w+")?;
    /// let mut cache = re.create_cache();
    /// assert_eq!(re.max_haystack_len(), 299_592);
    /// // Things work up to the max.
    /// let mut haystack = "a".repeat(299_592);
    /// let expected = Some(Ok(Match::must(0, 0..299_592)));
    /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
    /// // But you'll get an error if you provide a haystack that's too big.
    /// // Notice that we use the 'try_find_iter' routine instead, which
    /// // yields Result<Match, MatchError> instead of Match.
    /// haystack.push('a');
    /// let expected = Some(Err(MatchError::haystack_too_long(299_593)));
    /// assert_eq!(expected, re.try_find_iter(&mut cache, &haystack).next());
    ///
    /// // Unicode inflates the size of the underlying NFA quite a bit, and
    /// // thus means that the backtracker can only handle smaller haystacks,
    /// // assuming that the visited capacity remains unchanged.
    /// let re = BoundedBacktracker::new(r"\w+")?;
    /// assert!(re.max_haystack_len() <= 7_000);
    /// // But we can increase the visited capacity to handle bigger haystacks!
    /// let re = BoundedBacktracker::builder()
    ///     .configure(BoundedBacktracker::config().visited_capacity(1<<20))
    ///     .build(r"\w+")?;
    /// assert!(re.max_haystack_len() >= 25_000);
    /// assert!(re.max_haystack_len() <= 28_000);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn max_haystack_len(&self) -> usize {
        // The capacity given in the config is "bytes of heap memory," but the
        // capacity we use here is "number of bits." So convert the capacity in
        // bytes to the capacity in bits.
        let capacity = 8 * self.get_config().get_visited_capacity();
        // Round the bit capacity up to a whole number of blocks.
        let blocks = div_ceil(capacity, Visited::BLOCK_SIZE);
        let real_capacity = blocks.saturating_mul(Visited::BLOCK_SIZE);
        // It's possible for `real_capacity` to be smaller than the number of
        // NFA states for particularly large regexes, so we saturate towards
        // zero.
        (real_capacity / self.nfa.states().len()).saturating_sub(1)
    }
}

impl BoundedBacktracker {
    /// Returns true if and only if this regex matches the given haystack.
    ///
    /// In the case of a backtracking regex engine, and unlike most other
    /// regex engines in this crate, short circuiting isn't practical. However,
    /// this routine may still be faster because it instructs backtracking to
    /// not keep track of any capturing groups.
    ///
    /// # Errors
    ///
    /// This routine only errors if the search could not complete. For this
    /// backtracking regex engine, this only occurs when the haystack length
    /// exceeds [`BoundedBacktracker::max_haystack_len`].
    ///
    /// When a search cannot complete, callers cannot know whether a match
    /// exists or not.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::nfa::thompson::backtrack::BoundedBacktracker;
    ///
    /// let re = BoundedBacktracker::new("foo[0-9]+bar")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.try_is_match(&mut cache, "foo12345bar")?);
    /// assert!(!re.try_is_match(&mut cache, "foobar")?);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: consistency with search APIs
    ///
    /// `is_match` is guaranteed to return `true` whenever `find` returns a
    /// match.
This includes searches that are executed entirely within a /// codepoint: /// /// ``` /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// Input, /// }; /// /// let re = BoundedBacktracker::new("a*")?; /// let mut cache = re.create_cache(); /// /// assert!(!re.try_is_match(&mut cache, Input::new("☃").span(1..2))?); /// # Ok::<(), Box>(()) /// ``` /// /// Notice that when UTF-8 mode is disabled, then the above reports a /// match because the restriction against zero-width matches that split a /// codepoint has been lifted: /// /// ``` /// use regex_automata::{ /// nfa::thompson::{backtrack::BoundedBacktracker, NFA}, /// Input, /// }; /// /// let re = BoundedBacktracker::builder() /// .thompson(NFA::config().utf8(false)) /// .build("a*")?; /// let mut cache = re.create_cache(); /// /// assert!(re.try_is_match(&mut cache, Input::new("☃").span(1..2))?); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_is_match<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> Result { let input = input.into().earliest(true); self.try_search_slots(cache, &input, &mut []).map(|pid| pid.is_some()) } /// Executes a leftmost forward search and returns a `Match` if one exists. /// /// This routine only includes the overall match span. To get /// access to the individual spans of each capturing group, use /// [`BoundedBacktracker::try_captures`]. /// /// # Errors /// /// This routine only errors if the search could not complete. For this /// backtracking regex engine, this only occurs when the haystack length /// exceeds [`BoundedBacktracker::max_haystack_len`]. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. 
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::backtrack::BoundedBacktracker,
    ///     Match,
    /// };
    ///
    /// let re = BoundedBacktracker::new("foo[0-9]+")?;
    /// let mut cache = re.create_cache();
    /// let expected = Match::must(0, 0..8);
    /// assert_eq!(Some(expected), re.try_find(&mut cache, "foo12345")?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn try_find<'h, I: Into<Input<'h>>>(
        &self,
        cache: &mut Cache,
        input: I,
    ) -> Result<Option<Match>, MatchError> {
        let input = input.into();
        // With exactly one pattern, only its start and end slots are needed,
        // so a fixed two-element array on the stack suffices.
        if self.get_nfa().pattern_len() == 1 {
            let mut slots = [None, None];
            let pid =
                match self.try_search_slots(cache, &input, &mut slots)? {
                    None => return Ok(None),
                    Some(pid) => pid,
                };
            let start = match slots[0] {
                None => return Ok(None),
                Some(s) => s.get(),
            };
            let end = match slots[1] {
                None => return Ok(None),
                Some(s) => s.get(),
            };
            return Ok(Some(Match::new(pid, Span { start, end })));
        }
        // Otherwise allocate room for the implicit slots of every pattern
        // and read back the pair belonging to the pattern that matched.
        let ginfo = self.get_nfa().group_info();
        let slots_len = ginfo.implicit_slot_len();
        let mut slots = vec![None; slots_len];
        let pid = match self.try_search_slots(cache, &input, &mut slots)? {
            None => return Ok(None),
            Some(pid) => pid,
        };
        let start = match slots[pid.as_usize() * 2] {
            None => return Ok(None),
            Some(s) => s.get(),
        };
        let end = match slots[pid.as_usize() * 2 + 1] {
            None => return Ok(None),
            Some(s) => s.get(),
        };
        Ok(Some(Match::new(pid, Span { start, end })))
    }

    /// Executes a leftmost forward search and writes the spans of capturing
    /// groups that participated in a match into the provided [`Captures`]
    /// value. If no match was found, then [`Captures::is_match`] is guaranteed
    /// to return `false`.
    ///
    /// # Errors
    ///
    /// This routine only errors if the search could not complete. For this
    /// backtracking regex engine, this only occurs when the haystack length
    /// exceeds [`BoundedBacktracker::max_haystack_len`].
    ///
    /// When a search cannot complete, callers cannot know whether a match
    /// exists or not.
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::backtrack::BoundedBacktracker,
    ///     Span,
    /// };
    ///
    /// let re = BoundedBacktracker::new(
    ///     r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$",
    /// )?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    ///
    /// re.try_captures(&mut cache, "2010-03-14", &mut caps)?;
    /// assert!(caps.is_match());
    /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1));
    /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2));
    /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn try_captures<'h, I: Into<Input<'h>>>(
        &self,
        cache: &mut Cache,
        input: I,
        caps: &mut Captures,
    ) -> Result<(), MatchError> {
        self.try_search(cache, &input.into(), caps)
    }

    /// Returns an iterator over all non-overlapping leftmost matches in the
    /// given bytes. If no match exists, then the iterator yields no elements.
    ///
    /// If the regex engine returns an error at any point, then the iterator
    /// will yield that error.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::backtrack::BoundedBacktracker,
    ///     Match, MatchError,
    /// };
    ///
    /// let re = BoundedBacktracker::new("foo[0-9]+")?;
    /// let mut cache = re.create_cache();
    ///
    /// let text = "foo1 foo12 foo123";
    /// let result: Result<Vec<Match>, MatchError> = re
    ///     .try_find_iter(&mut cache, text)
    ///     .collect();
    /// let matches = result?;
    /// assert_eq!(matches, vec![
    ///    Match::must(0, 0..4),
    ///    Match::must(0, 5..10),
    ///    Match::must(0, 11..17),
    /// ]);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn try_find_iter<'r, 'c, 'h, I: Into<Input<'h>>>(
        &'r self,
        cache: &'c mut Cache,
        input: I,
    ) -> TryFindMatches<'r, 'c, 'h> {
        // `Captures::matches` is used here rather than `Captures::all`,
        // since this iterator only reports overall matches.
        let caps = Captures::matches(self.get_nfa().group_info().clone());
        let it = iter::Searcher::new(input.into());
        TryFindMatches { re: self, cache, caps, it }
    }

    /// Returns an iterator over all non-overlapping `Captures` values. If no
    /// match exists, then the iterator yields no elements.
/// /// This yields the same matches as [`BoundedBacktracker::try_find_iter`], /// but it includes the spans of all capturing groups that participate in /// each match. /// /// If the regex engine returns an error at any point, then the iterator /// will yield that error. /// /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for /// how to correctly iterate over all matches in a haystack while avoiding /// the creation of a new `Captures` value for every match. (Which you are /// forced to do with an `Iterator`.) /// /// # Example /// /// ``` /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// Span, /// }; /// /// let re = BoundedBacktracker::new("foo(?P[0-9]+)")?; /// let mut cache = re.create_cache(); /// /// let text = "foo1 foo12 foo123"; /// let mut spans = vec![]; /// for result in re.try_captures_iter(&mut cache, text) { /// let caps = result?; /// // The unwrap is OK since 'numbers' matches if the pattern matches. /// spans.push(caps.get_group_by_name("numbers").unwrap()); /// } /// assert_eq!(spans, vec![ /// Span::from(3..4), /// Span::from(8..10), /// Span::from(14..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_captures_iter<'r, 'c, 'h, I: Into>>( &'r self, cache: &'c mut Cache, input: I, ) -> TryCapturesMatches<'r, 'c, 'h> { let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); TryCapturesMatches { re: self, cache, caps, it } } } impl BoundedBacktracker { /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided [`Captures`] /// value. If no match was found, then [`Captures::is_match`] is guaranteed /// to return `false`. /// /// This is like [`BoundedBacktracker::try_captures`], but it accepts a /// concrete `&Input` instead of an `Into`. /// /// # Errors /// /// This routine only errors if the search could not complete. 
For this /// backtracking regex engine, this only occurs when the haystack length /// exceeds [`BoundedBacktracker::max_haystack_len`]. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// # Example: specific pattern search /// /// This example shows how to build a multi bounded backtracker that /// permits searching for specific patterns. /// /// ``` /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// Anchored, Input, Match, PatternID, /// }; /// /// let re = BoundedBacktracker::new_many(&[ /// "[a-z0-9]{6}", /// "[a-z][a-z0-9]{5}", /// ])?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "foo123"; /// /// // Since we are using the default leftmost-first match and both /// // patterns match at the same starting position, only the first pattern /// // will be returned in this case when doing a search for any of the /// // patterns. /// let expected = Some(Match::must(0, 0..6)); /// re.try_search(&mut cache, &Input::new(haystack), &mut caps)?; /// assert_eq!(expected, caps.get_match()); /// /// // But if we want to check whether some other pattern matches, then we /// // can provide its pattern ID. /// let expected = Some(Match::must(1, 0..6)); /// let input = Input::new(haystack) /// .anchored(Anchored::Pattern(PatternID::must(1))); /// re.try_search(&mut cache, &input, &mut caps)?; /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specifying the bounds of a search /// /// This example shows how providing the bounds of a search can produce /// different results than simply sub-slicing the haystack. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// Match, Input, /// }; /// /// let re = BoundedBacktracker::new(r"\b[0-9]{3}\b")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "foo123bar"; /// /// // Since we sub-slice the haystack, the search doesn't know about /// // the larger context and assumes that `123` is surrounded by word /// // boundaries. And of course, the match position is reported relative /// // to the sub-slice as well, which means we get `0..3` instead of /// // `3..6`. /// let expected = Some(Match::must(0, 0..3)); /// re.try_search(&mut cache, &Input::new(&haystack[3..6]), &mut caps)?; /// assert_eq!(expected, caps.get_match()); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. (And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let expected = None; /// re.try_search( /// &mut cache, &Input::new(haystack).range(3..6), &mut caps, /// )?; /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_search( &self, cache: &mut Cache, input: &Input<'_>, caps: &mut Captures, ) -> Result<(), MatchError> { caps.set_pattern(None); let pid = self.try_search_slots(cache, input, caps.slots_mut())?; caps.set_pattern(pid); Ok(()) } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided `slots`, and /// returns the matching pattern ID. The contents of the slots for patterns /// other than the matching pattern are unspecified. If no match was found, /// then `None` is returned and the contents of all `slots` is unspecified. 
/// /// This is like [`BoundedBacktracker::try_search`], but it accepts a raw /// slots slice instead of a `Captures` value. This is useful in contexts /// where you don't want or need to allocate a `Captures`. /// /// It is legal to pass _any_ number of slots to this routine. If the regex /// engine would otherwise write a slot offset that doesn't fit in the /// provided slice, then it is simply skipped. In general though, there are /// usually three slice lengths you might want to use: /// /// * An empty slice, if you only care about which pattern matched. /// * A slice with /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) /// slots, if you only care about the overall match spans for each matching /// pattern. /// * A slice with /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which /// permits recording match offsets for every capturing group in every /// pattern. /// /// # Errors /// /// This routine only errors if the search could not complete. For this /// backtracking regex engine, this only occurs when the haystack length /// exceeds [`BoundedBacktracker::max_haystack_len`]. /// /// When a search cannot complete, callers cannot know whether a match /// exists or not. /// /// # Example /// /// This example shows how to find the overall match offsets in a /// multi-pattern search without allocating a `Captures` value. Indeed, we /// can put our slots right on the stack. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// PatternID, Input, /// }; /// /// let re = BoundedBacktracker::new_many(&[ /// r"\pL+", /// r"\d+", /// ])?; /// let mut cache = re.create_cache(); /// let input = Input::new("!@#123"); /// /// // We only care about the overall match offsets here, so we just /// // allocate two slots for each pattern. Each slot records the start /// // and end of the match. 
/// let mut slots = [None; 4]; /// let pid = re.try_search_slots(&mut cache, &input, &mut slots)?; /// assert_eq!(Some(PatternID::must(1)), pid); /// /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. /// // See 'GroupInfo' for more details on the mapping between groups and /// // slot indices. /// let slot_start = pid.unwrap().as_usize() * 2; /// let slot_end = slot_start + 1; /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Result, MatchError> { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; return Ok(maybe_hm.map(|hm| hm.pattern())); } // See PikeVM::try_search_slots for why we do this. let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { let maybe_hm = self.try_search_slots_imp(cache, input, slots)?; return Ok(maybe_hm.map(|hm| hm.pattern())); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; let got = self.try_search_slots_imp(cache, input, &mut enough)?; // This is OK because we know `enough_slots` is strictly bigger // than `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); return Ok(got.map(|hm| hm.pattern())); } let mut enough = vec![None; min]; let got = self.try_search_slots_imp(cache, input, &mut enough)?; // This is OK because we know `enough_slots` is strictly bigger than // `slots`, otherwise this special case isn't reached. 
slots.copy_from_slice(&enough[..slots.len()]);
        Ok(got.map(|hm| hm.pattern()))
    }

    /// This is the actual implementation of `try_search_slots` that
    /// doesn't account for the special case when 1) the NFA has UTF-8 mode
    /// enabled, 2) the NFA can match the empty string and 3) the caller has
    /// provided an insufficient number of slots to record match offsets.
    ///
    /// When the NFA is in UTF-8 mode and can match the empty string, a match
    /// found by `search_imp` is passed through `empty::skip_splits_fwd`,
    /// which may re-run the search via the supplied closure.
    #[inline(never)]
    fn try_search_slots_imp(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Result<Option<HalfMatch>, MatchError> {
        let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8();
        let hm = match self.search_imp(cache, input, slots)? {
            None => return Ok(None),
            Some(hm) if !utf8empty => return Ok(Some(hm)),
            Some(hm) => hm,
        };
        empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
            Ok(self
                .search_imp(cache, input, slots)?
                .map(|hm| (hm, hm.offset())))
        })
    }

    /// The implementation of standard leftmost backtracking search.
    ///
    /// Capturing group spans are written to `slots`, but only if requested.
    /// `slots` can be one of three things: 1) totally empty, in which case, we
    /// only report the pattern that matched or 2) only has slots for recording
    /// the overall match offsets for any pattern or 3) has all slots available
    /// for recording the spans of any groups participating in a match.
    ///
    /// Returns an error only if setting up the cache for this search fails.
    fn search_imp(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Result<Option<HalfMatch>, MatchError> {
        // Unlike in the PikeVM, we write our capturing group spans directly
        // into the caller's captures groups. So we have to make sure we're
        // starting with a blank slate first. In the PikeVM, we avoid this
        // by construction: the spans that are copied to every slot in the
        // 'Captures' value already account for presence/absence. In this
        // backtracker, we write directly into the caller provided slots, where
        // as in the PikeVM, we write into scratch space first and only copy
        // them to the caller provided slots when a match is found.
        slots.fill(None);
        cache.setup_search(self, input)?;
        if input.is_done() {
            return Ok(None);
        }
        let (anchored, start_id) = match input.get_anchored() {
            // Only way we're unanchored is if both the caller asked for an
            // unanchored search *and* the pattern is itself not anchored.
            Anchored::No => (
                self.nfa.is_always_start_anchored(),
                // We always use the anchored starting state here, even if
                // doing an unanchored search. The "unanchored" part of it is
                // implemented in the loop below, by simply trying the next
                // byte offset if the previous backtracking exploration failed.
                self.nfa.start_anchored(),
            ),
            Anchored::Yes => (true, self.nfa.start_anchored()),
            Anchored::Pattern(pid) => match self.nfa.start_pattern(pid) {
                None => return Ok(None),
                Some(sid) => (true, sid),
            },
        };
        if anchored {
            // An anchored search gets exactly one attempt, at the start of
            // the search span.
            let at = input.start();
            return Ok(self.backtrack(cache, input, at, start_id, slots));
        }
        let pre = self.get_config().get_prefilter();
        let mut at = input.start();
        // `at == input.end()` is tried as well, so that a match ending at the
        // end of the span (including an empty one) can be found.
        while at <= input.end() {
            if let Some(ref pre) = pre {
                // Let the prefilter skip ahead to the next candidate. If it
                // finds none, then no match is possible.
                let span = Span::from(at..input.end());
                match pre.find(input.haystack(), span) {
                    None => break,
                    Some(ref span) => at = span.start,
                }
            }
            if let Some(hm) = self.backtrack(cache, input, at, start_id, slots)
            {
                return Ok(Some(hm));
            }
            at += 1;
        }
        Ok(None)
    }

    /// Look for a match starting at `at` in `input` and write the matching
    /// group spans to `slots`, returning the match if one was found. The
    /// search uses `start_id` as its starting state in the underlying NFA.
    ///
    /// If no match was found, then the caller should increment `at` and try
    /// at the next position.
#[cfg_attr(feature = "perf-inline", inline(always))]
    fn backtrack(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        at: usize,
        start_id: StateID,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<HalfMatch> {
        cache.stack.push(Frame::Step { sid: start_id, at });
        while let Some(frame) = cache.stack.pop() {
            match frame {
                Frame::Step { sid, at } => {
                    if let Some(hm) = self.step(cache, input, sid, at, slots) {
                        return Some(hm);
                    }
                }
                // The branch that wrote this slot failed, so put back the
                // value the slot held before that branch was explored.
                Frame::RestoreCapture { slot, offset } => {
                    slots[slot] = offset;
                }
            }
        }
        None
    }

    // LAMENTATION: The actual backtracking search is implemented in about
    // 75 lines below. Yet this file is over 2,000 lines long. What have I
    // done?

    /// Execute a "step" in the backtracking algorithm.
    ///
    /// A "step" is somewhat of a misnomer, because this routine keeps going
    /// until it either runs out of things to try or finds a match. In the
    /// former case, it may have pushed some things on to the backtracking
    /// stack, in which case, those will be tried next as part of the
    /// 'backtrack' routine above.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn step(
        &self,
        cache: &mut Cache,
        input: &Input<'_>,
        mut sid: StateID,
        mut at: usize,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<HalfMatch> {
        loop {
            // The visited set is indexed relative to the start of the search
            // span. If this (state, offset) pair was already explored, then
            // exploring it again cannot yield anything new.
            if !cache.visited.insert(sid, at - input.start()) {
                return None;
            }
            match *self.nfa.state(sid) {
                State::ByteRange { ref trans } => {
                    // Why do we need this? Unlike other regex engines in this
                    // crate, the backtracker can steam roll ahead in the
                    // haystack outside of the main loop over the bytes in the
                    // haystack. While 'trans.matches()' below handles the case
                    // of 'at' being out of bounds of 'input.haystack()', we
                    // also need to handle the case of 'at' going out of bounds
                    // of the span the caller asked to search.
                    //
                    // We should perhaps make the 'trans.matches()' API accept
                    // an '&Input' instead of a '&[u8]'. Or at least, add a new
                    // API that does it.
                    if at >= input.end() {
                        return None;
                    }
                    if !trans.matches(input.haystack(), at) {
                        return None;
                    }
                    sid = trans.next;
                    at += 1;
                }
                State::Sparse(ref sparse) => {
                    if at >= input.end() {
                        return None;
                    }
                    sid = sparse.matches(input.haystack(), at)?;
                    at += 1;
                }
                State::Dense(ref dense) => {
                    if at >= input.end() {
                        return None;
                    }
                    sid = dense.matches(input.haystack(), at)?;
                    at += 1;
                }
                State::Look { look, next } => {
                    // OK because we don't permit building a searcher with a
                    // Unicode word boundary if the requisite Unicode data is
                    // unavailable.
                    if !self.nfa.look_matcher().matches_inline(
                        look,
                        input.haystack(),
                        at,
                    ) {
                        return None;
                    }
                    sid = next;
                }
                State::Union { ref alternates } => {
                    // Follow the first (highest priority) alternate now. An
                    // empty union has nothing to follow and thus fails.
                    let (&first, rest) = alternates.split_first()?;
                    sid = first;
                    // Push the rest in reverse, so that they are popped (and
                    // thus tried) in priority order.
                    cache.stack.extend(
                        rest.iter()
                            .copied()
                            .rev()
                            .map(|sid| Frame::Step { sid, at }),
                    );
                }
                State::BinaryUnion { alt1, alt2 } => {
                    sid = alt1;
                    cache.stack.push(Frame::Step { sid: alt2, at });
                }
                State::Capture { next, slot, .. } => {
                    // Slots the caller did not provide room for are skipped.
                    if slot.as_usize() < slots.len() {
                        // Remember the old value so it can be restored if
                        // this branch does not lead to a match.
                        cache.stack.push(Frame::RestoreCapture {
                            slot,
                            offset: slots[slot],
                        });
                        slots[slot] = NonMaxUsize::new(at);
                    }
                    sid = next;
                }
                State::Fail => return None,
                State::Match { pattern_id } => {
                    return Some(HalfMatch::new(pattern_id, at));
                }
            }
        }
    }
}

/// An iterator over all non-overlapping matches for a fallible search.
///
/// The iterator yields a `Result<Match, MatchError>` value until no more
/// matches could be found.
///
/// The lifetime parameters are as follows:
///
/// * `'r` represents the lifetime of the BoundedBacktracker.
/// * `'c` represents the lifetime of the BoundedBacktracker's cache.
/// * `'h` represents the lifetime of the haystack being searched.
///
/// This iterator can be created with the
/// [`BoundedBacktracker::try_find_iter`] method.
#[derive(Debug)]
pub struct TryFindMatches<'r, 'c, 'h> {
    re: &'r BoundedBacktracker,
    cache: &'c mut Cache,
    caps: Captures,
    it: iter::Searcher<'h>,
}

impl<'r, 'c, 'h> Iterator for TryFindMatches<'r, 'c, 'h> {
    type Item = Result<Match, MatchError>;

    #[inline]
    fn next(&mut self) -> Option<Result<Match, MatchError>> {
        // Splitting 'self' apart seems necessary to appease borrowck.
        let TryFindMatches { re, ref mut cache, ref mut caps, ref mut it } =
            *self;
        it.try_advance(|input| {
            re.try_search(cache, input, caps)?;
            Ok(caps.get_match())
        })
        .transpose()
    }
}

/// An iterator over all non-overlapping leftmost matches, with their capturing
/// groups, for a fallible search.
/// /// The iterator yields a `Result` value until no more /// matches could be found. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the BoundedBacktracker. /// * `'c` represents the lifetime of the BoundedBacktracker's cache. /// * `'h` represents the lifetime of the haystack being searched. /// /// This iterator can be created with the /// [`BoundedBacktracker::try_captures_iter`] method. #[derive(Debug)] pub struct TryCapturesMatches<'r, 'c, 'h> { re: &'r BoundedBacktracker, cache: &'c mut Cache, caps: Captures, it: iter::Searcher<'h>, } impl<'r, 'c, 'h> Iterator for TryCapturesMatches<'r, 'c, 'h> { type Item = Result; #[inline] fn next(&mut self) -> Option> { // Splitting 'self' apart seems necessary to appease borrowck. let TryCapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = *self; let _ = it .try_advance(|input| { re.try_search(cache, input, caps)?; Ok(caps.get_match()) }) .transpose()?; if caps.is_match() { Some(Ok(caps.clone())) } else { None } } } /// A cache represents mutable state that a [`BoundedBacktracker`] requires /// during a search. /// /// For a given [`BoundedBacktracker`], its corresponding cache may be created /// either via [`BoundedBacktracker::create_cache`], or via [`Cache::new`]. /// They are equivalent in every way, except the former does not require /// explicitly importing `Cache`. /// /// A particular `Cache` is coupled with the [`BoundedBacktracker`] from which /// it was created. It may only be used with that `BoundedBacktracker`. A cache /// and its allocations may be re-purposed via [`Cache::reset`], in which case, /// it can only be used with the new `BoundedBacktracker` (and not the old /// one). #[derive(Clone, Debug)] pub struct Cache { /// Stack used on the heap for doing backtracking instead of the /// traditional recursive approach. We don't want recursion because then /// we're likely to hit a stack overflow for bigger regexes. 
stack: Vec, /// The set of (StateID, HaystackOffset) pairs that have been visited /// by the backtracker within a single search. If such a pair has been /// visited, then we avoid doing the work for that pair again. This is /// what "bounds" the backtracking and prevents it from having worst case /// exponential time. visited: Visited, } impl Cache { /// Create a new [`BoundedBacktracker`] cache. /// /// A potentially more convenient routine to create a cache is /// [`BoundedBacktracker::create_cache`], as it does not require also /// importing the `Cache` type. /// /// If you want to reuse the returned `Cache` with some other /// `BoundedBacktracker`, then you must call [`Cache::reset`] with the /// desired `BoundedBacktracker`. pub fn new(re: &BoundedBacktracker) -> Cache { Cache { stack: vec![], visited: Visited::new(re) } } /// Reset this cache such that it can be used for searching with different /// [`BoundedBacktracker`]. /// /// A cache reset permits reusing memory already allocated in this cache /// with a different `BoundedBacktracker`. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different /// `BoundedBacktracker`. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// nfa::thompson::backtrack::BoundedBacktracker, /// Match, /// }; /// /// let re1 = BoundedBacktracker::new(r"\w")?; /// let re2 = BoundedBacktracker::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( /// Some(Ok(Match::must(0, 0..2))), /// re1.try_find_iter(&mut cache, "Δ").next(), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or /// // incorrect results. In order to re-purpose the cache, we must reset /// // it with the BoundedBacktracker we'd like to use it with. /// // /// // Similarly, after this reset, using the cache with 're1' is also not /// // allowed. 
/// cache.reset(&re2); /// assert_eq!( /// Some(Ok(Match::must(0, 0..3))), /// re2.try_find_iter(&mut cache, "☃").next(), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn reset(&mut self, re: &BoundedBacktracker) { self.visited.reset(re); } /// Returns the heap memory usage, in bytes, of this cache. /// /// This does **not** include the stack size used up by this cache. To /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { self.stack.len() * core::mem::size_of::() + self.visited.memory_usage() } /// Clears this cache. This should be called at the start of every search /// to ensure we start with a clean slate. /// /// This also sets the length of the capturing groups used in the current /// search. This permits an optimization where by 'SlotTable::for_state' /// only returns the number of slots equivalent to the number of slots /// given in the 'Captures' value. This may be less than the total number /// of possible slots, e.g., when one only wants to track overall match /// offsets. This in turn permits less copying of capturing group spans /// in the BoundedBacktracker. fn setup_search( &mut self, re: &BoundedBacktracker, input: &Input<'_>, ) -> Result<(), MatchError> { self.stack.clear(); self.visited.setup_search(re, input)?; Ok(()) } } /// Represents a stack frame on the heap while doing backtracking. /// /// Instead of using explicit recursion for backtracking, we use a stack on /// the heap to keep track of things that we want to explore if the current /// backtracking branch turns out to not lead to a match. #[derive(Clone, Debug)] enum Frame { /// Look for a match starting at `sid` and the given position in the /// haystack. Step { sid: StateID, at: usize }, /// Reset the given `slot` to the given `offset` (which might be `None`). /// This effectively gives a "scope" to capturing groups, such that an /// offset for a particular group only gets returned if the match goes /// through that capturing group. 
If backtracking ends up going down a /// different branch that results in a different offset (or perhaps none at /// all), then this "restore capture" frame will cause the offset to get /// reset. RestoreCapture { slot: SmallIndex, offset: Option }, } /// A bitset that keeps track of whether a particular (StateID, offset) has /// been considered during backtracking. If it has already been visited, then /// backtracking skips it. This is what gives backtracking its "bound." #[derive(Clone, Debug)] struct Visited { /// The actual underlying bitset. Each element in the bitset corresponds /// to a particular (StateID, offset) pair. States correspond to the rows /// and the offsets correspond to the columns. /// /// If our underlying NFA has N states and the haystack we're searching /// has M bytes, then we have N*(M+1) entries in our bitset table. The /// M+1 occurs because our matches are delayed by one byte (to support /// look-around), and so we need to handle the end position itself rather /// than stopping just before the end. (If there is no end position, then /// it's treated as "end-of-input," which is matched by things like '$'.) /// /// Given BITS=N*(M+1), we wind up with div_ceil(BITS, sizeof(usize)) /// blocks. /// /// We use 'usize' to represent our blocks because it makes some of the /// arithmetic in 'insert' a bit nicer. For example, if we used 'u32' for /// our block, we'd either need to cast u32s to usizes or usizes to u32s. bitset: Vec, /// The stride represents one plus length of the haystack we're searching /// (as described above). The stride must be initialized for each search. stride: usize, } impl Visited { /// The size of each block, in bits. const BLOCK_SIZE: usize = 8 * core::mem::size_of::(); /// Create a new visited set for the given backtracker. /// /// The set is ready to use, but must be setup at the beginning of each /// search by calling `setup_search`. 
fn new(re: &BoundedBacktracker) -> Visited { let mut visited = Visited { bitset: vec![], stride: 0 }; visited.reset(re); visited } /// Insert the given (StateID, offset) pair into this set. If it already /// exists, then this is a no-op and it returns false. Otherwise this /// returns true. fn insert(&mut self, sid: StateID, at: usize) -> bool { let table_index = sid.as_usize() * self.stride + at; let block_index = table_index / Visited::BLOCK_SIZE; let bit = table_index % Visited::BLOCK_SIZE; let block_with_bit = 1 << bit; if self.bitset[block_index] & block_with_bit != 0 { return false; } self.bitset[block_index] |= block_with_bit; true } /// Reset this visited set to work with the given bounded backtracker. fn reset(&mut self, _: &BoundedBacktracker) { self.bitset.truncate(0); } /// Setup this visited set to work for a search using the given NFA /// and input configuration. The NFA must be the same NFA used by the /// BoundedBacktracker given to Visited::reset. Failing to call this might /// result in panics or silently incorrect search behavior. fn setup_search( &mut self, re: &BoundedBacktracker, input: &Input<'_>, ) -> Result<(), MatchError> { // Our haystack length is only the length of the span of the entire // haystack that we'll be searching. let haylen = input.get_span().len(); let err = || MatchError::haystack_too_long(haylen); // Our stride is one more than the length of the input because our main // search loop includes the position at input.end(). (And it does this // because matches are delayed by one byte to account for look-around.) 
self.stride = haylen + 1; let needed_capacity = match re.get_nfa().states().len().checked_mul(self.stride) { None => return Err(err()), Some(capacity) => capacity, }; let max_capacity = 8 * re.get_config().get_visited_capacity(); if needed_capacity > max_capacity { return Err(err()); } let needed_blocks = div_ceil(needed_capacity, Visited::BLOCK_SIZE); self.bitset.truncate(needed_blocks); for block in self.bitset.iter_mut() { *block = 0; } if needed_blocks > self.bitset.len() { self.bitset.resize(needed_blocks, 0); } Ok(()) } /// Return the heap memory usage, in bytes, of this visited set. fn memory_usage(&self) -> usize { self.bitset.len() * core::mem::size_of::() } } /// Integer division, but rounds up instead of down. fn div_ceil(lhs: usize, rhs: usize) -> usize { if lhs % rhs == 0 { lhs / rhs } else { (lhs / rhs) + 1 } } #[cfg(test)] mod tests { use super::*; // This is a regression test for the maximum haystack length computation. // Previously, it assumed that the total capacity of the backtracker's // bitset would always be greater than the number of NFA states. But there // is of course no guarantee that this is true. This regression test // ensures that not only does `max_haystack_len` not panic, but that it // should return `0`. #[cfg(feature = "syntax")] #[test] fn max_haystack_len_overflow() { let re = BoundedBacktracker::builder() .configure(BoundedBacktracker::config().visited_capacity(10)) .build(r"[0-9A-Za-z]{100}") .unwrap(); assert_eq!(0, re.max_haystack_len()); } } regex-automata-0.4.9/src/nfa/thompson/builder.rs000064400000000000000000001634411046102023000177700ustar 00000000000000use core::mem; use alloc::{sync::Arc, vec, vec::Vec}; use crate::{ nfa::thompson::{ error::BuildError, nfa::{self, SparseTransitions, Transition, NFA}, }, util::{ look::{Look, LookMatcher}, primitives::{IteratorIndexExt, PatternID, SmallIndex, StateID}, }, }; /// An intermediate NFA state used during construction. 
/// /// During construction of an NFA, it is often convenient to work with states /// that are amenable to mutation and other carry more information than we /// otherwise need once an NFA has been built. This type represents those /// needs. /// /// Once construction is finished, the builder will convert these states to a /// [`nfa::thompson::State`](crate::nfa::thompson::State). This conversion not /// only results in a simpler representation, but in some cases, entire classes /// of states are completely removed (such as [`State::Empty`]). #[derive(Clone, Debug, Eq, PartialEq)] enum State { /// An empty state whose only purpose is to forward the automaton to /// another state via an unconditional epsilon transition. /// /// Unconditional epsilon transitions are quite useful during the /// construction of an NFA, as they permit the insertion of no-op /// placeholders that make it easier to compose NFA sub-graphs. When /// the Thompson NFA builder produces a final NFA, all unconditional /// epsilon transitions are removed, and state identifiers are remapped /// accordingly. Empty { /// The next state that this state should transition to. next: StateID, }, /// A state that only transitions to another state if the current input /// byte is in a particular range of bytes. ByteRange { trans: Transition }, /// A state with possibly many transitions, represented in a sparse /// fashion. Transitions must be ordered lexicographically by input range /// and be non-overlapping. As such, this may only be used when every /// transition has equal priority. (In practice, this is only used for /// encoding large UTF-8 automata.) In contrast, a `Union` state has each /// alternate in order of priority. Priority is used to implement greedy /// matching and also alternations themselves, e.g., `abc|a` where `abc` /// has priority over `a`. /// /// To clarify, it is possible to remove `Sparse` and represent all things /// that `Sparse` is used for via `Union`. 
But this creates a more bloated /// NFA with more epsilon transitions than is necessary in the special case /// of character classes. Sparse { transitions: Vec }, /// A conditional epsilon transition satisfied via some sort of /// look-around. Look { look: Look, next: StateID }, /// An empty state that records the start of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to /// record position information for a capture group when using the NFA for /// search. CaptureStart { /// The ID of the pattern that this capture was defined. pattern_id: PatternID, /// The capture group index that this capture state corresponds to. /// The capture group index is always relative to its corresponding /// pattern. Therefore, in the presence of multiple patterns, both the /// pattern ID and the capture group index are required to uniquely /// identify a capturing group. group_index: SmallIndex, /// The next state that this state should transition to. next: StateID, }, /// An empty state that records the end of a capture location. This is an /// unconditional epsilon transition like `Empty`, except it can be used to /// record position information for a capture group when using the NFA for /// search. CaptureEnd { /// The ID of the pattern that this capture was defined. pattern_id: PatternID, /// The capture group index that this capture state corresponds to. /// The capture group index is always relative to its corresponding /// pattern. Therefore, in the presence of multiple patterns, both the /// pattern ID and the capture group index are required to uniquely /// identify a capturing group. group_index: SmallIndex, /// The next state that this state should transition to. next: StateID, }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. 
Union { alternates: Vec }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via later transitions are /// preferred over earlier transitions. /// /// This "reverse" state exists for convenience during compilation that /// permits easy construction of non-greedy combinations of NFA states. At /// the end of compilation, Union and UnionReverse states are merged into /// one Union type of state, where the latter has its epsilon transitions /// reversed to reflect the priority inversion. /// /// The "convenience" here arises from the fact that as new states are /// added to the list of `alternates`, we would like that add operation /// to be amortized constant time. But if we used a `Union`, we'd need to /// prepend the state, which takes O(n) time. There are other approaches we /// could use to solve this, but this seems simple enough. UnionReverse { alternates: Vec }, /// A state that cannot be transitioned out of. This is useful for cases /// where you want to prevent matching from occurring. For example, if your /// regex parser permits empty character classes, then one could choose a /// `Fail` state to represent it. Fail, /// A match state. There is at most one such occurrence of this state in /// an NFA for each pattern compiled into the NFA. At time of writing, a /// match state is always produced for every pattern given, but in theory, /// if a pattern can never lead to a match, then the match state could be /// omitted. /// /// `pattern_id` refers to the ID of the pattern itself, which corresponds /// to the pattern's index (starting at 0). Match { pattern_id: PatternID }, } impl State { /// If this state is an unconditional epsilon transition, then this returns /// the target of the transition. 
fn goto(&self) -> Option { match *self { State::Empty { next } => Some(next), State::Union { ref alternates } if alternates.len() == 1 => { Some(alternates[0]) } State::UnionReverse { ref alternates } if alternates.len() == 1 => { Some(alternates[0]) } _ => None, } } /// Returns the heap memory usage, in bytes, of this state. fn memory_usage(&self) -> usize { match *self { State::Empty { .. } | State::ByteRange { .. } | State::Look { .. } | State::CaptureStart { .. } | State::CaptureEnd { .. } | State::Fail | State::Match { .. } => 0, State::Sparse { ref transitions } => { transitions.len() * mem::size_of::() } State::Union { ref alternates } => { alternates.len() * mem::size_of::() } State::UnionReverse { ref alternates } => { alternates.len() * mem::size_of::() } } } } /// An abstraction for building Thompson NFAs by hand. /// /// A builder is what a [`thompson::Compiler`](crate::nfa::thompson::Compiler) /// uses internally to translate a regex's high-level intermediate /// representation into an [`NFA`]. /// /// The primary function of this builder is to abstract away the internal /// representation of an NFA and make it difficult to produce NFAs are that /// internally invalid or inconsistent. This builder also provides a way to /// add "empty" states (which can be thought of as unconditional epsilon /// transitions), despite the fact that [`thompson::State`](nfa::State) does /// not have any "empty" representation. The advantage of "empty" states is /// that they make the code for constructing a Thompson NFA logically simpler. /// /// Many of the routines on this builder may panic or return errors. Generally /// speaking, panics occur when an invalid sequence of method calls were made, /// where as an error occurs if things get too big. (Where "too big" might mean /// exhausting identifier space or using up too much heap memory in accordance /// with the configured [`size_limit`](Builder::set_size_limit).) 
/// /// # Overview /// /// ## Adding multiple patterns /// /// Each pattern you add to an NFA should correspond to a pair of /// [`Builder::start_pattern`] and [`Builder::finish_pattern`] calls, with /// calls inbetween that add NFA states for that pattern. NFA states may be /// added without first calling `start_pattern`, with the exception of adding /// capturing states. /// /// ## Adding NFA states /// /// Here is a very brief overview of each of the methods that add NFA states. /// Every method adds a single state. /// /// * [`add_empty`](Builder::add_empty): Add a state with a single /// unconditional epsilon transition to another state. /// * [`add_union`](Builder::add_union): Adds a state with unconditional /// epsilon transitions to two or more states, with earlier transitions /// preferred over later ones. /// * [`add_union_reverse`](Builder::add_union_reverse): Adds a state with /// unconditional epsilon transitions to two or more states, with later /// transitions preferred over earlier ones. /// * [`add_range`](Builder::add_range): Adds a state with a single transition /// to another state that can only be followed if the current input byte is /// within the range given. /// * [`add_sparse`](Builder::add_sparse): Adds a state with two or more /// range transitions to other states, where a transition is only followed /// if the current input byte is within one of the ranges. All transitions /// in this state have equal priority, and the corresponding ranges must be /// non-overlapping. /// * [`add_look`](Builder::add_look): Adds a state with a single *conditional* /// epsilon transition to another state, where the condition depends on a /// limited look-around property. /// * [`add_capture_start`](Builder::add_capture_start): Adds a state with /// a single unconditional epsilon transition that also instructs an NFA /// simulation to record the current input position to a specific location in /// memory. 
This is intended to represent the starting location of a capturing /// group. /// * [`add_capture_end`](Builder::add_capture_end): Adds a state with /// a single unconditional epsilon transition that also instructs an NFA /// simulation to record the current input position to a specific location in /// memory. This is intended to represent the ending location of a capturing /// group. /// * [`add_fail`](Builder::add_fail): Adds a state that never transitions to /// another state. /// * [`add_match`](Builder::add_match): Add a state that indicates a match has /// been found for a particular pattern. A match state is a final state with /// no outgoing transitions. /// /// ## Setting transitions between NFA states /// /// The [`Builder::patch`] method creates a transition from one state to the /// next. If the `from` state corresponds to a state that supports multiple /// outgoing transitions (such as "union"), then this adds the corresponding /// transition. Otherwise, it sets the single transition. (This routine panics /// if `from` corresponds to a state added by `add_sparse`, since sparse states /// need more specialized handling.) /// /// # Example /// /// This annotated example shows how to hand construct the regex `[a-z]+` /// (without an unanchored prefix). /// /// ``` /// use regex_automata::{ /// nfa::thompson::{pikevm::PikeVM, Builder, Transition}, /// util::primitives::StateID, /// Match, /// }; /// /// let mut builder = Builder::new(); /// // Before adding NFA states for our pattern, we need to tell the builder /// // that we are starting the pattern. /// builder.start_pattern()?; /// // Since we use the Pike VM below for searching, we need to add capturing /// // states. If you're just going to build a DFA from the NFA, then capturing /// // states do not need to be added. 
/// let start = builder.add_capture_start(StateID::ZERO, 0, None)?; /// let range = builder.add_range(Transition { /// // We don't know the state ID of the 'next' state yet, so we just fill /// // in a dummy 'ZERO' value. /// start: b'a', end: b'z', next: StateID::ZERO, /// })?; /// // This state will point back to 'range', but also enable us to move ahead. /// // That is, this implements the '+' repetition operator. We add 'range' and /// // then 'end' below to this alternation. /// let alt = builder.add_union(vec![])?; /// // The final state before the match state, which serves to capture the /// // end location of the match. /// let end = builder.add_capture_end(StateID::ZERO, 0)?; /// // The match state for our pattern. /// let mat = builder.add_match()?; /// // Now we fill in the transitions between states. /// builder.patch(start, range)?; /// builder.patch(range, alt)?; /// // If we added 'end' before 'range', then we'd implement non-greedy /// // matching, i.e., '+?'. /// builder.patch(alt, range)?; /// builder.patch(alt, end)?; /// builder.patch(end, mat)?; /// // We must explicitly finish pattern and provide the starting state ID for /// // this particular pattern. /// builder.finish_pattern(start)?; /// // Finally, when we build the NFA, we provide the anchored and unanchored /// // starting state IDs. Since we didn't bother with an unanchored prefix /// // here, we only support anchored searching. Thus, both starting states are /// // the same. /// let nfa = builder.build(start, start)?; /// /// // Now build a Pike VM from our NFA, and use it for searching. This shows /// // how we can use a regex engine without ever worrying about syntax! 
/// let re = PikeVM::new_from_nfa(nfa)?; /// let mut cache = re.create_cache(); /// let mut caps = re.create_captures(); /// let expected = Some(Match::must(0, 0..3)); /// re.captures(&mut cache, "foo0", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug, Default)] pub struct Builder { /// The ID of the pattern that we're currently building. /// /// Callers are required to set (and unset) this by calling /// {start,finish}_pattern. Otherwise, most methods will panic. pattern_id: Option, /// A sequence of intermediate NFA states. Once a state is added to this /// sequence, it is assigned a state ID equivalent to its index. Once a /// state is added, it is still expected to be mutated, e.g., to set its /// transition to a state that didn't exist at the time it was added. states: Vec, /// The starting states for each individual pattern. Starting at any /// of these states will result in only an anchored search for the /// corresponding pattern. The vec is indexed by pattern ID. When the NFA /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, /// A map from pattern ID to capture group index to name. (If no name /// exists, then a None entry is present. Thus, all capturing groups are /// present in this mapping.) /// /// The outer vec is indexed by pattern ID, while the inner vec is indexed /// by capture index offset for the corresponding pattern. /// /// The first capture group for each pattern is always unnamed and is thus /// always None. captures: Vec>>>, /// The combined memory used by each of the 'State's in 'states'. This /// only includes heap usage by each state, and not the size of the state /// itself. In other words, this tracks heap memory used that isn't /// captured via `size_of::() * states.len()`. 
memory_states: usize, /// Whether this NFA only matches UTF-8 and whether regex engines using /// this NFA for searching should report empty matches that split a /// codepoint. utf8: bool, /// Whether this NFA should be matched in reverse or not. reverse: bool, /// The matcher to use for look-around assertions. look_matcher: LookMatcher, /// A size limit to respect when building an NFA. If the total heap memory /// of the intermediate NFA states exceeds (or would exceed) this amount, /// then an error is returned. size_limit: Option, } impl Builder { /// Create a new builder for hand-assembling NFAs. pub fn new() -> Builder { Builder::default() } /// Clear this builder. /// /// Clearing removes all state associated with building an NFA, but does /// not reset configuration (such as size limits and whether the NFA /// should only match UTF-8). After clearing, the builder can be reused to /// assemble an entirely new NFA. pub fn clear(&mut self) { self.pattern_id = None; self.states.clear(); self.start_pattern.clear(); self.captures.clear(); self.memory_states = 0; } /// Assemble a [`NFA`] from the states added so far. /// /// After building an NFA, more states may be added and `build` may be /// called again. To reuse a builder to produce an entirely new NFA from /// scratch, call the [`clear`](Builder::clear) method first. /// /// `start_anchored` refers to the ID of the starting state that anchored /// searches should use. That is, searches who matches are limited to the /// starting position of the search. /// /// `start_unanchored` refers to the ID of the starting state that /// unanchored searches should use. This permits searches to report matches /// that start after the beginning of the search. In cases where unanchored /// searches are not supported, the unanchored starting state ID must be /// the same as the anchored starting state ID. /// /// # Errors /// /// This returns an error if there was a problem producing the final NFA. 
/// In particular, this might include an error if the capturing groups
/// added to this builder violate any of the invariants documented on
/// [`GroupInfo`](crate::util::captures::GroupInfo).
///
/// # Panics
///
/// If `start_pattern` was called, then `finish_pattern` must be called
/// before `build`, otherwise this panics.
///
/// This may panic for other invalid uses of a builder. For example, if
/// a "start capture" state was added without a corresponding "end capture"
/// state.
pub fn build(
    &self,
    start_anchored: StateID,
    start_unanchored: StateID,
) -> Result<NFA, BuildError> {
    assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first");
    debug!(
        "intermediate NFA compilation via builder is complete, \
         intermediate NFA size: {} states, {} bytes on heap",
        self.states.len(),
        self.memory_usage(),
    );

    let mut nfa = nfa::Inner::default();
    nfa.set_utf8(self.utf8);
    nfa.set_reverse(self.reverse);
    nfa.set_look_matcher(self.look_matcher.clone());
    // A set of compiler internal state IDs that correspond to states
    // that are exclusively epsilon transitions, i.e., goto instructions,
    // combined with the state that they point to. This is used to
    // record said states while transforming the compiler's internal NFA
    // representation to the external form.
    let mut empties = vec![];
    // A map used to re-map state IDs when translating this builder's
    // internal NFA state representation to the final NFA representation.
    let mut remap = vec![StateID::ZERO; self.states.len()];

    nfa.set_starts(start_anchored, start_unanchored, &self.start_pattern);
    nfa.set_captures(&self.captures).map_err(BuildError::captures)?;
    // The idea here is to convert our intermediate states to their final
    // form. The only real complexity here is the process of converting
    // transitions, which are expressed in terms of state IDs. The new
    // set of states will be smaller because of partial epsilon removal,
    // so the state IDs will not be the same.
    for (sid, state) in self.states.iter().with_state_ids() {
        match *state {
            State::Empty { next } => {
                // Since we're removing empty states, we need to handle
                // them later since we don't yet know which new state this
                // empty state will be mapped to.
                empties.push((sid, next));
            }
            State::ByteRange { trans } => {
                remap[sid] = nfa.add(nfa::State::ByteRange { trans });
            }
            State::Sparse { ref transitions } => {
                // Degenerate sparse states are lowered to simpler states:
                // no transitions means "fail", one means a plain range.
                remap[sid] = match transitions.len() {
                    0 => nfa.add(nfa::State::Fail),
                    1 => nfa.add(nfa::State::ByteRange {
                        trans: transitions[0],
                    }),
                    _ => {
                        let transitions =
                            transitions.to_vec().into_boxed_slice();
                        let sparse = SparseTransitions { transitions };
                        nfa.add(nfa::State::Sparse(sparse))
                    }
                };
            }
            State::Look { look, next } => {
                remap[sid] = nfa.add(nfa::State::Look { look, next });
            }
            State::CaptureStart { pattern_id, group_index, next } => {
                // We can't remove this empty state because of the side
                // effect of capturing an offset for this capture slot.
                let slot = nfa
                    .group_info()
                    .slot(pattern_id, group_index.as_usize())
                    .expect("invalid capture index");
                let slot =
                    SmallIndex::new(slot).expect("a small enough slot");
                remap[sid] = nfa.add(nfa::State::Capture {
                    next,
                    pattern_id,
                    group_index,
                    slot,
                });
            }
            State::CaptureEnd { pattern_id, group_index, next } => {
                // We can't remove this empty state because of the side
                // effect of capturing an offset for this capture slot.
                // Also, this always succeeds because we check that all
                // slot indices are valid for all capture indices when they
                // are initially added.
                //
                // The end slot is the one immediately after the group's
                // start slot, hence the `checked_add(1)`.
                let slot = nfa
                    .group_info()
                    .slot(pattern_id, group_index.as_usize())
                    .expect("invalid capture index")
                    .checked_add(1)
                    .unwrap();
                let slot =
                    SmallIndex::new(slot).expect("a small enough slot");
                remap[sid] = nfa.add(nfa::State::Capture {
                    next,
                    pattern_id,
                    group_index,
                    slot,
                });
            }
            State::Union { ref alternates } => {
                if alternates.is_empty() {
                    remap[sid] = nfa.add(nfa::State::Fail);
                } else if alternates.len() == 1 {
                    // A single-alternate union is an epsilon transition.
                    // The value stored in `remap` here is only a
                    // placeholder; it is overwritten when the `empties`
                    // chains are resolved below.
                    empties.push((sid, alternates[0]));
                    remap[sid] = alternates[0];
                } else if alternates.len() == 2 {
                    remap[sid] = nfa.add(nfa::State::BinaryUnion {
                        alt1: alternates[0],
                        alt2: alternates[1],
                    });
                } else {
                    let alternates =
                        alternates.to_vec().into_boxed_slice();
                    remap[sid] = nfa.add(nfa::State::Union { alternates });
                }
            }
            State::UnionReverse { ref alternates } => {
                if alternates.is_empty() {
                    remap[sid] = nfa.add(nfa::State::Fail);
                } else if alternates.len() == 1 {
                    // Same placeholder treatment as for `Union` above.
                    empties.push((sid, alternates[0]));
                    remap[sid] = alternates[0];
                } else if alternates.len() == 2 {
                    // Swap the alternates to express the inverted priority.
                    remap[sid] = nfa.add(nfa::State::BinaryUnion {
                        alt1: alternates[1],
                        alt2: alternates[0],
                    });
                } else {
                    let mut alternates =
                        alternates.to_vec().into_boxed_slice();
                    alternates.reverse();
                    remap[sid] = nfa.add(nfa::State::Union { alternates });
                }
            }
            State::Fail => {
                remap[sid] = nfa.add(nfa::State::Fail);
            }
            State::Match { pattern_id } => {
                remap[sid] = nfa.add(nfa::State::Match { pattern_id });
            }
        }
    }
    // Some of the new states still point to empty state IDs, so we need to
    // follow each of them and remap the empty state IDs to their non-empty
    // state IDs.
    //
    // We also keep track of which states we've already mapped. This helps
    // avoid quadratic behavior in a long chain of empty states. For
    // example, in 'a{0}{50000}'.
    let mut remapped = vec![false; self.states.len()];
    for &(empty_id, empty_next) in empties.iter() {
        if remapped[empty_id] {
            continue;
        }
        // empty states can point to other empty states, forming a chain.
        // So we must follow the chain until the end, which must end at
        // a non-empty state, and therefore, a state that is correctly
        // remapped. We are guaranteed to terminate because our compiler
        // never builds a loop among only empty states.
        //
        // NOTE(review): this builder can also be driven by hand; if a
        // caller patches empty states into a cycle, this loop would not
        // terminate. Confirm whether that needs guarding.
        let mut new_next = empty_next;
        while let Some(next) = self.states[new_next].goto() {
            new_next = next;
        }
        remap[empty_id] = remap[new_next];
        remapped[empty_id] = true;

        // Now that we've remapped the main 'empty_id' above, we re-follow
        // the chain from above and remap every empty state we found along
        // the way to our ultimate non-empty target. We are careful to set
        // 'remapped' to true for each such state. We thus will not need
        // to re-compute this chain for any subsequent empty states in
        // 'empties' that are part of this chain.
        let mut next2 = empty_next;
        while let Some(next) = self.states[next2].goto() {
            remap[next2] = remap[new_next];
            remapped[next2] = true;
            next2 = next;
        }
    }
    // Finally remap all of the state IDs.
    nfa.remap(&remap);
    let final_nfa = nfa.into_nfa();
    debug!(
        "NFA compilation via builder complete, \
         final NFA size: {} states, {} bytes on heap, \
         has empty? {:?}, utf8? {:?}",
        final_nfa.states().len(),
        final_nfa.memory_usage(),
        final_nfa.has_empty(),
        final_nfa.is_utf8(),
    );
    Ok(final_nfa)
}

/// Start the assembly of a pattern in this NFA.
///
/// Upon success, this returns the identifier for the new pattern.
/// Identifiers start at `0` and are incremented by 1 for each new pattern.
///
/// It is necessary to call this routine before adding capturing states.
/// Otherwise, any other NFA state may be added before starting a pattern.
///
/// # Errors
///
/// If the pattern identifier space is exhausted, then this returns an
/// error.
///
/// # Panics
///
/// If this is called while assembling another pattern (i.e., before
/// `finish_pattern` is called), then this panics.
pub fn start_pattern(&mut self) -> Result { assert!(self.pattern_id.is_none(), "must call 'finish_pattern' first"); let proposed = self.start_pattern.len(); let pid = PatternID::new(proposed) .map_err(|_| BuildError::too_many_patterns(proposed))?; self.pattern_id = Some(pid); // This gets filled in when 'finish_pattern' is called. self.start_pattern.push(StateID::ZERO); Ok(pid) } /// Finish the assembly of a pattern in this NFA. /// /// Upon success, this returns the identifier for the new pattern. /// Identifiers start at `0` and are incremented by 1 for each new /// pattern. This is the same identifier returned by the corresponding /// `start_pattern` call. /// /// Note that `start_pattern` and `finish_pattern` pairs cannot be /// interleaved or nested. A correct `finish_pattern` call _always_ /// corresponds to the most recently called `start_pattern` routine. /// /// # Errors /// /// This currently never returns an error, but this is subject to change. /// /// # Panics /// /// If this is called without a corresponding `start_pattern` call, then /// this panics. pub fn finish_pattern( &mut self, start_id: StateID, ) -> Result { let pid = self.current_pattern_id(); self.start_pattern[pid] = start_id; self.pattern_id = None; Ok(pid) } /// Returns the pattern identifier of the current pattern. /// /// # Panics /// /// If this doesn't occur after a `start_pattern` call and before the /// corresponding `finish_pattern` call, then this panics. pub fn current_pattern_id(&self) -> PatternID { self.pattern_id.expect("must call 'start_pattern' first") } /// Returns the number of patterns added to this builder so far. /// /// This only includes patterns that have had `finish_pattern` called /// for them. pub fn pattern_len(&self) -> usize { self.start_pattern.len() } /// Add an "empty" NFA state. /// /// An "empty" NFA state is a state with a single unconditional epsilon /// transition to another NFA state. 
Such empty states are removed before /// building the final [`NFA`] (which has no such "empty" states), but they /// can be quite useful in the construction process of an NFA. /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded. pub fn add_empty(&mut self) -> Result { self.add(State::Empty { next: StateID::ZERO }) } /// Add a "union" NFA state. /// /// A "union" NFA state that contains zero or more unconditional epsilon /// transitions to other NFA states. The order of these transitions /// reflects a priority order where earlier transitions are preferred over /// later transitions. /// /// Callers may provide an empty set of alternates to this method call, and /// then later add transitions via `patch`. At final build time, a "union" /// state with no alternates is converted to a "fail" state, and a "union" /// state with exactly one alternate is treated as if it were an "empty" /// state. /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded. pub fn add_union( &mut self, alternates: Vec, ) -> Result { self.add(State::Union { alternates }) } /// Add a "reverse union" NFA state. /// /// A "reverse union" NFA state contains zero or more unconditional epsilon /// transitions to other NFA states. The order of these transitions /// reflects a priority order where later transitions are preferred /// over earlier transitions. This is an inverted priority order when /// compared to `add_union`. This is useful, for example, for implementing /// non-greedy repetition operators. /// /// Callers may provide an empty set of alternates to this method call, and /// then later add transitions via `patch`. 
At final build time, a "reverse /// union" state with no alternates is converted to a "fail" state, and a /// "reverse union" state with exactly one alternate is treated as if it /// were an "empty" state. /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded. pub fn add_union_reverse( &mut self, alternates: Vec, ) -> Result { self.add(State::UnionReverse { alternates }) } /// Add a "range" NFA state. /// /// A "range" NFA state is a state with one outgoing transition to another /// state, where that transition may only be followed if the current input /// byte falls between a range of bytes given. /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded. pub fn add_range( &mut self, trans: Transition, ) -> Result { self.add(State::ByteRange { trans }) } /// Add a "sparse" NFA state. /// /// A "sparse" NFA state contains zero or more outgoing transitions, where /// the transition to be followed (if any) is chosen based on whether the /// current input byte falls in the range of one such transition. The /// transitions given *must* be non-overlapping and in ascending order. (A /// "sparse" state with no transitions is equivalent to a "fail" state.) /// /// A "sparse" state is like adding a "union" state and pointing it at a /// bunch of "range" states, except that the different alternates have /// equal priority. /// /// Note that a "sparse" state is the only state that cannot be patched. /// This is because a "sparse" state has many transitions, each of which /// may point to a different NFA state. Moreover, adding more such /// transitions requires more than just an NFA state ID to point to. It /// also requires a byte range. The `patch` routine does not support the /// additional information required. 
Therefore, callers must ensure that /// all outgoing transitions for this state are included when `add_sparse` /// is called. There is no way to add more later. /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded. /// /// # Panics /// /// This routine _may_ panic if the transitions given overlap or are not /// in ascending order. pub fn add_sparse( &mut self, transitions: Vec, ) -> Result { self.add(State::Sparse { transitions }) } /// Add a "look" NFA state. /// /// A "look" NFA state corresponds to a state with exactly one /// *conditional* epsilon transition to another NFA state. Namely, it /// represents one of a small set of simplistic look-around operators. /// /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), /// and then change it later with [`patch`](Builder::patch). /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded. pub fn add_look( &mut self, next: StateID, look: Look, ) -> Result { self.add(State::Look { look, next }) } /// Add a "start capture" NFA state. /// /// A "start capture" NFA state corresponds to a state with exactly one /// outgoing unconditional epsilon transition to another state. Unlike /// "empty" states, a "start capture" state also carries with it an /// instruction for saving the current position of input to a particular /// location in memory. NFA simulations, like the Pike VM, may use this /// information to report the match locations of capturing groups in a /// regex pattern. /// /// If the corresponding capturing group has a name, then callers should /// include it here. /// /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), /// and then change it later with [`patch`](Builder::patch). 
///
/// Note that unlike `start_pattern`/`finish_pattern`, capturing start and
/// end states may be interleaved. Indeed, it is typical for many "start
/// capture" NFA states to appear before the first "end capture" state.
///
/// # Errors
///
/// This returns an error if the state identifier space is exhausted, or if
/// the configured heap size limit has been exceeded or if the given
/// capture index overflows `usize`.
///
/// While the above are the only conditions in which this routine can
/// currently return an error, it is possible to call this method with
/// inputs that result in the final `build()` step failing to produce an
/// NFA. For example, if one adds two distinct capturing groups with the
/// same name, then that will result in `build()` failing with an error.
///
/// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for
/// more information on what qualifies as valid capturing groups.
///
/// # Example
///
/// This example shows that an error occurs when one tries to add multiple
/// capturing groups with the same name to the same pattern.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::Builder,
///     util::primitives::StateID,
/// };
///
/// let name = Some(std::sync::Arc::from("foo"));
/// let mut builder = Builder::new();
/// builder.start_pattern()?;
/// // 0th capture group should always be unnamed.
/// let start = builder.add_capture_start(StateID::ZERO, 0, None)?;
/// // OK
/// builder.add_capture_start(StateID::ZERO, 1, name.clone())?;
/// // This is not OK, but 'add_capture_start' still succeeds. We don't
/// // get an error until we call 'build' below. Without this call, the
/// // call to 'build' below would succeed.
/// builder.add_capture_start(StateID::ZERO, 2, name.clone())?;
/// // Finish our pattern so we can try to build the NFA.
/// builder.finish_pattern(start)?;
/// let result = builder.build(start, start);
/// assert!(result.is_err());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// However, adding multiple capturing groups with the same name to
/// distinct patterns is okay:
///
/// ```
/// use std::sync::Arc;
///
/// use regex_automata::{
///     nfa::thompson::{pikevm::PikeVM, Builder, Transition},
///     util::{
///         captures::Captures,
///         primitives::{PatternID, StateID},
///     },
///     Span,
/// };
///
/// // Hand-compile the patterns '(?P<foo>[a-z])' and '(?P<foo>[A-Z])'.
/// let mut builder = Builder::new();
/// // We compile them to support an unanchored search, which requires
/// // adding an implicit '(?s-u:.)*?' prefix before adding either pattern.
/// let unanchored_prefix = builder.add_union_reverse(vec![])?;
/// let any = builder.add_range(Transition {
///     start: b'\x00', end: b'\xFF', next: StateID::ZERO,
/// })?;
/// builder.patch(unanchored_prefix, any)?;
/// builder.patch(any, unanchored_prefix)?;
///
/// // Compile an alternation that permits matching multiple patterns.
/// let alt = builder.add_union(vec![])?;
/// builder.patch(unanchored_prefix, alt)?;
///
/// // Compile '(?P<foo>[a-z]+)'.
/// builder.start_pattern()?;
/// let start0 = builder.add_capture_start(StateID::ZERO, 0, None)?;
/// // N.B. 0th capture group must always be unnamed.
/// let foo_start0 = builder.add_capture_start(
///     StateID::ZERO, 1, Some(Arc::from("foo")),
/// )?;
/// let lowercase = builder.add_range(Transition {
///     start: b'a', end: b'z', next: StateID::ZERO,
/// })?;
/// let foo_end0 = builder.add_capture_end(StateID::ZERO, 1)?;
/// let end0 = builder.add_capture_end(StateID::ZERO, 0)?;
/// let match0 = builder.add_match()?;
/// builder.patch(start0, foo_start0)?;
/// builder.patch(foo_start0, lowercase)?;
/// builder.patch(lowercase, foo_end0)?;
/// builder.patch(foo_end0, end0)?;
/// builder.patch(end0, match0)?;
/// builder.finish_pattern(start0)?;
///
/// // Compile '(?P<foo>[A-Z]+)'.
/// builder.start_pattern()?;
/// let start1 = builder.add_capture_start(StateID::ZERO, 0, None)?;
/// // N.B. 0th capture group must always be unnamed.
/// let foo_start1 = builder.add_capture_start(
///     StateID::ZERO, 1, Some(Arc::from("foo")),
/// )?;
/// let uppercase = builder.add_range(Transition {
///     start: b'A', end: b'Z', next: StateID::ZERO,
/// })?;
/// let foo_end1 = builder.add_capture_end(StateID::ZERO, 1)?;
/// let end1 = builder.add_capture_end(StateID::ZERO, 0)?;
/// let match1 = builder.add_match()?;
/// builder.patch(start1, foo_start1)?;
/// builder.patch(foo_start1, uppercase)?;
/// builder.patch(uppercase, foo_end1)?;
/// builder.patch(foo_end1, end1)?;
/// builder.patch(end1, match1)?;
/// builder.finish_pattern(start1)?;
///
/// // Now add the patterns to our alternation that we started above.
/// builder.patch(alt, start0)?;
/// builder.patch(alt, start1)?;
///
/// // Finally build the NFA. The first argument is the anchored starting
/// // state (the pattern alternation) where as the second is the
/// // unanchored starting state (the unanchored prefix).
/// let nfa = builder.build(alt, unanchored_prefix)?;
///
/// // Now build a Pike VM from our NFA and access the 'foo' capture
/// // group regardless of which pattern matched, since it is defined
/// // for both patterns.
/// let vm = PikeVM::new_from_nfa(nfa)?; /// let mut cache = vm.create_cache(); /// let caps: Vec = /// vm.captures_iter(&mut cache, "0123aAaAA").collect(); /// assert_eq!(5, caps.len()); /// /// assert_eq!(Some(PatternID::must(0)), caps[0].pattern()); /// assert_eq!(Some(Span::from(4..5)), caps[0].get_group_by_name("foo")); /// /// assert_eq!(Some(PatternID::must(1)), caps[1].pattern()); /// assert_eq!(Some(Span::from(5..6)), caps[1].get_group_by_name("foo")); /// /// assert_eq!(Some(PatternID::must(0)), caps[2].pattern()); /// assert_eq!(Some(Span::from(6..7)), caps[2].get_group_by_name("foo")); /// /// assert_eq!(Some(PatternID::must(1)), caps[3].pattern()); /// assert_eq!(Some(Span::from(7..8)), caps[3].get_group_by_name("foo")); /// /// assert_eq!(Some(PatternID::must(1)), caps[4].pattern()); /// assert_eq!(Some(Span::from(8..9)), caps[4].get_group_by_name("foo")); /// /// # Ok::<(), Box>(()) /// ``` pub fn add_capture_start( &mut self, next: StateID, group_index: u32, name: Option>, ) -> Result { let pid = self.current_pattern_id(); let group_index = match SmallIndex::try_from(group_index) { Err(_) => { return Err(BuildError::invalid_capture_index(group_index)) } Ok(group_index) => group_index, }; // Make sure we have space to insert our (pid,index)|-->name mapping. if pid.as_usize() >= self.captures.len() { for _ in 0..=(pid.as_usize() - self.captures.len()) { self.captures.push(vec![]); } } // In the case where 'group_index < self.captures[pid].len()', it means // that we are adding a duplicate capture group. This is somewhat // weird, but permissible because the capture group itself can be // repeated in the syntax. For example, '([a-z]){4}' will produce 4 // capture groups. In practice, only the last will be set at search // time when a match occurs. For duplicates, we don't need to push // anything other than a CaptureStart NFA state. 
if group_index.as_usize() >= self.captures[pid].len() { // For discontiguous indices, push placeholders for earlier capture // groups that weren't explicitly added. for _ in 0..(group_index.as_usize() - self.captures[pid].len()) { self.captures[pid].push(None); } self.captures[pid].push(name); } self.add(State::CaptureStart { pattern_id: pid, group_index, next }) } /// Add a "end capture" NFA state. /// /// A "end capture" NFA state corresponds to a state with exactly one /// outgoing unconditional epsilon transition to another state. Unlike /// "empty" states, a "end capture" state also carries with it an /// instruction for saving the current position of input to a particular /// location in memory. NFA simulations, like the Pike VM, may use this /// information to report the match locations of capturing groups in a /// /// Callers may provide a "dummy" state ID (typically [`StateID::ZERO`]), /// and then change it later with [`patch`](Builder::patch). /// /// Note that unlike `start_pattern`/`finish_pattern`, capturing start and /// end states may be interleaved. Indeed, it is typical for many "start /// capture" NFA states to appear before the first "end capture" state. /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded or if the given /// capture index overflows `usize`. /// /// While the above are the only conditions in which this routine can /// currently return an error, it is possible to call this method with an /// inputs that results in the final `build()` step failing to produce an /// NFA. For example, if one adds two distinct capturing groups with the /// same name, then that will result in `build()` failing with an error. /// /// See the [`GroupInfo`](crate::util::captures::GroupInfo) type for /// more information on what qualifies as valid capturing groups. 
pub fn add_capture_end( &mut self, next: StateID, group_index: u32, ) -> Result { let pid = self.current_pattern_id(); let group_index = match SmallIndex::try_from(group_index) { Err(_) => { return Err(BuildError::invalid_capture_index(group_index)) } Ok(group_index) => group_index, }; self.add(State::CaptureEnd { pattern_id: pid, group_index, next }) } /// Adds a "fail" NFA state. /// /// A "fail" state is simply a state that has no outgoing transitions. It /// acts as a way to cause a search to stop without reporting a match. /// For example, one way to represent an NFA with zero patterns is with a /// single "fail" state. /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded. pub fn add_fail(&mut self) -> Result { self.add(State::Fail) } /// Adds a "match" NFA state. /// /// A "match" state has no outgoing transitions (just like a "fail" /// state), but it has special significance in that if a search enters /// this state, then a match has been found. The match state that is added /// automatically has the current pattern ID associated with it. This is /// used to report the matching pattern ID at search time. /// /// # Errors /// /// This returns an error if the state identifier space is exhausted, or if /// the configured heap size limit has been exceeded. /// /// # Panics /// /// This must be called after a `start_pattern` call but before the /// corresponding `finish_pattern` call. Otherwise, it panics. pub fn add_match(&mut self) -> Result { let pattern_id = self.current_pattern_id(); let sid = self.add(State::Match { pattern_id })?; Ok(sid) } /// The common implementation of "add a state." It handles the common /// error cases of state ID exhausting (by owning state ID allocation) and /// whether the size limit has been exceeded. 
fn add(&mut self, state: State) -> Result { let id = StateID::new(self.states.len()) .map_err(|_| BuildError::too_many_states(self.states.len()))?; self.memory_states += state.memory_usage(); self.states.push(state); self.check_size_limit()?; Ok(id) } /// Add a transition from one state to another. /// /// This routine is called "patch" since it is very common to add the /// states you want, typically with "dummy" state ID transitions, and then /// "patch" in the real state IDs later. This is because you don't always /// know all of the necessary state IDs to add because they might not /// exist yet. /// /// # Errors /// /// This may error if patching leads to an increase in heap usage beyond /// the configured size limit. Heap usage only grows when patching adds a /// new transition (as in the case of a "union" state). /// /// # Panics /// /// This panics if `from` corresponds to a "sparse" state. When "sparse" /// states are added, there is no way to patch them after-the-fact. (If you /// have a use case where this would be helpful, please file an issue. It /// will likely require a new API.) pub fn patch( &mut self, from: StateID, to: StateID, ) -> Result<(), BuildError> { let old_memory_states = self.memory_states; match self.states[from] { State::Empty { ref mut next } => { *next = to; } State::ByteRange { ref mut trans } => { trans.next = to; } State::Sparse { .. } => { panic!("cannot patch from a sparse NFA state") } State::Look { ref mut next, .. } => { *next = to; } State::Union { ref mut alternates } => { alternates.push(to); self.memory_states += mem::size_of::(); } State::UnionReverse { ref mut alternates } => { alternates.push(to); self.memory_states += mem::size_of::(); } State::CaptureStart { ref mut next, .. } => { *next = to; } State::CaptureEnd { ref mut next, .. } => { *next = to; } State::Fail => {} State::Match { .. 
} => {} } if old_memory_states != self.memory_states { self.check_size_limit()?; } Ok(()) } /// Set whether the NFA produced by this builder should only match UTF-8. /// /// This should be set when both of the following are true: /// /// 1. The caller guarantees that the NFA created by this build will only /// report non-empty matches with spans that are valid UTF-8. /// 2. The caller desires regex engines using this NFA to avoid reporting /// empty matches with a span that splits a valid UTF-8 encoded codepoint. /// /// Property (1) is not checked. Instead, this requires the caller to /// promise that it is true. Property (2) corresponds to the behavior of /// regex engines using the NFA created by this builder. Namely, there /// is no way in the NFA's graph itself to say that empty matches found /// by, for example, the regex `a*` will fall on valid UTF-8 boundaries. /// Instead, this option is used to communicate the UTF-8 semantic to regex /// engines that will typically implement it as a post-processing step by /// filtering out empty matches that don't fall on UTF-8 boundaries. /// /// If you're building an NFA from an HIR (and not using a /// [`thompson::Compiler`](crate::nfa::thompson::Compiler)), then you can /// use the [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) /// option to guarantee that if the HIR detects a non-empty match, then it /// is guaranteed to be valid UTF-8. /// /// Note that property (2) does *not* specify the behavior of executing /// a search on a haystack that is not valid UTF-8. Therefore, if you're /// *not* running this NFA on strings that are guaranteed to be valid /// UTF-8, you almost certainly do not want to enable this option. /// Similarly, if you are running the NFA on strings that *are* guaranteed /// to be valid UTF-8, then you almost certainly want to enable this option /// unless you can guarantee that your NFA will never produce a zero-width /// match. /// /// It is disabled by default. 
pub fn set_utf8(&mut self, yes: bool) {
        self.utf8 = yes;
    }

    /// Returns whether UTF-8 mode is enabled for this builder.
    ///
    /// See [`Builder::set_utf8`] for more details about what "UTF-8 mode" is.
    pub fn get_utf8(&self) -> bool {
        self.utf8
    }

    /// Sets whether the NFA produced by this builder should be matched in
    /// reverse or not. Generally speaking, when enabled, the NFA produced
    /// should be matched by moving backwards through a haystack, from a higher
    /// memory address to a lower memory address.
    ///
    /// See also [`NFA::is_reverse`] for more details.
    ///
    /// This is disabled by default, which means NFAs are by default matched
    /// in the forward direction.
    pub fn set_reverse(&mut self, yes: bool) {
        self.reverse = yes;
    }

    /// Returns whether reverse mode is enabled for this builder.
    ///
    /// See [`Builder::set_reverse`] for more details about what "reverse mode"
    /// is.
    pub fn get_reverse(&self) -> bool {
        self.reverse
    }

    /// Sets the look-around matcher that should be used for the resulting NFA.
    ///
    /// A look-around matcher can be used to configure how look-around
    /// assertions are matched. For example, a matcher might carry
    /// configuration that changes the line terminator used for `(?m:^)` and
    /// `(?m:$)` assertions.
    pub fn set_look_matcher(&mut self, m: LookMatcher) {
        self.look_matcher = m;
    }

    /// Returns the look-around matcher used for this builder.
    ///
    /// If a matcher was not explicitly set, then `LookMatcher::default()` is
    /// returned.
    pub fn get_look_matcher(&self) -> &LookMatcher {
        &self.look_matcher
    }

    /// Set the size limit on this builder.
    ///
    /// Setting the size limit will also check whether the NFA built so far
    /// fits within the given size limit. If it doesn't, then an error is
    /// returned.
    ///
    /// Note that the new limit is stored before the check runs, so it remains
    /// in effect even when this returns an error.
    ///
    /// By default, there is no configured size limit.
    pub fn set_size_limit(
        &mut self,
        limit: Option<usize>,
    ) -> Result<(), BuildError> {
        self.size_limit = limit;
        self.check_size_limit()
    }

    /// Return the currently configured size limit.
/// /// By default, this returns `None`, which corresponds to no configured /// size limit. pub fn get_size_limit(&self) -> Option { self.size_limit } /// Returns the heap memory usage, in bytes, used by the NFA states added /// so far. /// /// Note that this is an approximation of how big the final NFA will be. /// In practice, the final NFA will likely be a bit smaller because of /// its simpler state representation. (For example, using things like /// `Box<[StateID]>` instead of `Vec`.) pub fn memory_usage(&self) -> usize { self.states.len() * mem::size_of::() + self.memory_states } fn check_size_limit(&self) -> Result<(), BuildError> { if let Some(limit) = self.size_limit { if self.memory_usage() > limit { return Err(BuildError::exceeded_size_limit(limit)); } } Ok(()) } } #[cfg(test)] mod tests { use super::*; // This asserts that a builder state doesn't have its size changed. It is // *really* easy to accidentally increase the size, and thus potentially // dramatically increase the memory usage of NFA builder. // // This assert doesn't mean we absolutely cannot increase the size of a // builder state. We can. It's just here to make sure we do it knowingly // and intentionally. // // A builder state is unfortunately a little bigger than an NFA state, // since we really want to support adding things to a pre-existing state. // i.e., We use Vec instead of Box<[thing]>. So we end up using an // extra 8 bytes per state. Sad, but at least it gets freed once the NFA // is built. 
#[test]
    fn state_has_small_size() {
        #[cfg(target_pointer_width = "64")]
        assert_eq!(32, core::mem::size_of::<State>());
        #[cfg(target_pointer_width = "32")]
        assert_eq!(16, core::mem::size_of::<State>());
    }
}
regex-automata-0.4.9/src/nfa/thompson/compiler.rs000064400000000000000000002501251046102023000201500ustar 00000000000000
use core::{borrow::Borrow, cell::RefCell};

use alloc::{sync::Arc, vec, vec::Vec};

use regex_syntax::{
    hir::{self, Hir},
    utf8::{Utf8Range, Utf8Sequences},
    ParserBuilder,
};

use crate::{
    nfa::thompson::{
        builder::Builder,
        error::BuildError,
        literal_trie::LiteralTrie,
        map::{Utf8BoundedMap, Utf8SuffixKey, Utf8SuffixMap},
        nfa::{Transition, NFA},
        range_trie::RangeTrie,
    },
    util::{
        look::{Look, LookMatcher},
        primitives::{PatternID, StateID},
    },
};

/// The configuration used for a Thompson NFA compiler.
///
/// Every option is stored as an `Option` so that "not set" can be told apart
/// from an explicit value; the `get_*` accessors supply the defaults.
#[derive(Clone, Debug, Default)]
pub struct Config {
    utf8: Option<bool>,
    reverse: Option<bool>,
    // The outer `Option` tracks whether the limit was configured at all; the
    // inner one is the limit itself, where `None` means "no limit".
    nfa_size_limit: Option<Option<usize>>,
    shrink: Option<bool>,
    which_captures: Option<WhichCaptures>,
    look_matcher: Option<LookMatcher>,
    #[cfg(test)]
    unanchored_prefix: Option<bool>,
}

impl Config {
    /// Return a new default Thompson NFA compiler configuration.
    pub fn new() -> Config {
        Config::default()
    }

    /// Whether to enable UTF-8 mode during search or not.
    ///
    /// A regex engine is said to be in UTF-8 mode when it guarantees that
    /// all matches returned by it have spans consisting of only valid UTF-8.
    /// That is, it is impossible for a match span to be returned that
    /// contains any invalid UTF-8.
    ///
    /// UTF-8 mode generally consists of two things:
    ///
    /// 1. Whether the NFA's states are constructed such that all paths to a
    /// match state that consume at least one byte always correspond to valid
    /// UTF-8.
    /// 2. Whether all paths to a match state that do _not_ consume any bytes
    /// should always correspond to valid UTF-8 boundaries.
    ///
    /// (1) is a guarantee made by whoever constructs the NFA.
/// If you're parsing a regex from its concrete syntax, then /// [`syntax::Config::utf8`](crate::util::syntax::Config::utf8) can make /// this guarantee for you. It does it by returning an error if the regex /// pattern could every report a non-empty match span that contains invalid /// UTF-8. So long as `syntax::Config::utf8` mode is enabled and your regex /// successfully parses, then you're guaranteed that the corresponding NFA /// will only ever report non-empty match spans containing valid UTF-8. /// /// (2) is a trickier guarantee because it cannot be enforced by the NFA /// state graph itself. Consider, for example, the regex `a*`. It matches /// the empty strings in `☃` at positions `0`, `1`, `2` and `3`, where /// positions `1` and `2` occur within the UTF-8 encoding of a codepoint, /// and thus correspond to invalid UTF-8 boundaries. Therefore, this /// guarantee must be made at a higher level than the NFA state graph /// itself. This crate deals with this case in each regex engine. Namely, /// when a zero-width match that splits a codepoint is found and UTF-8 /// mode enabled, then it is ignored and the engine moves on looking for /// the next match. /// /// Thus, UTF-8 mode is both a promise that the NFA built only reports /// non-empty matches that are valid UTF-8, and an *instruction* to regex /// engines that empty matches that split codepoints should be banned. /// /// Because UTF-8 mode is fundamentally about avoiding invalid UTF-8 spans, /// it only makes sense to enable this option when you *know* your haystack /// is valid UTF-8. (For example, a `&str`.) Enabling UTF-8 mode and /// searching a haystack that contains invalid UTF-8 leads to **unspecified /// behavior**. /// /// Therefore, it may make sense to enable `syntax::Config::utf8` while /// simultaneously *disabling* this option. 
That would ensure all non-empty /// match spans are valid UTF-8, but that empty match spans may still split /// a codepoint or match at other places that aren't valid UTF-8. /// /// In general, this mode is only relevant if your regex can match the /// empty string. Most regexes don't. /// /// This is enabled by default. /// /// # Example /// /// This example shows how UTF-8 mode can impact the match spans that may /// be reported in certain cases. /// /// ``` /// use regex_automata::{ /// nfa::thompson::{self, pikevm::PikeVM}, /// Match, Input, /// }; /// /// let re = PikeVM::new("")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// // UTF-8 mode is enabled by default. /// let mut input = Input::new("☃"); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match()); /// /// // Even though an empty regex matches at 1..1, our next match is /// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is /// // three bytes long). /// input.set_start(1); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); /// /// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2: /// let re = PikeVM::builder() /// .thompson(thompson::Config::new().utf8(false)) /// .build("")?; /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match()); /// /// input.set_start(2); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match()); /// /// input.set_start(3); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match()); /// /// input.set_start(4); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(None, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn utf8(mut self, yes: bool) -> Config { self.utf8 = Some(yes); self } /// Reverse the NFA. 
///
    /// An NFA reversal is performed by reversing all of the concatenated
    /// sub-expressions in the original pattern, recursively. (Look around
    /// operators are also inverted.) The resulting NFA can be used to match
    /// the pattern starting from the end of a string instead of the beginning
    /// of a string.
    ///
    /// Reversing the NFA is useful for building a reverse DFA, which is most
    /// useful for finding the start of a match after its ending position has
    /// been found. NFA execution engines typically do not work on reverse
    /// NFAs. For example, currently, the Pike VM reports the starting location
    /// of matches without a reverse NFA.
    ///
    /// Currently, enabling this setting requires disabling the
    /// [`captures`](Config::captures) setting. If both are enabled, then the
    /// compiler will return an error. It is expected that this limitation will
    /// be lifted in the future.
    ///
    /// This is disabled by default.
    ///
    /// # Example
    ///
    /// This example shows how to build a DFA from a reverse NFA, and then use
    /// the DFA to search backwards.
    ///
    /// ```
    /// use regex_automata::{
    ///     dfa::{self, Automaton},
    ///     nfa::thompson::{NFA, WhichCaptures},
    ///     HalfMatch, Input,
    /// };
    ///
    /// let dfa = dfa::dense::Builder::new()
    ///     .thompson(NFA::config()
    ///         .which_captures(WhichCaptures::None)
    ///         .reverse(true)
    ///     )
    ///     .build("baz[0-9]+")?;
    /// let expected = Some(HalfMatch::must(0, 3));
    /// assert_eq!(
    ///     expected,
    ///     dfa.try_search_rev(&Input::new("foobaz12345bar"))?,
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reverse(self, yes: bool) -> Config {
        // Record an explicit choice; every other option is carried over.
        Config { reverse: Some(yes), ..self }
    }

    /// Sets an approximate size limit on the total heap used by the NFA being
    /// compiled.
    ///
    /// This permits imposing constraints on the size of a compiled NFA. This
    /// may be useful in contexts where the regex pattern is untrusted and one
    /// wants to avoid using too much memory.
/// /// This size limit does not apply to auxiliary heap used during /// compilation that is not part of the built NFA. /// /// Note that this size limit is applied during compilation in order for /// the limit to prevent too much heap from being used. However, the /// implementation may use an intermediate NFA representation that is /// otherwise slightly bigger than the final public form. Since the size /// limit may be applied to an intermediate representation, there is not /// necessarily a precise correspondence between the configured size limit /// and the heap usage of the final NFA. /// /// There is no size limit by default. /// /// # Example /// /// This example demonstrates how Unicode mode can greatly increase the /// size of the NFA. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::nfa::thompson::NFA; /// /// // 400KB isn't enough! /// NFA::compiler() /// .configure(NFA::config().nfa_size_limit(Some(400_000))) /// .build(r"\w{20}") /// .unwrap_err(); /// /// // ... but 500KB probably is. /// let nfa = NFA::compiler() /// .configure(NFA::config().nfa_size_limit(Some(500_000))) /// .build(r"\w{20}")?; /// /// assert_eq!(nfa.pattern_len(), 1); /// /// # Ok::<(), Box>(()) /// ``` pub fn nfa_size_limit(mut self, bytes: Option) -> Config { self.nfa_size_limit = Some(bytes); self } /// Apply best effort heuristics to shrink the NFA at the expense of more /// time/memory. /// /// Generally speaking, if one is using an NFA to compile a DFA, then the /// extra time used to shrink the NFA will be more than made up for during /// DFA construction (potentially by a lot). In other words, enabling this /// can substantially decrease the overall amount of time it takes to build /// a DFA. /// /// A reason to keep this disabled is if you want to compile an NFA and /// start using it as quickly as possible without needing to build a DFA, /// and you don't mind using a bit of extra memory for the NFA. 
e.g., for /// an NFA simulation or for a lazy DFA. /// /// NFA shrinking is currently most useful when compiling a reverse /// NFA with large Unicode character classes. In particular, it trades /// additional CPU time during NFA compilation in favor of generating fewer /// NFA states. /// /// This is disabled by default because it can increase compile times /// quite a bit if you aren't building a full DFA. /// /// # Example /// /// This example shows that NFA shrinking can lead to substantial space /// savings in some cases. Notice that, as noted above, we build a reverse /// DFA and use a pattern with a large Unicode character class. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Currently we have to disable captures when enabling reverse NFA. /// let config = NFA::config() /// .which_captures(WhichCaptures::None) /// .reverse(true); /// let not_shrunk = NFA::compiler() /// .configure(config.clone().shrink(false)) /// .build(r"\w")?; /// let shrunk = NFA::compiler() /// .configure(config.clone().shrink(true)) /// .build(r"\w")?; /// /// // While a specific shrink factor is not guaranteed, the savings can be /// // considerable in some cases. /// assert!(shrunk.states().len() * 2 < not_shrunk.states().len()); /// /// # Ok::<(), Box>(()) /// ``` pub fn shrink(mut self, yes: bool) -> Config { self.shrink = Some(yes); self } /// Whether to include 'Capture' states in the NFA. /// /// Currently, enabling this setting requires disabling the /// [`reverse`](Config::reverse) setting. If both are enabled, then the /// compiler will return an error. It is expected that this limitation will /// be lifted in the future. /// /// This is enabled by default. /// /// # Example /// /// This example demonstrates that some regex engines, like the Pike VM, /// require capturing states to be present in the NFA to report match /// offsets. 
///
    /// (Note that since this method is deprecated, the example below uses
    /// [`Config::which_captures`] to disable capture states.)
    ///
    /// ```
    /// use regex_automata::nfa::thompson::{
    ///     pikevm::PikeVM,
    ///     NFA,
    ///     WhichCaptures,
    /// };
    ///
    /// let re = PikeVM::builder()
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
    ///     .build(r"[a-z]+")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.is_match(&mut cache, "abc"));
    /// assert_eq!(None, re.find(&mut cache, "abc"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[deprecated(since = "0.3.5", note = "use which_captures instead")]
    pub fn captures(self, yes: bool) -> Config {
        // The boolean maps onto the two extremes of `WhichCaptures`.
        let which =
            if yes { WhichCaptures::All } else { WhichCaptures::None };
        self.which_captures(which)
    }

    /// Configures what kinds of capture groups are compiled into
    /// [`State::Capture`](crate::nfa::thompson::State::Capture) states in a
    /// Thompson NFA.
    ///
    /// Currently, using any option except for [`WhichCaptures::None`] requires
    /// disabling the [`reverse`](Config::reverse) setting. If both are
    /// enabled, then the compiler will return an error. It is expected that
    /// this limitation will be lifted in the future.
    ///
    /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
    /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
    /// overhead of capture states for explicit groups. Usually this occurs
    /// when one wants to use the `PikeVM` only for determining the overall
    /// match. Otherwise, the `PikeVM` could use much more memory than is
    /// necessary.
    ///
    /// # Example
    ///
    /// This example demonstrates that some regex engines, like the Pike VM,
    /// require capturing states to be present in the NFA to report match
    /// offsets.
///
    /// ```
    /// use regex_automata::nfa::thompson::{
    ///     pikevm::PikeVM,
    ///     NFA,
    ///     WhichCaptures,
    /// };
    ///
    /// let re = PikeVM::builder()
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
    ///     .build(r"[a-z]+")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.is_match(&mut cache, "abc"));
    /// assert_eq!(None, re.find(&mut cache, "abc"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// The same applies to the bounded backtracker:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::{
    ///     backtrack::BoundedBacktracker,
    ///     NFA,
    ///     WhichCaptures,
    /// };
    ///
    /// let re = BoundedBacktracker::builder()
    ///     .thompson(NFA::config().which_captures(WhichCaptures::None))
    ///     .build(r"[a-z]+")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.try_is_match(&mut cache, "abc")?);
    /// assert_eq!(None, re.try_find(&mut cache, "abc")?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn which_captures(self, which_captures: WhichCaptures) -> Config {
        // Record an explicit choice; every other option is carried over.
        Config { which_captures: Some(which_captures), ..self }
    }

    /// Sets the look-around matcher that should be used with this NFA.
    ///
    /// A look-around matcher determines how to match look-around assertions.
    /// In particular, some assertions are configurable. For example, the
    /// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
    /// from the default of `\n` to any other byte.
    ///
    /// # Example
    ///
    /// This shows how to change the line terminator for multi-line assertions.
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::{self, pikevm::PikeVM},
    ///     util::look::LookMatcher,
    ///     Match, Input,
    /// };
    ///
    /// let mut lookm = LookMatcher::new();
    /// lookm.set_line_terminator(b'\x00');
    ///
    /// let re = PikeVM::builder()
    ///     .thompson(thompson::Config::new().look_matcher(lookm))
    ///     .build(r"(?m)^[a-z]+$")?;
    /// let mut cache = re.create_cache();
    ///
    /// // Multi-line assertions now use NUL as a terminator.
/// assert_eq!(
    ///     Some(Match::must(0, 1..4)),
    ///     re.find(&mut cache, b"\x00abc\x00"),
    /// );
    /// // ... and \n is no longer recognized as a terminator.
    /// assert_eq!(
    ///     None,
    ///     re.find(&mut cache, b"\nabc\n"),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn look_matcher(self, m: LookMatcher) -> Config {
        // Record an explicit choice; every other option is carried over.
        Config { look_matcher: Some(m), ..self }
    }

    /// Whether to compile an unanchored prefix into this NFA.
    ///
    /// This is enabled by default. It is made available for tests only to make
    /// it easier to unit test the output of the compiler.
    #[cfg(test)]
    fn unanchored_prefix(self, yes: bool) -> Config {
        Config { unanchored_prefix: Some(yes), ..self }
    }

    /// Returns whether this configuration has enabled UTF-8 mode.
    pub fn get_utf8(&self) -> bool {
        self.utf8.unwrap_or(true)
    }

    /// Returns whether this configuration has enabled reverse NFA compilation.
    pub fn get_reverse(&self) -> bool {
        self.reverse.unwrap_or(false)
    }

    /// Return the configured NFA size limit, if it exists, in the number of
    /// bytes of heap used.
    pub fn get_nfa_size_limit(&self) -> Option<usize> {
        // "Never configured" and "configured as no limit" both yield `None`.
        self.nfa_size_limit.flatten()
    }

    /// Return whether NFA shrinking is enabled.
    pub fn get_shrink(&self) -> bool {
        self.shrink.unwrap_or(false)
    }

    /// Return whether NFA compilation is configured to produce capture states.
    #[deprecated(since = "0.3.5", note = "use get_which_captures instead")]
    pub fn get_captures(&self) -> bool {
        self.get_which_captures().is_any()
    }

    /// Return what kinds of capture states will be compiled into an NFA.
    pub fn get_which_captures(&self) -> WhichCaptures {
        self.which_captures.unwrap_or(WhichCaptures::All)
    }

    /// Return the look-around matcher for this NFA.
    ///
    /// If no matcher was configured, `LookMatcher::default()` is returned.
    pub fn get_look_matcher(&self) -> LookMatcher {
        self.look_matcher.clone().unwrap_or_default()
    }

    /// Return whether NFA compilation is configured to include an unanchored
    /// prefix.
    ///
    /// This is always true when not in test mode.
fn get_unanchored_prefix(&self) -> bool { #[cfg(test)] { self.unanchored_prefix.unwrap_or(true) } #[cfg(not(test))] { true } } /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. pub(crate) fn overwrite(&self, o: Config) -> Config { Config { utf8: o.utf8.or(self.utf8), reverse: o.reverse.or(self.reverse), nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit), shrink: o.shrink.or(self.shrink), which_captures: o.which_captures.or(self.which_captures), look_matcher: o.look_matcher.or_else(|| self.look_matcher.clone()), #[cfg(test)] unanchored_prefix: o.unanchored_prefix.or(self.unanchored_prefix), } } } /// A configuration indicating which kinds of /// [`State::Capture`](crate::nfa::thompson::State::Capture) states to include. /// /// This configuration can be used with [`Config::which_captures`] to control /// which capture states are compiled into a Thompson NFA. /// /// The default configuration is [`WhichCaptures::All`]. #[derive(Clone, Copy, Debug)] pub enum WhichCaptures { /// All capture states, including those corresponding to both implicit and /// explicit capture groups, are included in the Thompson NFA. All, /// Only capture states corresponding to implicit capture groups are /// included. Implicit capture groups appear in every pattern implicitly /// and correspond to the overall match of a pattern. /// /// This is useful when one only cares about the overall match of a /// pattern. By excluding capture states from explicit capture groups, /// one might be able to reduce the memory usage of a multi-pattern regex /// substantially if it was otherwise written to have many explicit capture /// groups. Implicit, /// No capture states are compiled into the Thompson NFA. 
/// /// This is useful when capture states are either not needed (for example, /// if one is only trying to build a DFA) or if they aren't supported (for /// example, a reverse NFA). None, } impl Default for WhichCaptures { fn default() -> WhichCaptures { WhichCaptures::All } } impl WhichCaptures { /// Returns true if this configuration indicates that no capture states /// should be produced in an NFA. pub fn is_none(&self) -> bool { matches!(*self, WhichCaptures::None) } /// Returns true if this configuration indicates that some capture states /// should be added to an NFA. Note that this might only include capture /// states for implicit capture groups. pub fn is_any(&self) -> bool { !self.is_none() } } /* This compiler below uses Thompson's construction algorithm. The compiler takes a regex-syntax::Hir as input and emits an NFA graph as output. The NFA graph is structured in a way that permits it to be executed by a virtual machine and also used to efficiently build a DFA. The compiler deals with a slightly expanded set of NFA states than what is in a final NFA (as exhibited by builder::State and nfa::State). Notably a compiler state includes an empty node that has exactly one unconditional epsilon transition to the next state. In other words, it's a "goto" instruction if one views Thompson's NFA as a set of bytecode instructions. These goto instructions are removed in a subsequent phase before returning the NFA to the caller. The purpose of these empty nodes is that they make the construction algorithm substantially simpler to implement. We remove them before returning to the caller because they can represent substantial overhead when traversing the NFA graph (either while searching using the NFA directly or while building a DFA). In the future, it would be nice to provide a Glushkov compiler as well, as it would work well as a bit-parallel NFA for smaller regexes. 
But the Thompson construction is one I'm more familiar with and seems more straight-forward to deal with when it comes to large Unicode character classes. Internally, the compiler uses interior mutability to improve composition in the face of the borrow checker. In particular, we'd really like to be able to write things like this: self.c_concat(exprs.iter().map(|e| self.c(e))) Which elegantly uses iterators to build up a sequence of compiled regex sub-expressions and then hands it off to the concatenating compiler routine. Without interior mutability, the borrow checker won't let us borrow `self` mutably both inside and outside the closure at the same time. */ /// A builder for compiling an NFA from a regex's high-level intermediate /// representation (HIR). /// /// This compiler provides a way to translate a parsed regex pattern into an /// NFA state graph. The NFA state graph can either be used directly to execute /// a search (e.g., with a Pike VM), or it can be further used to build a DFA. /// /// This compiler provides APIs both for compiling regex patterns directly from /// their concrete syntax, or via a [`regex_syntax::hir::Hir`]. /// /// This compiler has various options that may be configured via /// [`thompson::Config`](Config). /// /// Note that a compiler is not the same as a [`thompson::Builder`](Builder). /// A `Builder` provides a lower level API that is uncoupled from a regex /// pattern's concrete syntax or even its HIR. Instead, it permits stitching /// together an NFA by hand. See its docs for examples. /// /// # Example: compilation from concrete syntax /// /// This shows how to compile an NFA from a pattern string while setting a size /// limit on how big the NFA is allowed to be (in terms of bytes of heap used). 
/// /// ``` /// use regex_automata::{ /// nfa::thompson::{NFA, pikevm::PikeVM}, /// Match, /// }; /// /// let config = NFA::config().nfa_size_limit(Some(1_000)); /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; /// /// let re = PikeVM::new_from_nfa(nfa)?; /// let mut cache = re.create_cache(); /// let mut caps = re.create_captures(); /// let expected = Some(Match::must(0, 3..4)); /// re.captures(&mut cache, "!@#A#@!", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: compilation from HIR /// /// This shows how to hand assemble a regular expression via its HIR, and then /// compile an NFA directly from it. /// /// ``` /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; /// /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ /// ClassBytesRange::new(b'0', b'9'), /// ClassBytesRange::new(b'A', b'Z'), /// ClassBytesRange::new(b'_', b'_'), /// ClassBytesRange::new(b'a', b'z'), /// ]))); /// /// let config = NFA::config().nfa_size_limit(Some(1_000)); /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; /// /// let re = PikeVM::new_from_nfa(nfa)?; /// let mut cache = re.create_cache(); /// let mut caps = re.create_captures(); /// let expected = Some(Match::must(0, 3..4)); /// re.captures(&mut cache, "!@#A#@!", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Compiler { /// A regex parser, used when compiling an NFA directly from a pattern /// string. parser: ParserBuilder, /// The compiler configuration. config: Config, /// The builder for actually constructing an NFA. This provides a /// convenient abstraction for writing a compiler. builder: RefCell, /// State used for compiling character classes to UTF-8 byte automata. /// State is not retained between character class compilations. 
This just /// serves to amortize allocation to the extent possible. utf8_state: RefCell, /// State used for arranging character classes in reverse into a trie. trie_state: RefCell, /// State used for caching common suffixes when compiling reverse UTF-8 /// automata (for Unicode character classes). utf8_suffix: RefCell, } impl Compiler { /// Create a new NFA builder with its default configuration. pub fn new() -> Compiler { Compiler { parser: ParserBuilder::new(), config: Config::default(), builder: RefCell::new(Builder::new()), utf8_state: RefCell::new(Utf8State::new()), trie_state: RefCell::new(RangeTrie::new()), utf8_suffix: RefCell::new(Utf8SuffixMap::new(1000)), } } /// Compile the given regular expression pattern into an NFA. /// /// If there was a problem parsing the regex, then that error is returned. /// /// Otherwise, if there was a problem building the NFA, then an error is /// returned. The only error that can occur is if the compiled regex would /// exceed the size limits configured on this builder, or if any part of /// the NFA would exceed the integer representations used. (For example, /// too many states might plausibly occur on a 16-bit target.) /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; /// /// let config = NFA::config().nfa_size_limit(Some(1_000)); /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?; /// /// let re = PikeVM::new_from_nfa(nfa)?; /// let mut cache = re.create_cache(); /// let mut caps = re.create_captures(); /// let expected = Some(Match::must(0, 3..4)); /// re.captures(&mut cache, "!@#A#@!", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Compile the given regular expression patterns into a single NFA. /// /// When matches are returned, the pattern ID corresponds to the index of /// the pattern in the slice given. 
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many(&[
    ///     r"(?-u)\s",
    ///     r"(?-u)\w",
    /// ])?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many<P: AsRef<str>>(
        &self,
        patterns: &[P],
    ) -> Result<NFA, BuildError> {
        // Exactly one HIR is produced per pattern, so size the vector once.
        let mut hirs = Vec::with_capacity(patterns.len());
        for p in patterns {
            // The first pattern that fails to parse aborts the whole build.
            hirs.push(
                self.parser
                    .build()
                    .parse(p.as_ref())
                    .map_err(BuildError::syntax)?,
            );
            debug!("parsed: {:?}", p.as_ref());
        }
        self.build_many_from_hir(&hirs)
    }

    /// Compile the given high level intermediate representation of a regular
    /// expression into an NFA.
    ///
    /// This is a convenience wrapper around
    /// [`Compiler::build_many_from_hir`] with a single expression.
    ///
    /// # Errors
    ///
    /// If there was a problem building the NFA, then an error is returned. The
    /// only error that can occur is if the compiled regex would exceed the
    /// size limits configured on this builder, or if any part of the NFA would
    /// exceed the integer representations used. (For example, too many states
    /// might plausibly occur on a 16-bit target.)
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///     ClassBytesRange::new(b'0', b'9'),
    ///     ClassBytesRange::new(b'A', b'Z'),
    ///     ClassBytesRange::new(b'_', b'_'),
    ///     ClassBytesRange::new(b'a', b'z'),
    /// ])));
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(0, 3..4));
    /// re.captures(&mut cache, "!@#A#@!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_from_hir(&self, expr: &Hir) -> Result<NFA, BuildError> {
        self.build_many_from_hir(&[expr])
    }

    /// Compile the given high level intermediate representations of regular
    /// expressions into a single NFA.
    ///
    /// When matches are returned, the pattern ID corresponds to the index of
    /// the pattern in the slice given.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match};
    /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange};
    ///
    /// let hirs = &[
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'\t', b'\r'),
    ///         ClassBytesRange::new(b' ', b' '),
    ///     ]))),
    ///     Hir::class(Class::Bytes(ClassBytes::new(vec![
    ///         ClassBytesRange::new(b'0', b'9'),
    ///         ClassBytesRange::new(b'A', b'Z'),
    ///         ClassBytesRange::new(b'_', b'_'),
    ///         ClassBytesRange::new(b'a', b'z'),
    ///     ]))),
    /// ];
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build_many_from_hir(hirs)?;
    ///
    /// let re = PikeVM::new_from_nfa(nfa)?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    /// let expected = Some(Match::must(1, 1..2));
    /// re.captures(&mut cache, "!A! !A!", &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many_from_hir<H: Borrow<Hir>>(
        &self,
        exprs: &[H],
    ) -> Result<NFA, BuildError> {
        self.compile(exprs)
    }

    /// Apply the given NFA configuration options to this builder.
    ///
    /// Options set in `config` take precedence over options that were
    /// previously applied to this builder; options left unset in `config`
    /// keep their previous value.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::nfa::thompson::NFA;
    ///
    /// let config = NFA::config().nfa_size_limit(Some(1_000));
    /// let nfa = NFA::compiler().configure(config).build(r"(?-u)\w")?;
    /// assert_eq!(nfa.pattern_len(), 1);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn configure(&mut self, config: Config) -> &mut Compiler {
        self.config = self.config.overwrite(config);
        self
    }

    /// Set the syntax configuration for this builder using
    /// [`syntax::Config`](crate::util::syntax::Config).
    ///
    /// This permits setting things like case insensitivity, Unicode and multi
    /// line mode.
    ///
    /// This syntax configuration only applies when an NFA is built directly
    /// from a pattern string. If an NFA is built from an HIR, then all syntax
    /// settings are ignored.
///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::NFA, util::syntax};
    ///
    /// let syntax_config = syntax::Config::new().unicode(false);
    /// let nfa = NFA::compiler().syntax(syntax_config).build(r"\w")?;
    /// // If Unicode were enabled, the number of states would be much bigger.
    /// assert!(nfa.states().len() < 15);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn syntax(
        &mut self,
        config: crate::util::syntax::Config,
    ) -> &mut Compiler {
        config.apply(&mut self.parser);
        self
    }
}

impl Compiler {
    /// Compile the sequence of HIR expressions given. Pattern IDs are
    /// allocated starting from 0, in correspondence with the slice given.
    ///
    /// It is legal to provide an empty slice. In that case, the NFA returned
    /// has no patterns and will never match anything.
    ///
    /// # Errors
    ///
    /// An error is returned when more patterns are given than
    /// `PatternID::LIMIT`, when reverse compilation is requested together
    /// with any capture states, or when the underlying builder reports an
    /// error (for example, because the configured size limit was exceeded).
    fn compile<H: Borrow<Hir>>(
        &self,
        exprs: &[H],
    ) -> Result<NFA, BuildError> {
        if exprs.len() > PatternID::LIMIT {
            return Err(BuildError::too_many_patterns(exprs.len()));
        }
        let reverse = self.config.get_reverse();
        if reverse && self.config.get_which_captures().is_any() {
            return Err(BuildError::unsupported_captures());
        }

        // Reset and configure the builder under a single borrow. The borrow
        // must end before any of the `c_*` routines run, since each of them
        // borrows the builder again.
        {
            let mut builder = self.builder.borrow_mut();
            builder.clear();
            builder.set_utf8(self.config.get_utf8());
            builder.set_reverse(reverse);
            builder.set_look_matcher(self.config.get_look_matcher());
            builder.set_size_limit(self.config.get_nfa_size_limit())?;
        }

        // We always add an unanchored prefix unless we were specifically told
        // not to (for tests only), or if we know that the regex is anchored
        // for all matches. When an unanchored prefix is not added, then the
        // NFA's anchored and unanchored start states are equivalent.
        let all_anchored = exprs.iter().all(|e| {
            let props = e.borrow().properties();
            if reverse {
                props.look_set_suffix().contains(hir::Look::End)
            } else {
                props.look_set_prefix().contains(hir::Look::Start)
            }
        });
        let anchored = !self.config.get_unanchored_prefix() || all_anchored;
        let unanchored_prefix = if anchored {
            self.c_empty()?
        } else {
            // Equivalent to a non-greedy `(?s-u:.)*?`.
            self.c_at_least(&Hir::dot(hir::Dot::AnyByte), false, 0)?
        };

        // Each pattern is wrapped in an implicit capture group with index 0
        // and is terminated by its own match state.
        let compiled = self.c_alt_iter(exprs.iter().map(|e| {
            let _ = self.start_pattern()?;
            let one = self.c_cap(0, None, e.borrow())?;
            let match_state_id = self.add_match()?;
            self.patch(one.end, match_state_id)?;
            let _ = self.finish_pattern(one.start)?;
            Ok(ThompsonRef { start: one.start, end: match_state_id })
        }))?;
        self.patch(unanchored_prefix.end, compiled.start)?;
        let nfa = self
            .builder
            .borrow_mut()
            .build(compiled.start, unanchored_prefix.start)?;

        debug!("HIR-to-NFA compilation complete, config: {:?}", self.config);
        Ok(nfa)
    }

    /// Compile an arbitrary HIR expression.
    ///
    /// This dispatches on the kind of the expression to the corresponding
    /// `c_*` routine.
    fn c(&self, expr: &Hir) -> Result<ThompsonRef, BuildError> {
        use regex_syntax::hir::{Class, HirKind::*};

        match *expr.kind() {
            Empty => self.c_empty(),
            Literal(hir::Literal(ref bytes)) => self.c_literal(bytes),
            Class(Class::Bytes(ref c)) => self.c_byte_class(c),
            Class(Class::Unicode(ref c)) => self.c_unicode_class(c),
            Look(ref look) => self.c_look(look),
            Repetition(ref rep) => self.c_repetition(rep),
            Capture(ref c) => self.c_cap(c.index, c.name.as_deref(), &c.sub),
            Concat(ref es) => self.c_concat(es.iter().map(|e| self.c(e))),
            Alternation(ref es) => self.c_alt_slice(es),
        }
    }

    /// Compile a concatenation of the sub-expressions yielded by the given
    /// iterator. If the iterator yields no elements, then this compiles down
    /// to an "empty" state that always matches.
    ///
    /// If the compiler is in reverse mode, then the expressions given are
    /// automatically compiled in reverse.
fn c_concat(&self, mut it: I) -> Result where I: DoubleEndedIterator>, { let first = if self.is_reverse() { it.next_back() } else { it.next() }; let ThompsonRef { start, mut end } = match first { Some(result) => result?, None => return self.c_empty(), }; loop { let next = if self.is_reverse() { it.next_back() } else { it.next() }; let compiled = match next { Some(result) => result?, None => break, }; self.patch(end, compiled.start)?; end = compiled.end; } Ok(ThompsonRef { start, end }) } /// Compile an alternation of the given HIR values. /// /// This is like 'c_alt_iter', but it accepts a slice of HIR values instead /// of an iterator of compiled NFA subgraphs. The point of accepting a /// slice here is that it opens up some optimization opportunities. For /// example, if all of the HIR values are literals, then this routine might /// re-shuffle them to make NFA epsilon closures substantially faster. fn c_alt_slice(&self, exprs: &[Hir]) -> Result { // self.c_alt_iter(exprs.iter().map(|e| self.c(e))) let literal_count = exprs .iter() .filter(|e| { matches!(*e.kind(), hir::HirKind::Literal(hir::Literal(_))) }) .count(); if literal_count <= 1 || literal_count < exprs.len() { return self.c_alt_iter(exprs.iter().map(|e| self.c(e))); } let mut trie = if self.is_reverse() { LiteralTrie::reverse() } else { LiteralTrie::forward() }; for expr in exprs.iter() { let literal = match *expr.kind() { hir::HirKind::Literal(hir::Literal(ref bytes)) => bytes, _ => unreachable!(), }; trie.add(literal)?; } trie.compile(&mut self.builder.borrow_mut()) } /// Compile an alternation, where each element yielded by the given /// iterator represents an item in the alternation. If the iterator yields /// no elements, then this compiles down to a "fail" state. /// /// In an alternation, expressions appearing earlier are "preferred" at /// match time over expressions appearing later. At least, this is true /// when using "leftmost first" match semantics. 
(If "leftmost longest" are /// ever added in the future, then this preference order of priority would /// not apply in that mode.) fn c_alt_iter(&self, mut it: I) -> Result where I: Iterator>, { let first = match it.next() { None => return self.c_fail(), Some(result) => result?, }; let second = match it.next() { None => return Ok(first), Some(result) => result?, }; let union = self.add_union()?; let end = self.add_empty()?; self.patch(union, first.start)?; self.patch(first.end, end)?; self.patch(union, second.start)?; self.patch(second.end, end)?; for result in it { let compiled = result?; self.patch(union, compiled.start)?; self.patch(compiled.end, end)?; } Ok(ThompsonRef { start: union, end }) } /// Compile the given capture sub-expression. `expr` should be the /// sub-expression contained inside the capture. If "capture" states are /// enabled, then they are added as appropriate. /// /// This accepts the pieces of a capture instead of a `hir::Capture` so /// that it's easy to manufacture a "fake" group when necessary, e.g., for /// adding the entire pattern as if it were a group in order to create /// appropriate "capture" states in the NFA. fn c_cap( &self, index: u32, name: Option<&str>, expr: &Hir, ) -> Result { match self.config.get_which_captures() { // No capture states means we always skip them. WhichCaptures::None => return self.c(expr), // Implicit captures states means we only add when index==0 since // index==0 implies the group is implicit. WhichCaptures::Implicit if index > 0 => return self.c(expr), _ => {} } let start = self.add_capture_start(index, name)?; let inner = self.c(expr)?; let end = self.add_capture_end(index)?; self.patch(start, inner.start)?; self.patch(inner.end, end)?; Ok(ThompsonRef { start, end }) } /// Compile the given repetition expression. This handles all types of /// repetitions and greediness. 
fn c_repetition( &self, rep: &hir::Repetition, ) -> Result { match (rep.min, rep.max) { (0, Some(1)) => self.c_zero_or_one(&rep.sub, rep.greedy), (min, None) => self.c_at_least(&rep.sub, rep.greedy, min), (min, Some(max)) if min == max => self.c_exactly(&rep.sub, min), (min, Some(max)) => self.c_bounded(&rep.sub, rep.greedy, min, max), } } /// Compile the given expression such that it matches at least `min` times, /// but no more than `max` times. /// /// When `greedy` is true, then the preference is for the expression to /// match as much as possible. Otherwise, it will match as little as /// possible. fn c_bounded( &self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result { let prefix = self.c_exactly(expr, min)?; if min == max { return Ok(prefix); } // It is tempting here to compile the rest here as a concatenation // of zero-or-one matches. i.e., for `a{2,5}`, compile it as if it // were `aaa?a?a?`. The problem here is that it leads to this program: // // >000000: 61 => 01 // 000001: 61 => 02 // 000002: union(03, 04) // 000003: 61 => 04 // 000004: union(05, 06) // 000005: 61 => 06 // 000006: union(07, 08) // 000007: 61 => 08 // 000008: MATCH // // And effectively, once you hit state 2, the epsilon closure will // include states 3, 5, 6, 7 and 8, which is quite a bit. It is better // to instead compile it like so: // // >000000: 61 => 01 // 000001: 61 => 02 // 000002: union(03, 08) // 000003: 61 => 04 // 000004: union(05, 08) // 000005: 61 => 06 // 000006: union(07, 08) // 000007: 61 => 08 // 000008: MATCH // // So that the epsilon closure of state 2 is now just 3 and 8. 
let empty = self.add_empty()?; let mut prev_end = prefix.end; for _ in min..max { let union = if greedy { self.add_union() } else { self.add_union_reverse() }?; let compiled = self.c(expr)?; self.patch(prev_end, union)?; self.patch(union, compiled.start)?; self.patch(union, empty)?; prev_end = compiled.end; } self.patch(prev_end, empty)?; Ok(ThompsonRef { start: prefix.start, end: empty }) } /// Compile the given expression such that it may be matched `n` or more /// times, where `n` can be any integer. (Although a particularly large /// integer is likely to run afoul of any configured size limits.) /// /// When `greedy` is true, then the preference is for the expression to /// match as much as possible. Otherwise, it will match as little as /// possible. fn c_at_least( &self, expr: &Hir, greedy: bool, n: u32, ) -> Result { if n == 0 { // When the expression cannot match the empty string, then we // can get away with something much simpler: just one 'alt' // instruction that optionally repeats itself. But if the expr // can match the empty string... see below. if expr.properties().minimum_len().map_or(false, |len| len > 0) { let union = if greedy { self.add_union() } else { self.add_union_reverse() }?; let compiled = self.c(expr)?; self.patch(union, compiled.start)?; self.patch(compiled.end, union)?; return Ok(ThompsonRef { start: union, end: union }); } // What's going on here? Shouldn't x* be simpler than this? It // turns out that when implementing leftmost-first (Perl-like) // match semantics, x* results in an incorrect preference order // when computing the transitive closure of states if and only if // 'x' can match the empty string. So instead, we compile x* as // (x+)?, which preserves the correct preference order. 
// // See: https://github.com/rust-lang/regex/issues/779 let compiled = self.c(expr)?; let plus = if greedy { self.add_union() } else { self.add_union_reverse() }?; self.patch(compiled.end, plus)?; self.patch(plus, compiled.start)?; let question = if greedy { self.add_union() } else { self.add_union_reverse() }?; let empty = self.add_empty()?; self.patch(question, compiled.start)?; self.patch(question, empty)?; self.patch(plus, empty)?; Ok(ThompsonRef { start: question, end: empty }) } else if n == 1 { let compiled = self.c(expr)?; let union = if greedy { self.add_union() } else { self.add_union_reverse() }?; self.patch(compiled.end, union)?; self.patch(union, compiled.start)?; Ok(ThompsonRef { start: compiled.start, end: union }) } else { let prefix = self.c_exactly(expr, n - 1)?; let last = self.c(expr)?; let union = if greedy { self.add_union() } else { self.add_union_reverse() }?; self.patch(prefix.end, last.start)?; self.patch(last.end, union)?; self.patch(union, last.start)?; Ok(ThompsonRef { start: prefix.start, end: union }) } } /// Compile the given expression such that it may be matched zero or one /// times. /// /// When `greedy` is true, then the preference is for the expression to /// match as much as possible. Otherwise, it will match as little as /// possible. fn c_zero_or_one( &self, expr: &Hir, greedy: bool, ) -> Result { let union = if greedy { self.add_union() } else { self.add_union_reverse() }?; let compiled = self.c(expr)?; let empty = self.add_empty()?; self.patch(union, compiled.start)?; self.patch(union, empty)?; self.patch(compiled.end, empty)?; Ok(ThompsonRef { start: union, end: empty }) } /// Compile the given HIR expression exactly `n` times. fn c_exactly( &self, expr: &Hir, n: u32, ) -> Result { let it = (0..n).map(|_| self.c(expr)); self.c_concat(it) } /// Compile the given byte oriented character class. /// /// This uses "sparse" states to represent an alternation between ranges in /// this character class. 
We can use "sparse" states instead of stitching /// together a "union" state because all ranges in a character class have /// equal priority *and* are non-overlapping (thus, only one can match, so /// there's never a question of priority in the first place). This saves a /// fair bit of overhead when traversing an NFA. /// /// This routine compiles an empty character class into a "fail" state. fn c_byte_class( &self, cls: &hir::ClassBytes, ) -> Result { let end = self.add_empty()?; let mut trans = Vec::with_capacity(cls.ranges().len()); for r in cls.iter() { trans.push(Transition { start: r.start(), end: r.end(), next: end, }); } Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) } /// Compile the given Unicode character class. /// /// This routine specifically tries to use various types of compression, /// since UTF-8 automata of large classes can get quite large. The specific /// type of compression used depends on forward vs reverse compilation, and /// whether NFA shrinking is enabled or not. /// /// Aside from repetitions causing lots of repeat group, this is like the /// single most expensive part of regex compilation. Therefore, a large part /// of the expense of compilation may be reduce by disabling Unicode in the /// pattern. /// /// This routine compiles an empty character class into a "fail" state. fn c_unicode_class( &self, cls: &hir::ClassUnicode, ) -> Result { // If all we have are ASCII ranges wrapped in a Unicode package, then // there is zero reason to bring out the big guns. We can fit all ASCII // ranges within a single sparse state. if cls.is_ascii() { let end = self.add_empty()?; let mut trans = Vec::with_capacity(cls.ranges().len()); for r in cls.iter() { // The unwraps below are OK because we've verified that this // class only contains ASCII codepoints. trans.push(Transition { // FIXME(1.59): use the 'TryFrom for u8' impl. 
start: u8::try_from(u32::from(r.start())).unwrap(), end: u8::try_from(u32::from(r.end())).unwrap(), next: end, }); } Ok(ThompsonRef { start: self.add_sparse(trans)?, end }) } else if self.is_reverse() { if !self.config.get_shrink() { // When we don't want to spend the extra time shrinking, we // compile the UTF-8 automaton in reverse using something like // the "naive" approach, but will attempt to re-use common // suffixes. self.c_unicode_class_reverse_with_suffix(cls) } else { // When we want to shrink our NFA for reverse UTF-8 automata, // we cannot feed UTF-8 sequences directly to the UTF-8 // compiler, since the UTF-8 compiler requires all sequences // to be lexicographically sorted. Instead, we organize our // sequences into a range trie, which can then output our // sequences in the correct order. Unfortunately, building the // range trie is fairly expensive (but not nearly as expensive // as building a DFA). Hence the reason why the 'shrink' option // exists, so that this path can be toggled off. For example, // we might want to turn this off if we know we won't be // compiling a DFA. let mut trie = self.trie_state.borrow_mut(); trie.clear(); for rng in cls.iter() { for mut seq in Utf8Sequences::new(rng.start(), rng.end()) { seq.reverse(); trie.insert(seq.as_slice()); } } let mut builder = self.builder.borrow_mut(); let mut utf8_state = self.utf8_state.borrow_mut(); let mut utf8c = Utf8Compiler::new(&mut *builder, &mut *utf8_state)?; trie.iter(|seq| { utf8c.add(&seq)?; Ok(()) })?; utf8c.finish() } } else { // In the forward direction, we always shrink our UTF-8 automata // because we can stream it right into the UTF-8 compiler. There // is almost no downside (in either memory or time) to using this // approach. 
let mut builder = self.builder.borrow_mut(); let mut utf8_state = self.utf8_state.borrow_mut(); let mut utf8c = Utf8Compiler::new(&mut *builder, &mut *utf8_state)?; for rng in cls.iter() { for seq in Utf8Sequences::new(rng.start(), rng.end()) { utf8c.add(seq.as_slice())?; } } utf8c.finish() } // For reference, the code below is the "naive" version of compiling a // UTF-8 automaton. It is deliciously simple (and works for both the // forward and reverse cases), but will unfortunately produce very // large NFAs. When compiling a forward automaton, the size difference // can sometimes be an order of magnitude. For example, the '\w' regex // will generate about ~3000 NFA states using the naive approach below, // but only 283 states when using the approach above. This is because // the approach above actually compiles a *minimal* (or near minimal, // because of the bounded hashmap for reusing equivalent states) UTF-8 // automaton. // // The code below is kept as a reference point in order to make it // easier to understand the higher level goal here. Although, it will // almost certainly bit-rot, so keep that in mind. Also, if you try to // use it, some of the tests in this module will fail because they look // for terser byte code produce by the more optimized handling above. // But the integration test suite should still pass. // // One good example of the substantial difference this can make is to // compare and contrast performance of the Pike VM when the code below // is active vs the code above. Here's an example to try: // // regex-cli find match pikevm -b -p '(?m)^\w{20}' non-ascii-file // // With Unicode classes generated below, this search takes about 45s on // my machine. But with the compressed version above, the search takes // only around 1.4s. The NFA is also 20% smaller. This is in part due // to the compression, but also because of the utilization of 'sparse' // NFA states. They lead to much less state shuffling during the NFA // search. 
/* let it = cls .iter() .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) .map(|seq| { let it = seq .as_slice() .iter() .map(|rng| self.c_range(rng.start, rng.end)); self.c_concat(it) }); self.c_alt_iter(it) */ } /// Compile the given Unicode character class in reverse with suffix /// caching. /// /// This is a "quick" way to compile large Unicode classes into reverse /// UTF-8 automata while doing a small amount of compression on that /// automata by reusing common suffixes. /// /// A more comprehensive compression scheme can be accomplished by using /// a range trie to efficiently sort a reverse sequence of UTF-8 byte /// rqanges, and then use Daciuk's algorithm via `Utf8Compiler`. /// /// This is the technique used when "NFA shrinking" is disabled. /// /// (This also tries to use "sparse" states where possible, just like /// `c_byte_class` does.) fn c_unicode_class_reverse_with_suffix( &self, cls: &hir::ClassUnicode, ) -> Result { // N.B. It would likely be better to cache common *prefixes* in the // reverse direction, but it's not quite clear how to do that. The // advantage of caching suffixes is that it does give us a win, and // has a very small additional overhead. let mut cache = self.utf8_suffix.borrow_mut(); cache.clear(); let union = self.add_union()?; let alt_end = self.add_empty()?; for urng in cls.iter() { for seq in Utf8Sequences::new(urng.start(), urng.end()) { let mut end = alt_end; for brng in seq.as_slice() { let key = Utf8SuffixKey { from: end, start: brng.start, end: brng.end, }; let hash = cache.hash(&key); if let Some(id) = cache.get(&key, hash) { end = id; continue; } let compiled = self.c_range(brng.start, brng.end)?; self.patch(compiled.end, end)?; end = compiled.start; cache.set(key, hash, end); } self.patch(union, end)?; } } Ok(ThompsonRef { start: union, end: alt_end }) } /// Compile the given HIR look-around assertion to an NFA look-around /// assertion. 
fn c_look(&self, anchor: &hir::Look) -> Result<ThompsonRef, BuildError> {
    // One-to-one mapping from the HIR assertion kinds to the NFA's own
    // `Look` kinds. (Reversal, when applicable, is applied by `add_look`.)
    let look = match *anchor {
        hir::Look::Start => Look::Start,
        hir::Look::End => Look::End,
        hir::Look::StartLF => Look::StartLF,
        hir::Look::EndLF => Look::EndLF,
        hir::Look::StartCRLF => Look::StartCRLF,
        hir::Look::EndCRLF => Look::EndCRLF,
        hir::Look::WordAscii => Look::WordAscii,
        hir::Look::WordAsciiNegate => Look::WordAsciiNegate,
        hir::Look::WordUnicode => Look::WordUnicode,
        hir::Look::WordUnicodeNegate => Look::WordUnicodeNegate,
        hir::Look::WordStartAscii => Look::WordStartAscii,
        hir::Look::WordEndAscii => Look::WordEndAscii,
        hir::Look::WordStartUnicode => Look::WordStartUnicode,
        hir::Look::WordEndUnicode => Look::WordEndUnicode,
        hir::Look::WordStartHalfAscii => Look::WordStartHalfAscii,
        hir::Look::WordEndHalfAscii => Look::WordEndHalfAscii,
        hir::Look::WordStartHalfUnicode => Look::WordStartHalfUnicode,
        hir::Look::WordEndHalfUnicode => Look::WordEndHalfUnicode,
    };
    let id = self.add_look(look)?;
    Ok(ThompsonRef { start: id, end: id })
}

/// Compile the given byte string to a concatenation of bytes.
///
/// Each byte becomes a single-byte range state (via `c_range(b, b)`), and
/// the resulting states are joined with `c_concat`.
fn c_literal(&self, bytes: &[u8]) -> Result<ThompsonRef, BuildError> {
    self.c_concat(bytes.iter().copied().map(|b| self.c_range(b, b)))
}

/// Compile a "range" state with one transition that may only be followed
/// if the input byte is in the (inclusive) range given.
///
/// Both the `start` and `end` locations point to the state created.
/// Callers will likely want to keep the `start`, but patch the `end` to
/// point to some other state.
fn c_range(&self, start: u8, end: u8) -> Result<ThompsonRef, BuildError> {
    let id = self.add_range(start, end)?;
    Ok(ThompsonRef { start: id, end: id })
}

/// Compile an "empty" state with one unconditional epsilon transition.
///
/// Both the `start` and `end` locations point to the state created.
/// Callers will likely want to keep the `start`, but patch the `end` to
/// point to some other state.
fn c_empty(&self) -> Result<ThompsonRef, BuildError> {
    let id = self.add_empty()?;
    Ok(ThompsonRef { start: id, end: id })
}

/// Compile a "fail" state that can never have any outgoing transitions.
fn c_fail(&self) -> Result<ThompsonRef, BuildError> {
    let id = self.add_fail()?;
    Ok(ThompsonRef { start: id, end: id })
}

// The below helpers are meant to be simple wrappers around the
// corresponding Builder methods. For the most part, they let us write
// 'self.add_foo()' instead of 'self.builder.borrow_mut().add_foo()', where
// the latter is a mouthful. Some of the methods do inject a little bit
// of extra logic. e.g., Flipping look-around operators when compiling in
// reverse mode.

fn patch(&self, from: StateID, to: StateID) -> Result<(), BuildError> {
    self.builder.borrow_mut().patch(from, to)
}

fn start_pattern(&self) -> Result<PatternID, BuildError> {
    self.builder.borrow_mut().start_pattern()
}

fn finish_pattern(
    &self,
    start_id: StateID,
) -> Result<PatternID, BuildError> {
    self.builder.borrow_mut().finish_pattern(start_id)
}

fn add_empty(&self) -> Result<StateID, BuildError> {
    self.builder.borrow_mut().add_empty()
}

/// Adds a byte range state. Its target is a `StateID::ZERO` placeholder
/// that callers are expected to `patch` afterwards.
fn add_range(&self, start: u8, end: u8) -> Result<StateID, BuildError> {
    self.builder.borrow_mut().add_range(Transition {
        start,
        end,
        next: StateID::ZERO,
    })
}

fn add_sparse(
    &self,
    ranges: Vec<Transition>,
) -> Result<StateID, BuildError> {
    self.builder.borrow_mut().add_sparse(ranges)
}

/// Adds a look-around state, first flipping the assertion when this
/// compiler is building a reverse NFA.
fn add_look(&self, mut look: Look) -> Result<StateID, BuildError> {
    if self.is_reverse() {
        look = look.reversed();
    }
    self.builder.borrow_mut().add_look(StateID::ZERO, look)
}

/// Adds a union state with no alternates yet.
fn add_union(&self) -> Result<StateID, BuildError> {
    self.builder.borrow_mut().add_union(vec![])
}

/// Adds a "reverse" union state with no alternates yet.
fn add_union_reverse(&self) -> Result<StateID, BuildError> {
    self.builder.borrow_mut().add_union_reverse(vec![])
}

/// Adds a capture-start state for the given group, converting the
/// optional borrowed group name into a shared `Arc` for the builder.
fn add_capture_start(
    &self,
    capture_index: u32,
    name: Option<&str>,
) -> Result<StateID, BuildError> {
    let name = name.map(Arc::from);
    self.builder.borrow_mut().add_capture_start(
        StateID::ZERO,
        capture_index,
        name,
    )
}

fn add_capture_end(
    &self,
    capture_index: u32,
) -> Result<StateID, BuildError> {
    self.builder.borrow_mut().add_capture_end(StateID::ZERO, capture_index)
}

fn add_fail(&self) -> Result<StateID, BuildError> {
    self.builder.borrow_mut().add_fail()
}

fn add_match(&self) -> Result<StateID, BuildError> {
    self.builder.borrow_mut().add_match()
}

/// Returns the `reverse` setting of this compiler's configuration.
fn is_reverse(&self) -> bool {
    self.config.get_reverse()
}
}

/// A value that represents the result of compiling a sub-expression of a
/// regex's HIR. Specifically, this represents a sub-graph of the NFA that
/// has an initial state at `start` and a final state at `end`.
#[derive(Clone, Copy, Debug)]
pub(crate) struct ThompsonRef {
    pub(crate) start: StateID,
    pub(crate) end: StateID,
}

/// A UTF-8 compiler based on Daciuk's algorithm for compiling minimal DFAs
/// from a lexicographically sorted sequence of strings in linear time.
///
/// The trick here is that any Unicode codepoint range can be converted to
/// a sequence of byte ranges that form a UTF-8 automaton. Connecting them
/// together via an alternation is trivial, and indeed, it works. However,
/// there is a lot of redundant structure in many UTF-8 automatons. Since our
/// UTF-8 ranges are in lexicographic order, we can use Daciuk's algorithm
/// to build nearly minimal DFAs in linear time. (They are not guaranteed to
/// be minimal because we use a bounded cache of previously built DFA states.)
///
/// The drawback is that this sadly doesn't work for reverse automata, since
/// the ranges are no longer in lexicographic order. For that, we invented the
/// range trie (which gets its own module). Once a range trie is built, we then
/// use this same Utf8Compiler to build a reverse UTF-8 automaton.
///
/// The high level idea is described here:
/// https://blog.burntsushi.net/transducers/#finite-state-machines-as-data-structures
///
/// There is also another implementation of this in the `fst` crate.
#[derive(Debug)]
struct Utf8Compiler<'a> {
    /// The NFA builder that compiled states are added to.
    builder: &'a mut Builder,
    /// Reusable scratch space: the cache of compiled states and the stack
    /// of nodes that have not been compiled yet.
    state: &'a mut Utf8State,
    /// The state that the final byte range of every sequence leads to. It
    /// is reported as the `end` of the sub-graph returned by `finish`.
    target: StateID,
}

/// Scratch state used by `Utf8Compiler`; cleared each time a new compiler is
/// created from it.
#[derive(Clone, Debug)]
struct Utf8State {
    /// Maps a node's transitions to the ID of the NFA state that was already
    /// built for an identical set of transitions.
    compiled: Utf8BoundedMap,
    /// Stack of nodes that have not been turned into NFA states yet.
    uncompiled: Vec<Utf8Node>,
}

/// A node that has not been compiled yet.
#[derive(Clone, Debug)]
struct Utf8Node {
    /// Transitions whose target state is already known.
    trans: Vec<Transition>,
    /// The most recently added byte range, whose target is not known yet.
    /// It is moved into `trans` by `set_last_transition`.
    last: Option<Utf8LastTransition>,
}

/// An inclusive byte range that still lacks a target state.
#[derive(Clone, Debug)]
struct Utf8LastTransition {
    start: u8,
    end: u8,
}

impl Utf8State {
    /// Creates empty scratch state. The compiled-state map is constructed
    /// with `10_000` (presumably its capacity -- see `Utf8BoundedMap::new`).
    fn new() -> Utf8State {
        Utf8State { compiled: Utf8BoundedMap::new(10_000), uncompiled: vec![] }
    }

    /// Resets both the compiled-state map and the uncompiled stack.
    fn clear(&mut self) {
        self.compiled.clear();
        self.uncompiled.clear();
    }
}

impl<'a> Utf8Compiler<'a> {
    /// Creates a compiler that adds states to `builder`.
    ///
    /// This adds one empty state to the builder (the shared `target`),
    /// clears `state` and pushes the initial, empty root node.
    fn new(
        builder: &'a mut Builder,
        state: &'a mut Utf8State,
    ) -> Result<Utf8Compiler<'a>, BuildError> {
        let target = builder.add_empty()?;
        state.clear();
        let mut utf8c = Utf8Compiler { builder, state, target };
        utf8c.add_empty();
        Ok(utf8c)
    }

    /// Compiles every node that is still uncompiled and returns the
    /// resulting sub-graph, which starts at the compiled root node and ends
    /// at `target`.
    fn finish(&mut self) -> Result<ThompsonRef, BuildError> {
        self.compile_from(0)?;
        let node = self.pop_root();
        let start = self.compile(node)?;
        Ok(ThompsonRef { start, end: self.target })
    }

    /// Adds one sequence of byte ranges.
    ///
    /// # Panics
    ///
    /// Panics if every range in `ranges` matches the pending transition of
    /// the corresponding uncompiled node, i.e. if nothing new would be added.
    fn add(&mut self, ranges: &[Utf8Range]) -> Result<(), BuildError> {
        // Length of the prefix shared with the uncompiled nodes' pending
        // transitions.
        let prefix_len = ranges
            .iter()
            .zip(&self.state.uncompiled)
            .take_while(|&(range, node)| {
                node.last.as_ref().map_or(false, |t| {
                    (t.start, t.end) == (range.start, range.end)
                })
            })
            .count();
        assert!(prefix_len < ranges.len());
        // Everything below the shared prefix can no longer change, so
        // compile it before pushing the new suffix.
        self.compile_from(prefix_len)?;
        self.add_suffix(&ranges[prefix_len..]);
        Ok(())
    }

    /// Pops and compiles uncompiled nodes until only `from + 1` of them
    /// remain, then points the pending transition of the new top node at the
    /// state compiled last (or at `target` if nothing was popped).
    fn compile_from(&mut self, from: usize) -> Result<(), BuildError> {
        let mut next = self.target;
        while from + 1 < self.state.uncompiled.len() {
            let node = self.pop_freeze(next);
            next = self.compile(node)?;
        }
        self.top_last_freeze(next);
        Ok(())
    }

    /// Returns the ID of a state with exactly the given transitions, reusing
    /// a cached state when one is found and otherwise adding a new sparse
    /// state to the builder and caching it.
    fn compile(
        &mut self,
        node: Vec<Transition>,
    ) -> Result<StateID, BuildError> {
        let hash = self.state.compiled.hash(&node);
        if let Some(id) = self.state.compiled.get(&node, hash) {
            return Ok(id);
        }
        let id = self.builder.add_sparse(node.clone())?;
        self.state.compiled.set(node, hash, id);
        Ok(id)
    }

    /// Records `ranges[0]` as the pending transition of the current top
    /// node and pushes one new node per remaining range.
    ///
    /// # Panics
    ///
    /// Panics if `ranges` is empty, if there are no uncompiled nodes, or if
    /// the top node already has a pending transition.
    fn add_suffix(&mut self, ranges: &[Utf8Range]) {
        assert!(!ranges.is_empty());
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        assert!(self.state.uncompiled[last].last.is_none());
        self.state.uncompiled[last].last = Some(Utf8LastTransition {
            start: ranges[0].start,
            end: ranges[0].end,
        });
        for r in &ranges[1..] {
            self.state.uncompiled.push(Utf8Node {
                trans: vec![],
                last: Some(Utf8LastTransition { start: r.start, end: r.end }),
            });
        }
    }

    /// Pushes a node with no transitions at all.
    fn add_empty(&mut self) {
        self.state.uncompiled.push(Utf8Node { trans: vec![], last: None });
    }

    /// Pops the top node, resolves its pending transition to `next` and
    /// returns its transitions. Panics if there are no uncompiled nodes.
    fn pop_freeze(&mut self, next: StateID) -> Vec<Transition> {
        let mut uncompiled = self.state.uncompiled.pop().unwrap();
        uncompiled.set_last_transition(next);
        uncompiled.trans
    }

    /// Pops the only remaining node and returns its transitions.
    ///
    /// # Panics
    ///
    /// Panics unless exactly one node remains and it has no pending
    /// transition.
    fn pop_root(&mut self) -> Vec<Transition> {
        assert_eq!(self.state.uncompiled.len(), 1);
        assert!(self.state.uncompiled[0].last.is_none());
        self.state.uncompiled.pop().expect("non-empty nodes").trans
    }

    /// Resolves the pending transition (if any) of the top node to `next`.
    /// Panics if there are no uncompiled nodes.
    fn top_last_freeze(&mut self, next: StateID) {
        let last = self
            .state
            .uncompiled
            .len()
            .checked_sub(1)
            .expect("non-empty nodes");
        self.state.uncompiled[last].set_last_transition(next);
    }
}

impl Utf8Node {
    /// If a transition is pending, moves it into `trans` with `next` as its
    /// target; otherwise does nothing.
    fn set_last_transition(&mut self, next: StateID) {
        if let Some(last) = self.last.take() {
            self.trans.push(Transition {
                start: last.start,
                end: last.end,
                next,
            });
        }
    }
}

#[cfg(test)]
mod tests {
    use alloc::vec;

    use crate::{
        nfa::thompson::{SparseTransitions, State},
        util::primitives::SmallIndex,
    };

    use super::*;

    fn build(pattern: &str) -> NFA {
        NFA::compiler()
            .configure(
                NFA::config()
                    .which_captures(WhichCaptures::None)
                    .unanchored_prefix(false),
            )
            .build(pattern)
            .unwrap()
    }

    fn pid(id: usize) -> PatternID {
        PatternID::new(id).unwrap()
    }

    fn sid(id: usize) -> StateID {
        StateID::new(id).unwrap()
    }

    fn s_byte(byte: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start: byte, end: byte, next };
        State::ByteRange { trans }
    }

    fn s_range(start: u8, end: u8, next: usize) -> State {
        let next = sid(next);
        let trans = Transition { start, end, next };
        State::ByteRange { trans }
    }

    fn s_sparse(transitions: &[(u8, u8, usize)]) -> State {
        let transitions = transitions
            .iter()
            .map(|&(start, end,
next)| Transition { start, end, next: sid(next), }) .collect(); State::Sparse(SparseTransitions { transitions }) } fn s_look(look: Look, next: usize) -> State { let next = sid(next); State::Look { look, next } } fn s_bin_union(alt1: usize, alt2: usize) -> State { State::BinaryUnion { alt1: sid(alt1), alt2: sid(alt2) } } fn s_union(alts: &[usize]) -> State { State::Union { alternates: alts .iter() .map(|&id| sid(id)) .collect::>() .into_boxed_slice(), } } fn s_cap(next: usize, pattern: usize, index: usize, slot: usize) -> State { State::Capture { next: sid(next), pattern_id: pid(pattern), group_index: SmallIndex::new(index).unwrap(), slot: SmallIndex::new(slot).unwrap(), } } fn s_fail() -> State { State::Fail } fn s_match(id: usize) -> State { State::Match { pattern_id: pid(id) } } // Test that building an unanchored NFA has an appropriate `(?s:.)*?` // prefix. #[test] fn compile_unanchored_prefix() { let nfa = NFA::compiler() .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"a") .unwrap(); assert_eq!( nfa.states(), &[ s_bin_union(2, 1), s_range(0, 255, 0), s_byte(b'a', 3), s_match(0), ] ); } #[test] fn compile_no_unanchored_prefix_with_start_anchor() { let nfa = NFA::compiler() .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"^a") .unwrap(); assert_eq!( nfa.states(), &[s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0)] ); } #[test] fn compile_yes_unanchored_prefix_with_end_anchor() { let nfa = NFA::compiler() .configure(NFA::config().which_captures(WhichCaptures::None)) .build(r"a$") .unwrap(); assert_eq!( nfa.states(), &[ s_bin_union(2, 1), s_range(0, 255, 0), s_byte(b'a', 3), s_look(Look::End, 4), s_match(0), ] ); } #[test] fn compile_yes_reverse_unanchored_prefix_with_start_anchor() { let nfa = NFA::compiler() .configure( NFA::config() .reverse(true) .which_captures(WhichCaptures::None), ) .build(r"^a") .unwrap(); assert_eq!( nfa.states(), &[ s_bin_union(2, 1), s_range(0, 255, 0), s_byte(b'a', 3), // Anchors get 
flipped in a reverse automaton. s_look(Look::End, 4), s_match(0), ], ); } #[test] fn compile_no_reverse_unanchored_prefix_with_end_anchor() { let nfa = NFA::compiler() .configure( NFA::config() .reverse(true) .which_captures(WhichCaptures::None), ) .build(r"a$") .unwrap(); assert_eq!( nfa.states(), &[ // Anchors get flipped in a reverse automaton. s_look(Look::Start, 1), s_byte(b'a', 2), s_match(0), ], ); } #[test] fn compile_empty() { assert_eq!(build("").states(), &[s_match(0),]); } #[test] fn compile_literal() { assert_eq!(build("a").states(), &[s_byte(b'a', 1), s_match(0),]); assert_eq!( build("ab").states(), &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0),] ); assert_eq!( build("☃").states(), &[s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(0)] ); // Check that non-UTF-8 literals work. let nfa = NFA::compiler() .configure( NFA::config() .which_captures(WhichCaptures::None) .unanchored_prefix(false), ) .syntax(crate::util::syntax::Config::new().utf8(false)) .build(r"(?-u)\xFF") .unwrap(); assert_eq!(nfa.states(), &[s_byte(b'\xFF', 1), s_match(0),]); } #[test] fn compile_class_ascii() { assert_eq!( build(r"[a-z]").states(), &[s_range(b'a', b'z', 1), s_match(0),] ); assert_eq!( build(r"[x-za-c]").states(), &[s_sparse(&[(b'a', b'c', 1), (b'x', b'z', 1)]), s_match(0)] ); } #[test] #[cfg(not(miri))] fn compile_class_unicode() { assert_eq!( build(r"[\u03B1-\u03B4]").states(), &[s_range(0xB1, 0xB4, 2), s_byte(0xCE, 0), s_match(0)] ); assert_eq!( build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states(), &[ s_range(0xB1, 0xB4, 5), s_range(0x99, 0x9E, 5), s_byte(0xA4, 1), s_byte(0x9F, 2), s_sparse(&[(0xCE, 0xCE, 0), (0xF0, 0xF0, 3)]), s_match(0), ] ); assert_eq!( build(r"[a-z☃]").states(), &[ s_byte(0x83, 3), s_byte(0x98, 0), s_sparse(&[(b'a', b'z', 3), (0xE2, 0xE2, 1)]), s_match(0), ] ); } #[test] fn compile_repetition() { assert_eq!( build(r"a?").states(), &[s_bin_union(1, 2), s_byte(b'a', 2), s_match(0),] ); assert_eq!( build(r"a??").states(), &[s_bin_union(2, 
1), s_byte(b'a', 2), s_match(0),] ); } #[test] fn compile_group() { assert_eq!( build(r"ab+").states(), &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(1, 3), s_match(0)] ); assert_eq!( build(r"(ab)").states(), &[s_byte(b'a', 1), s_byte(b'b', 2), s_match(0)] ); assert_eq!( build(r"(ab)+").states(), &[s_byte(b'a', 1), s_byte(b'b', 2), s_bin_union(0, 3), s_match(0)] ); } #[test] fn compile_alternation() { assert_eq!( build(r"a|b").states(), &[s_range(b'a', b'b', 1), s_match(0)] ); assert_eq!( build(r"ab|cd").states(), &[ s_byte(b'b', 3), s_byte(b'd', 3), s_sparse(&[(b'a', b'a', 0), (b'c', b'c', 1)]), s_match(0) ], ); assert_eq!( build(r"|b").states(), &[s_byte(b'b', 2), s_bin_union(2, 0), s_match(0)] ); assert_eq!( build(r"a|").states(), &[s_byte(b'a', 2), s_bin_union(0, 2), s_match(0)] ); } // This tests the use of a non-binary union, i.e., a state with more than // 2 unconditional epsilon transitions. The only place they tend to appear // is in reverse NFAs when shrinking is disabled. Otherwise, 'binary-union' // and 'sparse' tend to cover all other cases of alternation. 
#[test] fn compile_non_binary_union() { let nfa = NFA::compiler() .configure( NFA::config() .which_captures(WhichCaptures::None) .reverse(true) .shrink(false) .unanchored_prefix(false), ) .build(r"[\u1000\u2000\u3000]") .unwrap(); assert_eq!( nfa.states(), &[ s_union(&[3, 6, 9]), s_byte(0xE1, 10), s_byte(0x80, 1), s_byte(0x80, 2), s_byte(0xE2, 10), s_byte(0x80, 4), s_byte(0x80, 5), s_byte(0xE3, 10), s_byte(0x80, 7), s_byte(0x80, 8), s_match(0), ] ); } #[test] fn compile_many_start_pattern() { let nfa = NFA::compiler() .configure( NFA::config() .which_captures(WhichCaptures::None) .unanchored_prefix(false), ) .build_many(&["a", "b"]) .unwrap(); assert_eq!( nfa.states(), &[ s_byte(b'a', 1), s_match(0), s_byte(b'b', 3), s_match(1), s_bin_union(0, 2), ] ); assert_eq!(nfa.start_anchored().as_usize(), 4); assert_eq!(nfa.start_unanchored().as_usize(), 4); // Test that the start states for each individual pattern are correct. assert_eq!(nfa.start_pattern(pid(0)).unwrap(), sid(0)); assert_eq!(nfa.start_pattern(pid(1)).unwrap(), sid(2)); } // This tests that our compiler can handle an empty character class. At the // time of writing, the regex parser forbids it, so the only way to test it // is to provide a hand written HIR. #[test] fn empty_class_bytes() { use regex_syntax::hir::{Class, ClassBytes, Hir}; let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![]))); let config = NFA::config() .which_captures(WhichCaptures::None) .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); } // Like empty_class_bytes, but for a Unicode class. 
#[test] fn empty_class_unicode() { use regex_syntax::hir::{Class, ClassUnicode, Hir}; let hir = Hir::class(Class::Unicode(ClassUnicode::new(vec![]))); let config = NFA::config() .which_captures(WhichCaptures::None) .unanchored_prefix(false); let nfa = NFA::compiler().configure(config).build_from_hir(&hir).unwrap(); assert_eq!(nfa.states(), &[s_fail(), s_match(0)]); } #[test] fn compile_captures_all() { let nfa = NFA::compiler() .configure( NFA::config() .unanchored_prefix(false) .which_captures(WhichCaptures::All), ) .build("a(b)c") .unwrap(); assert_eq!( nfa.states(), &[ s_cap(1, 0, 0, 0), s_byte(b'a', 2), s_cap(3, 0, 1, 2), s_byte(b'b', 4), s_cap(5, 0, 1, 3), s_byte(b'c', 6), s_cap(7, 0, 0, 1), s_match(0) ] ); let ginfo = nfa.group_info(); assert_eq!(2, ginfo.all_group_len()); } #[test] fn compile_captures_implicit() { let nfa = NFA::compiler() .configure( NFA::config() .unanchored_prefix(false) .which_captures(WhichCaptures::Implicit), ) .build("a(b)c") .unwrap(); assert_eq!( nfa.states(), &[ s_cap(1, 0, 0, 0), s_byte(b'a', 2), s_byte(b'b', 3), s_byte(b'c', 4), s_cap(5, 0, 0, 1), s_match(0) ] ); let ginfo = nfa.group_info(); assert_eq!(1, ginfo.all_group_len()); } #[test] fn compile_captures_none() { let nfa = NFA::compiler() .configure( NFA::config() .unanchored_prefix(false) .which_captures(WhichCaptures::None), ) .build("a(b)c") .unwrap(); assert_eq!( nfa.states(), &[s_byte(b'a', 1), s_byte(b'b', 2), s_byte(b'c', 3), s_match(0)] ); let ginfo = nfa.group_info(); assert_eq!(0, ginfo.all_group_len()); } } regex-automata-0.4.9/src/nfa/thompson/error.rs000064400000000000000000000155271046102023000174740ustar 00000000000000use crate::util::{ captures, look, primitives::{PatternID, StateID}, }; /// An error that can occurred during the construction of a thompson NFA. /// /// This error does not provide many introspection capabilities. 
There are /// generally only two things you can do with it: /// /// * Obtain a human readable message via its `std::fmt::Display` impl. /// * Access an underlying [`regex_syntax::Error`] type from its `source` /// method via the `std::error::Error` trait. This error only occurs when using /// convenience routines for building an NFA directly from a pattern string. /// /// Otherwise, errors typically occur when a limit has been breeched. For /// example, if the total heap usage of the compiled NFA exceeds the limit /// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then /// building the NFA will fail. #[derive(Clone, Debug)] pub struct BuildError { kind: BuildErrorKind, } /// The kind of error that occurred during the construction of a thompson NFA. #[derive(Clone, Debug)] enum BuildErrorKind { /// An error that occurred while parsing a regular expression. Note that /// this error may be printed over multiple lines, and is generally /// intended to be end user readable on its own. #[cfg(feature = "syntax")] Syntax(regex_syntax::Error), /// An error that occurs if the capturing groups provided to an NFA builder /// do not satisfy the documented invariants. For example, things like /// too many groups, missing groups, having the first (zeroth) group be /// named or duplicate group names within the same pattern. Captures(captures::GroupInfoError), /// An error that occurs when an NFA contains a Unicode word boundary, but /// where the crate was compiled without the necessary data for dealing /// with Unicode word boundaries. Word(look::UnicodeWordBoundaryError), /// An error that occurs if too many patterns were given to the NFA /// compiler. TooManyPatterns { /// The number of patterns given, which exceeds the limit. given: usize, /// The limit on the number of patterns. limit: usize, }, /// An error that occurs if too states are produced while building an NFA. 
TooManyStates { /// The minimum number of states that are desired, which exceeds the /// limit. given: usize, /// The limit on the number of states. limit: usize, }, /// An error that occurs when NFA compilation exceeds a configured heap /// limit. ExceededSizeLimit { /// The configured limit, in bytes. limit: usize, }, /// An error that occurs when an invalid capture group index is added to /// the NFA. An "invalid" index can be one that would otherwise overflow /// a `usize` on the current target. InvalidCaptureIndex { /// The invalid index that was given. index: u32, }, /// An error that occurs when one tries to build a reverse NFA with /// captures enabled. Currently, this isn't supported, but we probably /// should support it at some point. #[cfg(feature = "syntax")] UnsupportedCaptures, } impl BuildError { /// If this error occurred because the NFA exceeded the configured size /// limit before being built, then this returns the configured size limit. /// /// The limit returned is what was configured, and corresponds to the /// maximum amount of heap usage in bytes. 
pub fn size_limit(&self) -> Option { match self.kind { BuildErrorKind::ExceededSizeLimit { limit } => Some(limit), _ => None, } } fn kind(&self) -> &BuildErrorKind { &self.kind } #[cfg(feature = "syntax")] pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError { BuildError { kind: BuildErrorKind::Syntax(err) } } pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError { BuildError { kind: BuildErrorKind::Captures(err) } } pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError { BuildError { kind: BuildErrorKind::Word(err) } } pub(crate) fn too_many_patterns(given: usize) -> BuildError { let limit = PatternID::LIMIT; BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } } } pub(crate) fn too_many_states(given: usize) -> BuildError { let limit = StateID::LIMIT; BuildError { kind: BuildErrorKind::TooManyStates { given, limit } } } pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError { BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } } } pub(crate) fn invalid_capture_index(index: u32) -> BuildError { BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } } } #[cfg(feature = "syntax")] pub(crate) fn unsupported_captures() -> BuildError { BuildError { kind: BuildErrorKind::UnsupportedCaptures } } } #[cfg(feature = "std")] impl std::error::Error for BuildError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self.kind() { #[cfg(feature = "syntax")] BuildErrorKind::Syntax(ref err) => Some(err), BuildErrorKind::Captures(ref err) => Some(err), _ => None, } } } impl core::fmt::Display for BuildError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { match self.kind() { #[cfg(feature = "syntax")] BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"), BuildErrorKind::Captures(_) => { write!(f, "error with capture groups") } BuildErrorKind::Word(_) => { write!(f, "NFA contains Unicode word boundary") } BuildErrorKind::TooManyPatterns { given, limit 
} => write!( f, "attempted to compile {} patterns, \ which exceeds the limit of {}", given, limit, ), BuildErrorKind::TooManyStates { given, limit } => write!( f, "attempted to compile {} NFA states, \ which exceeds the limit of {}", given, limit, ), BuildErrorKind::ExceededSizeLimit { limit } => write!( f, "heap usage during NFA compilation exceeded limit of {}", limit, ), BuildErrorKind::InvalidCaptureIndex { index } => write!( f, "capture group index {} is invalid (too big or discontinuous)", index, ), #[cfg(feature = "syntax")] BuildErrorKind::UnsupportedCaptures => write!( f, "currently captures must be disabled when compiling \ a reverse NFA", ), } } } regex-automata-0.4.9/src/nfa/thompson/literal_trie.rs000064400000000000000000000526551046102023000210250ustar 00000000000000use core::mem; use alloc::{vec, vec::Vec}; use crate::{ nfa::thompson::{self, compiler::ThompsonRef, BuildError, Builder}, util::primitives::{IteratorIndexExt, StateID}, }; /// A trie that preserves leftmost-first match semantics. /// /// This is a purpose-built data structure for optimizing 'lit1|lit2|..|litN' /// patterns. It can *only* handle alternations of literals, which makes it /// somewhat restricted in its scope, but literal alternations are fairly /// common. /// /// At a 5,000 foot level, the main idea of this trie is make an alternation of /// literals look more like a DFA than an NFA via epsilon removal. /// /// More precisely, the main issue is in how alternations are compiled into /// a Thompson NFA. Namely, each alternation gets a single NFA "union" state /// with an epsilon transition for every branch of the alternation pointing to /// an NFA state corresponding to the start of that branch. The main problem /// with this representation is the cost of computing an epsilon closure. Once /// you hit the alternation's start state, it acts as a sort of "clog" that /// requires you to traverse all of the epsilon transitions to compute the full /// closure. 
/// /// While fixing such clogs in the general case is pretty tricky without going /// to a DFA (or perhaps a Glushkov NFA, but that comes with other problems). /// But at least in the case of an alternation of literals, we can convert /// that to a prefix trie without too much cost. In theory, that's all you /// really need to do: build the trie and then compile it to a Thompson NFA. /// For example, if you have the pattern 'bar|baz|foo', then using a trie, it /// is transformed to something like 'b(a(r|z))|f'. This reduces the clog by /// reducing the number of epsilon transitions out of the alternation's start /// state from 3 to 2 (it actually gets down to 1 when you use a sparse state, /// which we do below). It's a small effect here, but when your alternation is /// huge, the savings is also huge. /// /// And that is... essentially what a LiteralTrie does. But there is one /// hiccup. Consider a regex like 'sam|samwise'. How does a prefix trie compile /// that when leftmost-first semantics are used? If 'sam|samwise' was the /// entire regex, then you could just drop the 'samwise' branch entirely since /// it is impossible to match ('sam' will always take priority, and since it /// is a prefix of 'samwise', 'samwise' will never match). But what about the /// regex '\b(sam|samwise)\b'? In that case, you can't remove 'samwise' because /// it might match when 'sam' doesn't fall on a word boundary. /// /// The main idea is that 'sam|samwise' can be translated to 'sam(?:|wise)', /// which is a precisely equivalent regex that also gets rid of the clog. /// /// Another example is 'zapper|z|zap'. That gets translated to /// 'z(?:apper||ap)'. /// /// We accomplish this by giving each state in the trie multiple "chunks" of /// transitions. Each chunk barrier represents a match. The idea is that once /// you know a match occurs, none of the transitions after the match can be /// re-ordered and mixed in with the transitions before the match. 
Otherwise, /// the match semantics could be changed. /// /// See the 'State' data type for a bit more detail. /// /// Future work: /// /// * In theory, it would be nice to generalize the idea of removing clogs and /// apply it to the NFA graph itself. Then this could in theory work for /// case insensitive alternations of literals, or even just alternations where /// each branch starts with a non-epsilon transition. /// * Could we instead use the Aho-Corasick algorithm here? The aho-corasick /// crate deals with leftmost-first matches correctly, but I think this implies /// encoding failure transitions into a Thompson NFA somehow. Which seems fine, /// because failure transitions are just unconditional epsilon transitions? /// * Or perhaps even better, could we use an aho_corasick::AhoCorasick /// directly? At time of writing, 0.7 is the current version of the /// aho-corasick crate, and that definitely cannot be used as-is. But if we /// expose the underlying finite state machine API, then could we use it? That /// would be super. If we could figure that out, it might also lend itself to /// more general composition of finite state machines. #[derive(Clone)] pub(crate) struct LiteralTrie { /// The set of trie states. Each state contains one or more chunks, where /// each chunk is a sparse set of transitions to other states. A leaf state /// is always a match state that contains only empty chunks (i.e., no /// transitions). states: Vec, /// Whether to add literals in reverse to the trie. Useful when building /// a reverse NFA automaton. rev: bool, } impl LiteralTrie { /// Create a new literal trie that adds literals in the forward direction. pub(crate) fn forward() -> LiteralTrie { let root = State::default(); LiteralTrie { states: vec![root], rev: false } } /// Create a new literal trie that adds literals in reverse. 
pub(crate) fn reverse() -> LiteralTrie { let root = State::default(); LiteralTrie { states: vec![root], rev: true } } /// Add the given literal to this trie. /// /// If the literal could not be added because the `StateID` space was /// exhausted, then an error is returned. If an error returns, the trie /// is in an unspecified state. pub(crate) fn add(&mut self, bytes: &[u8]) -> Result<(), BuildError> { let mut prev = StateID::ZERO; let mut it = bytes.iter().copied(); while let Some(b) = if self.rev { it.next_back() } else { it.next() } { prev = self.get_or_add_state(prev, b)?; } self.states[prev].add_match(); Ok(()) } /// If the given transition is defined, then return the next state ID. /// Otherwise, add the transition to `from` and point it to a new state. /// /// If a new state ID could not be allocated, then an error is returned. fn get_or_add_state( &mut self, from: StateID, byte: u8, ) -> Result { let active = self.states[from].active_chunk(); match active.binary_search_by_key(&byte, |t| t.byte) { Ok(i) => Ok(active[i].next), Err(i) => { // Add a new state and get its ID. let next = StateID::new(self.states.len()).map_err(|_| { BuildError::too_many_states(self.states.len()) })?; self.states.push(State::default()); // Offset our position to account for all transitions and not // just the ones in the active chunk. let i = self.states[from].active_chunk_start() + i; let t = Transition { byte, next }; self.states[from].transitions.insert(i, t); Ok(next) } } } /// Compile this literal trie to the NFA builder given. /// /// This forwards any errors that may occur while using the given builder. pub(crate) fn compile( &self, builder: &mut Builder, ) -> Result { // Compilation proceeds via depth-first traversal of the trie. // // This is overall pretty brutal. The recursive version of this is // deliciously simple. (See 'compile_to_hir' below for what it might // look like.) 
But recursion on a trie means your call stack grows // in accordance with the longest literal, which just does not seem // appropriate. So we push the call stack to the heap. But as a result, // the trie traversal becomes pretty brutal because we essentially // have to encode the state of a double for-loop into an explicit call // frame. If someone can simplify this without using recursion, that'd // be great. // 'end' is our match state for this trie, but represented in the the // NFA. Any time we see a match in the trie, we insert a transition // from the current state we're in to 'end'. let end = builder.add_empty()?; let mut stack = vec![]; let mut f = Frame::new(&self.states[StateID::ZERO]); loop { if let Some(t) = f.transitions.next() { if self.states[t.next].is_leaf() { f.sparse.push(thompson::Transition { start: t.byte, end: t.byte, next: end, }); } else { f.sparse.push(thompson::Transition { start: t.byte, end: t.byte, // This is a little funny, but when the frame we create // below completes, it will pop this parent frame off // and modify this transition to point to the correct // state. next: StateID::ZERO, }); stack.push(f); f = Frame::new(&self.states[t.next]); } continue; } // At this point, we have visited all transitions in f.chunk, so // add it as a sparse NFA state. Unless the chunk was empty, in // which case, we don't do anything. if !f.sparse.is_empty() { let chunk_id = if f.sparse.len() == 1 { builder.add_range(f.sparse.pop().unwrap())? } else { let sparse = mem::replace(&mut f.sparse, vec![]); builder.add_sparse(sparse)? }; f.union.push(chunk_id); } // Now we need to look to see if there are other chunks to visit. if let Some(chunk) = f.chunks.next() { // If we're here, it means we're on the second (or greater) // chunk, which implies there is a match at this point. So // connect this state to the final end state. f.union.push(end); // Advance to the next chunk. 
f.transitions = chunk.iter(); continue; } // Now that we are out of chunks, we have completely visited // this state. So turn our union of chunks into an NFA union // state, and add that union state to the parent state's current // sparse state. (If there is no parent, we're done.) let start = builder.add_union(f.union)?; match stack.pop() { None => { return Ok(ThompsonRef { start, end }); } Some(mut parent) => { // OK because the only way a frame gets pushed on to the // stack (aside from the root) is when a transition has // been added to 'sparse'. parent.sparse.last_mut().unwrap().next = start; f = parent; } } } } /// Converts this trie to an equivalent HIR expression. /// /// We don't actually use this, but it's useful for tests. In particular, /// it provides a (somewhat) human readable representation of the trie /// itself. #[cfg(test)] fn compile_to_hir(&self) -> regex_syntax::hir::Hir { self.compile_state_to_hir(StateID::ZERO) } /// The recursive implementation of 'to_hir'. /// /// Notice how simple this is compared to 'compile' above. 'compile' could /// be similarly simple, but we opt to not use recursion in order to avoid /// overflowing the stack in the case of a longer literal. 
#[cfg(test)]
fn compile_state_to_hir(&self, sid: StateID) -> regex_syntax::hir::Hir {
    use regex_syntax::hir::Hir;

    let mut alt = vec![];
    for (i, chunk) in self.states[sid].chunks().enumerate() {
        // Every chunk after the first exists because the state was marked
        // as a match, which is represented here by an empty alternative.
        if i > 0 {
            alt.push(Hir::empty());
        }
        if chunk.is_empty() {
            continue;
        }
        // One branch per transition: the byte itself followed by whatever
        // the state it leads to compiles to.
        let chunk_alt: Vec<Hir> = chunk
            .iter()
            .map(|t| {
                Hir::concat(vec![
                    Hir::literal(vec![t.byte]),
                    self.compile_state_to_hir(t.next),
                ])
            })
            .collect();
        alt.push(Hir::alternation(chunk_alt));
    }
    Hir::alternation(alt)
}
}

impl core::fmt::Debug for LiteralTrie {
    /// Writes one line per trie state, prefixed by its zero-padded state ID.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        writeln!(f, "LiteralTrie(")?;
        for (sid, state) in self.states.iter().with_state_ids() {
            writeln!(f, "{:06?}: {:?}", sid.as_usize(), state)?;
        }
        writeln!(f, ")")?;
        Ok(())
    }
}

/// An explicit stack frame used for traversing the trie without using
/// recursion.
///
/// Each frame is tied to the traversal of a single trie state. The frame is
/// dropped once the entire state (and all of its children) have been visited.
/// The "output" of compiling a state is the 'union' vector, which is in turn
/// converted to an NFA union state. Each branch of the union corresponds to a
/// chunk in the trie state.
///
/// 'sparse' corresponds to the set of transitions for a particular chunk in a
/// trie state. It is ultimately converted to an NFA sparse state. The 'sparse'
/// field, after being converted to a sparse NFA state, is reused for any
/// subsequent chunks in the trie state, if any exist.
#[derive(Debug)]
struct Frame<'a> {
    /// The remaining chunks to visit for a trie state.
    chunks: StateChunksIter<'a>,
    /// The transitions of the current chunk that we're iterating over. Since
    /// every trie state has at least one chunk, every frame is initialized
    /// with the first chunk's transitions ready to be consumed.
    transitions: core::slice::Iter<'a, Transition>,
    /// The NFA state IDs pointing to the start of each chunk compiled by
    /// this trie state. This ultimately gets converted to an NFA union once
    /// the entire trie state (and all of its children) have been compiled.
    /// The order of these matters for leftmost-first match semantics, since
    /// earlier matches in the union are preferred over later ones.
    union: Vec<StateID>,
    /// The actual NFA transitions for a single chunk in a trie state. This
    /// gets converted to an NFA sparse state, and its corresponding NFA state
    /// ID should get added to 'union'.
    sparse: Vec<thompson::Transition>,
}

impl<'a> Frame<'a> {
    /// Create a new stack frame for trie traversal. This initializes the
    /// 'transitions' iterator to the transitions for the first chunk, with the
    /// 'chunks' iterator being every chunk after the first one.
    fn new(state: &'a State) -> Frame<'a> {
        let mut chunks = state.chunks();
        // every state has at least 1 chunk
        let chunk = chunks.next().unwrap();
        let transitions = chunk.iter();
        Frame { chunks, transitions, union: vec![], sparse: vec![] }
    }
}

/// A state in a trie.
///
/// This uses a sparse representation. Since we don't use literal tries
/// for searching (and compilation requires visiting every transition
/// anyway), we use a sparse representation for transitions. This
/// means we save on memory, at the expense of 'LiteralTrie::add' being perhaps
/// a bit slower.
///
/// While 'transitions' is pretty standard as far as tries go, the 'chunks'
/// piece here is more unusual. In effect, 'chunks' defines a partitioning
/// of 'transitions', where each chunk corresponds to a distinct set of
/// transitions. The key invariant is that a transition in one chunk cannot
/// be moved to another chunk. This is the secret sauce that preserves
/// leftmost-first match semantics.
///
/// A new chunk is added whenever we mark a state as a match state. Once a
/// new chunk is added, the old active chunk is frozen and is never mutated
/// again. The new chunk becomes the active chunk, which is defined as
/// '&transitions[chunks.last().map_or(0, |c| c.1)..]'.
Thus, a state where /// 'chunks' is empty actually contains one chunk. Thus, every state contains /// at least one (possibly empty) chunk. /// /// A "leaf" state is a state that has no outgoing transitions (so /// 'transitions' is empty). Note that there is no way for a leaf state to be a /// non-matching state. (Although while building the trie, within 'add', a leaf /// state may exist while not containing any matches. But this invariant is /// only broken within 'add'. Once 'add' returns, the invariant is upheld.) #[derive(Clone, Default)] struct State { transitions: Vec, chunks: Vec<(usize, usize)>, } impl State { /// Mark this state as a match state and freeze the active chunk such that /// it can not be further mutated. fn add_match(&mut self) { // This is not strictly necessary, but there's no point in recording // another match by adding another chunk if the state has no // transitions. Note though that we only skip this if we already know // this is a match state, which is only true if 'chunks' is not empty. // Basically, if we didn't do this, nothing semantically would change, // but we'd end up pushing another chunk and potentially triggering an // alloc. if self.transitions.is_empty() && !self.chunks.is_empty() { return; } let chunk_start = self.active_chunk_start(); let chunk_end = self.transitions.len(); self.chunks.push((chunk_start, chunk_end)); } /// Returns true if and only if this state is a leaf state. That is, a /// state that has no outgoing transitions. fn is_leaf(&self) -> bool { self.transitions.is_empty() } /// Returns an iterator over all of the chunks (including the currently /// active chunk) in this state. Since the active chunk is included, the /// iterator is guaranteed to always yield at least one chunk (although the /// chunk may be empty). 
fn chunks(&self) -> StateChunksIter<'_> { StateChunksIter { transitions: &*self.transitions, chunks: self.chunks.iter(), active: Some(self.active_chunk()), } } /// Returns the active chunk as a slice of transitions. fn active_chunk(&self) -> &[Transition] { let start = self.active_chunk_start(); &self.transitions[start..] } /// Returns the index into 'transitions' where the active chunk starts. fn active_chunk_start(&self) -> usize { self.chunks.last().map_or(0, |&(_, end)| end) } } impl core::fmt::Debug for State { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let mut spacing = " "; for (i, chunk) in self.chunks().enumerate() { if i > 0 { write!(f, "{}MATCH", spacing)?; } spacing = ""; for (j, t) in chunk.iter().enumerate() { spacing = " "; if j == 0 && i > 0 { write!(f, " ")?; } else if j > 0 { write!(f, ", ")?; } write!(f, "{:?}", t)?; } } Ok(()) } } /// An iterator over all of the chunks in a state, including the active chunk. /// /// This iterator is created by `State::chunks`. We name this iterator so that /// we can include it in the `Frame` type for non-recursive trie traversal. #[derive(Debug)] struct StateChunksIter<'a> { transitions: &'a [Transition], chunks: core::slice::Iter<'a, (usize, usize)>, active: Option<&'a [Transition]>, } impl<'a> Iterator for StateChunksIter<'a> { type Item = &'a [Transition]; fn next(&mut self) -> Option<&'a [Transition]> { if let Some(&(start, end)) = self.chunks.next() { return Some(&self.transitions[start..end]); } if let Some(chunk) = self.active.take() { return Some(chunk); } None } } /// A single transition in a trie to another state. 
#[derive(Clone, Copy)]
struct Transition {
    /// The byte that labels this transition.
    byte: u8,
    /// The ID of the trie state this transition leads to.
    next: StateID,
}

impl core::fmt::Debug for Transition {
    /// Formats this transition as `<escaped byte> => <next state ID>`.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(
            f,
            "{:?} => {}",
            crate::util::escape::DebugByte(self.byte),
            self.next.as_usize()
        )
    }
}

#[cfg(test)]
mod tests {
    use bstr::B;
    use regex_syntax::hir::Hir;

    use super::*;

    // 'z' is added after 'zapper' and before 'zap', so the expected HIR puts
    // the empty alternative between the two longer literals.
    #[test]
    fn zap() {
        let mut trie = LiteralTrie::forward();
        trie.add(b"zapper").unwrap();
        trie.add(b"z").unwrap();
        trie.add(b"zap").unwrap();

        let got = trie.compile_to_hir();
        let expected = Hir::concat(vec![
            Hir::literal(B("z")),
            Hir::alternation(vec![
                Hir::literal(B("apper")),
                Hir::empty(),
                Hir::literal(B("ap")),
            ]),
        ]);
        assert_eq!(expected, got);
    }

    // 'maker' extends 'make', which was added before 'maple'. The expected
    // HIR keeps both 'make' branches ahead of 'maple'.
    #[test]
    fn maker() {
        let mut trie = LiteralTrie::forward();
        trie.add(b"make").unwrap();
        trie.add(b"maple").unwrap();
        trie.add(b"maker").unwrap();

        let got = trie.compile_to_hir();
        let expected = Hir::concat(vec![
            Hir::literal(B("ma")),
            Hir::alternation(vec![
                Hir::concat(vec![
                    Hir::literal(B("ke")),
                    Hir::alternation(vec![Hir::empty(), Hir::literal(B("r"))]),
                ]),
                Hir::literal(B("ple")),
            ]),
        ]);
        assert_eq!(expected, got);
    }
}
regex-automata-0.4.9/src/nfa/thompson/map.rs000064400000000000000000000277451046102023000171250ustar 00000000000000
// This module contains a couple simple and purpose built hash maps. The key
// trade off they make is that they serve as caches rather than true maps. That
// is, inserting a new entry may cause eviction of another entry. This gives
// us two things. First, there's less overhead associated with inserts and
// lookups. Secondly, it lets us control our memory usage.
//
// These maps are used in some fairly hot code when generating NFA states for
// large Unicode character classes.
//
// Instead of exposing a rich hashmap entry API, we just permit the caller to
// produce a hash of the key directly. The hash can then be reused for both
// lookups and insertions at the cost of leaking abstraction a bit.
But these // are for internal use only, so it's fine. // // The Utf8BoundedMap is used for Daciuk's algorithm for constructing a // (almost) minimal DFA for large Unicode character classes in linear time. // (Daciuk's algorithm is always used when compiling forward NFAs. For reverse // NFAs, it's only used when the compiler is configured to 'shrink' the NFA, // since there's a bit more expense in the reverse direction.) // // The Utf8SuffixMap is used when compiling large Unicode character classes for // reverse NFAs when 'shrink' is disabled. Specifically, it augments the naive // construction of UTF-8 automata by caching common suffixes. This doesn't // get the same space savings as Daciuk's algorithm, but it's basically as // fast as the naive approach and typically winds up using less memory (since // it generates smaller NFAs) despite the presence of the cache. // // These maps effectively represent caching mechanisms for sparse and // byte-range NFA states, respectively. The former represents a single NFA // state with many transitions of equivalent priority while the latter // represents a single NFA state with a single transition. (Neither state ever // has or is an epsilon transition.) Thus, they have different key types. It's // likely we could make one generic map, but the machinery didn't seem worth // it. They are simple enough. use alloc::{vec, vec::Vec}; use crate::{ nfa::thompson::Transition, util::{ int::{Usize, U64}, primitives::StateID, }, }; // Basic FNV-1a hash constants as described in: // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function const PRIME: u64 = 1099511628211; const INIT: u64 = 14695981039346656037; /// A bounded hash map where the key is a sequence of NFA transitions and the /// value is a pre-existing NFA state ID. /// /// std's hashmap can be used for this, however, this map has two important /// advantages. Firstly, it has lower overhead. 
Secondly, it permits us to /// control our memory usage by limited the number of slots. In general, the /// cost here is that this map acts as a cache. That is, inserting a new entry /// may remove an old entry. We are okay with this, since it does not impact /// correctness in the cases where it is used. The only effect that dropping /// states from the cache has is that the resulting NFA generated may be bigger /// than it otherwise would be. /// /// This improves benchmarks that compile large Unicode character classes, /// since it makes the generation of (almost) minimal UTF-8 automaton faster. /// Specifically, one could observe the difference with std's hashmap via /// something like the following benchmark: /// /// hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" /// /// But to observe that difference, you'd have to modify the code to use /// std's hashmap. /// /// It is quite possible that there is a better way to approach this problem. /// For example, if there happens to be a very common state that collides with /// a lot of less frequent states, then we could wind up with very poor caching /// behavior. Alas, the effectiveness of this cache has not been measured. /// Instead, ad hoc experiments suggest that it is "good enough." Additional /// smarts (such as an LRU eviction policy) have to be weighed against the /// amount of extra time they cost. #[derive(Clone, Debug)] pub struct Utf8BoundedMap { /// The current version of this map. Only entries with matching versions /// are considered during lookups. If an entry is found with a mismatched /// version, then the map behaves as if the entry does not exist. /// /// This makes it possible to clear the map by simply incrementing the /// version number instead of actually deallocating any storage. version: u16, /// The total number of entries this map can store. capacity: usize, /// The actual entries, keyed by hash. 
Collisions between different states /// result in the old state being dropped. map: Vec, } /// An entry in this map. #[derive(Clone, Debug, Default)] struct Utf8BoundedEntry { /// The version of the map used to produce this entry. If this entry's /// version does not match the current version of the map, then the map /// should behave as if this entry does not exist. version: u16, /// The key, which is a sorted sequence of non-overlapping NFA transitions. key: Vec, /// The state ID corresponding to the state containing the transitions in /// this entry. val: StateID, } impl Utf8BoundedMap { /// Create a new bounded map with the given capacity. The map will never /// grow beyond the given size. /// /// Note that this does not allocate. Instead, callers must call `clear` /// before using this map. `clear` will allocate space if necessary. /// /// This avoids the need to pay for the allocation of this map when /// compiling regexes that lack large Unicode character classes. pub fn new(capacity: usize) -> Utf8BoundedMap { assert!(capacity > 0); Utf8BoundedMap { version: 0, capacity, map: vec![] } } /// Clear this map of all entries, but permit the reuse of allocation /// if possible. /// /// This must be called before the map can be used. pub fn clear(&mut self) { if self.map.is_empty() { self.map = vec![Utf8BoundedEntry::default(); self.capacity]; } else { self.version = self.version.wrapping_add(1); // If we loop back to version 0, then we forcefully clear the // entire map. Otherwise, it might be possible to incorrectly // match entries used to generate other NFAs. if self.version == 0 { self.map = vec![Utf8BoundedEntry::default(); self.capacity]; } } } /// Return a hash of the given transitions. 
pub fn hash(&self, key: &[Transition]) -> usize { let mut h = INIT; for t in key { h = (h ^ u64::from(t.start)).wrapping_mul(PRIME); h = (h ^ u64::from(t.end)).wrapping_mul(PRIME); h = (h ^ t.next.as_u64()).wrapping_mul(PRIME); } (h % self.map.len().as_u64()).as_usize() } /// Retrieve the cached state ID corresponding to the given key. The hash /// given must have been computed with `hash` using the same key value. /// /// If there is no cached state with the given transitions, then None is /// returned. pub fn get(&mut self, key: &[Transition], hash: usize) -> Option { let entry = &self.map[hash]; if entry.version != self.version { return None; } // There may be a hash collision, so we need to confirm real equality. if entry.key != key { return None; } Some(entry.val) } /// Add a cached state to this map with the given key. Callers should /// ensure that `state_id` points to a state that contains precisely the /// NFA transitions given. /// /// `hash` must have been computed using the `hash` method with the same /// key. pub fn set( &mut self, key: Vec, hash: usize, state_id: StateID, ) { self.map[hash] = Utf8BoundedEntry { version: self.version, key, val: state_id }; } } /// A cache of suffixes used to modestly compress UTF-8 automata for large /// Unicode character classes. #[derive(Clone, Debug)] pub struct Utf8SuffixMap { /// The current version of this map. Only entries with matching versions /// are considered during lookups. If an entry is found with a mismatched /// version, then the map behaves as if the entry does not exist. version: u16, /// The total number of entries this map can store. capacity: usize, /// The actual entries, keyed by hash. Collisions between different states /// result in the old state being dropped. map: Vec, } /// A key that uniquely identifies an NFA state. It is a triple that represents /// a transition from one state for a particular byte range. 
#[derive(Clone, Debug, Default, Eq, PartialEq)] pub struct Utf8SuffixKey { pub from: StateID, pub start: u8, pub end: u8, } /// An entry in this map. #[derive(Clone, Debug, Default)] struct Utf8SuffixEntry { /// The version of the map used to produce this entry. If this entry's /// version does not match the current version of the map, then the map /// should behave as if this entry does not exist. version: u16, /// The key, which consists of a transition in a particular state. key: Utf8SuffixKey, /// The identifier that the transition in the key maps to. val: StateID, } impl Utf8SuffixMap { /// Create a new bounded map with the given capacity. The map will never /// grow beyond the given size. /// /// Note that this does not allocate. Instead, callers must call `clear` /// before using this map. `clear` will allocate space if necessary. /// /// This avoids the need to pay for the allocation of this map when /// compiling regexes that lack large Unicode character classes. pub fn new(capacity: usize) -> Utf8SuffixMap { assert!(capacity > 0); Utf8SuffixMap { version: 0, capacity, map: vec![] } } /// Clear this map of all entries, but permit the reuse of allocation /// if possible. /// /// This must be called before the map can be used. pub fn clear(&mut self) { if self.map.is_empty() { self.map = vec![Utf8SuffixEntry::default(); self.capacity]; } else { self.version = self.version.wrapping_add(1); if self.version == 0 { self.map = vec![Utf8SuffixEntry::default(); self.capacity]; } } } /// Return a hash of the given transition. 
pub fn hash(&self, key: &Utf8SuffixKey) -> usize { // Basic FNV-1a hash as described: // https://en.wikipedia.org/wiki/Fowler%E2%80%93Noll%E2%80%93Vo_hash_function const PRIME: u64 = 1099511628211; const INIT: u64 = 14695981039346656037; let mut h = INIT; h = (h ^ key.from.as_u64()).wrapping_mul(PRIME); h = (h ^ u64::from(key.start)).wrapping_mul(PRIME); h = (h ^ u64::from(key.end)).wrapping_mul(PRIME); (h % self.map.len().as_u64()).as_usize() } /// Retrieve the cached state ID corresponding to the given key. The hash /// given must have been computed with `hash` using the same key value. /// /// If there is no cached state with the given key, then None is returned. pub fn get( &mut self, key: &Utf8SuffixKey, hash: usize, ) -> Option { let entry = &self.map[hash]; if entry.version != self.version { return None; } if key != &entry.key { return None; } Some(entry.val) } /// Add a cached state to this map with the given key. Callers should /// ensure that `state_id` points to a state that contains precisely the /// NFA transition given. /// /// `hash` must have been computed using the `hash` method with the same /// key. pub fn set(&mut self, key: Utf8SuffixKey, hash: usize, state_id: StateID) { self.map[hash] = Utf8SuffixEntry { version: self.version, key, val: state_id }; } } regex-automata-0.4.9/src/nfa/thompson/mod.rs000064400000000000000000000072031046102023000171120ustar 00000000000000/*! Defines a Thompson NFA and provides the [`PikeVM`](pikevm::PikeVM) and [`BoundedBacktracker`](backtrack::BoundedBacktracker) regex engines. A Thompson NFA (non-deterministic finite automaton) is arguably _the_ central data type in this library. It is the result of what is commonly referred to as "regex compilation." That is, turning a regex pattern from its concrete syntax string into something that can run a search looks roughly like this: * A `&str` is parsed into a [`regex-syntax::ast::Ast`](regex_syntax::ast::Ast). 
* An `Ast` is translated into a [`regex-syntax::hir::Hir`](regex_syntax::hir::Hir). * An `Hir` is compiled into a [`NFA`]. * The `NFA` is then used to build one of a few different regex engines: * An `NFA` is used directly in the `PikeVM` and `BoundedBacktracker` engines. * An `NFA` is used by a [hybrid NFA/DFA](crate::hybrid) to build out a DFA's transition table at search time. * An `NFA`, assuming it is one-pass, is used to build a full [one-pass DFA](crate::dfa::onepass) ahead of time. * An `NFA` is used to build a [full DFA](crate::dfa) ahead of time. The [`meta`](crate::meta) regex engine makes all of these choices for you based on various criteria. However, if you have a lower level use case, _you_ can build any of the above regex engines and use them directly. But you must start here by building an `NFA`. # Details It is perhaps worth expanding a bit more on what it means to go through the `&str`->`Ast`->`Hir`->`NFA` process. * Parsing a string into an `Ast` gives it a structured representation. Crucially, the size and amount of work done in this step is proportional to the size of the original string. No optimization or Unicode handling is done at this point. This means that parsing into an `Ast` has very predictable costs. Moreover, an `Ast` can be roundtripped back to its original pattern string as written. * Translating an `Ast` into an `Hir` is a process by which the structured representation is simplified down to its most fundamental components. Translation deals with flags such as case insensitivity by converting things like `(?i:a)` to `[Aa]`. Translation is also where Unicode tables are consulted to resolve things like `\p{Emoji}` and `\p{Greek}`. It also flattens each character class, regardless of how deeply nested it is, into a single sequence of non-overlapping ranges. All the various literal forms are thrown out in favor of one common representation. 
Overall, the `Hir` is small enough to fit into your head and makes analysis and other tasks much simpler. * Compiling an `Hir` into an `NFA` formulates the regex into a finite state machine whose transitions are defined over bytes. For example, an `Hir` might have a Unicode character class corresponding to a sequence of ranges defined in terms of `char`. Compilation is then responsible for turning those ranges into a UTF-8 automaton. That is, an automaton that matches the UTF-8 encoding of just the codepoints specified by those ranges. Otherwise, the main job of an `NFA` is to serve as a byte-code of sorts for a virtual machine. It can be seen as a sequence of instructions for how to match a regex. */ #[cfg(feature = "nfa-backtrack")] pub mod backtrack; mod builder; #[cfg(feature = "syntax")] mod compiler; mod error; #[cfg(feature = "syntax")] mod literal_trie; #[cfg(feature = "syntax")] mod map; mod nfa; #[cfg(feature = "nfa-pikevm")] pub mod pikevm; #[cfg(feature = "syntax")] mod range_trie; pub use self::{ builder::Builder, error::BuildError, nfa::{ DenseTransitions, PatternIter, SparseTransitions, State, Transition, NFA, }, }; #[cfg(feature = "syntax")] pub use compiler::{Compiler, Config, WhichCaptures}; regex-automata-0.4.9/src/nfa/thompson/nfa.rs000064400000000000000000002452411046102023000171050ustar 00000000000000use core::{fmt, mem}; use alloc::{boxed::Box, format, string::String, sync::Arc, vec, vec::Vec}; #[cfg(feature = "syntax")] use crate::nfa::thompson::{ compiler::{Compiler, Config}, error::BuildError, }; use crate::{ nfa::thompson::builder::Builder, util::{ alphabet::{self, ByteClassSet, ByteClasses}, captures::{GroupInfo, GroupInfoError}, look::{Look, LookMatcher, LookSet}, primitives::{ IteratorIndexExt, PatternID, PatternIDIter, SmallIndex, StateID, }, sparse_set::SparseSet, }, }; /// A byte oriented Thompson non-deterministic finite automaton (NFA). 
/// /// A Thompson NFA is a finite state machine that permits unconditional epsilon /// transitions, but guarantees that there exists at most one non-epsilon /// transition for each element in the alphabet for each state. /// /// An NFA may be used directly for searching, for analysis or to build /// a deterministic finite automaton (DFA). /// /// # Cheap clones /// /// Since an NFA is a core data type in this crate that many other regex /// engines are based on top of, it is convenient to give ownership of an NFA /// to said regex engines. Because of this, an NFA uses reference counting /// internally. Therefore, it is cheap to clone and it is encouraged to do so. /// /// # Capabilities /// /// Using an NFA for searching via the /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) provides the most amount /// of "power" of any regex engine in this crate. Namely, it supports the /// following in all cases: /// /// 1. Detection of a match. /// 2. Location of a match, including both the start and end offset, in a /// single pass of the haystack. /// 3. Location of matching capturing groups. /// 4. Handles multiple patterns, including (1)-(3) when multiple patterns are /// present. /// /// # Capturing Groups /// /// Groups refer to parenthesized expressions inside a regex pattern. They look /// like this, where `exp` is an arbitrary regex: /// /// * `(exp)` - An unnamed capturing group. /// * `(?P<name>exp)` or `(?<name>exp)` - A named capturing group. /// * `(?:exp)` - A non-capturing group. /// * `(?i:exp)` - A non-capturing group that sets flags. /// /// Only the first two forms are said to be _capturing_. Capturing /// means that the last position at which they match is reportable. The /// [`Captures`](crate::util::captures::Captures) type provides convenient /// access to the match positions of capturing groups, which includes looking /// up capturing groups by their name.
/// /// # Byte oriented /// /// This NFA is byte oriented, which means that all of its transitions are /// defined on bytes. In other words, the alphabet of an NFA consists of the /// 256 different byte values. /// /// While DFAs nearly demand that they be byte oriented for performance /// reasons, an NFA could conceivably be *Unicode codepoint* oriented. Indeed, /// a previous version of this NFA supported both byte and codepoint oriented /// modes. A codepoint oriented mode can work because an NFA fundamentally uses /// a sparse representation of transitions, which works well with the large /// sparse space of Unicode codepoints. /// /// Nevertheless, this NFA is only byte oriented. This choice is primarily /// driven by implementation simplicity, and also in part memory usage. In /// practice, performance between the two is roughly comparable. However, /// building a DFA (including a hybrid DFA) really wants a byte oriented NFA. /// So if we do have a codepoint oriented NFA, then we also need to generate /// a byte oriented NFA in order to build a hybrid NFA/DFA. Thus, by only /// generating byte oriented NFAs, we can produce one less NFA. In other words, /// if we made our NFA codepoint oriented, we'd need to *also* make it support /// a byte oriented mode, which is more complicated. But a byte oriented mode /// can support everything. /// /// # Differences with DFAs /// /// At the theoretical level, the precise difference between an NFA and a DFA /// is that, in a DFA, for every state, an input symbol unambiguously refers /// to a single transition _and_ that an input symbol is required for each /// transition. At a practical level, this permits DFA implementations to be /// implemented at their core with a small constant number of CPU instructions /// for each byte of input searched. In practice, this makes them quite a bit /// faster than NFAs _in general_.
Namely, in order to execute a search for any /// Thompson NFA, one needs to keep track of a _set_ of states, and execute /// the possible transitions on all of those states for each input symbol. /// Overall, this results in much more overhead. To a first approximation, one /// can expect DFA searches to be about an order of magnitude faster. /// /// So why use an NFA at all? The main advantage of an NFA is that it takes /// linear time (in the size of the pattern string after repetitions have been /// expanded) to build and linear memory usage. A DFA, on the other hand, may /// take exponential time and/or space to build. Even in non-pathological /// cases, DFAs often take quite a bit more memory than their NFA counterparts, /// _especially_ if large Unicode character classes are involved. Of course, /// an NFA also provides additional capabilities. For example, it can match /// Unicode word boundaries on non-ASCII text and resolve the positions of /// capturing groups. /// /// Note that a [`hybrid::regex::Regex`](crate::hybrid::regex::Regex) strikes a /// good balance between an NFA and a DFA. It avoids the exponential build time /// of a DFA while maintaining its fast search time. The downside of a hybrid /// NFA/DFA is that in some cases it can be slower at search time than the NFA. /// (It also has less functionality than a pure NFA. It cannot handle Unicode /// word boundaries on non-ASCII text and cannot resolve capturing groups.) /// /// # Example /// /// This shows how to build an NFA with the default configuration and execute a /// search using the Pike VM. 
/// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new(r"foo[0-9]+")?; /// let mut cache = re.create_cache(); /// let mut caps = re.create_captures(); /// /// let expected = Some(Match::must(0, 0..8)); /// re.captures(&mut cache, b"foo12345", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: resolving capturing groups /// /// This example shows how to parse some simple dates and extract the /// components of each date via capturing groups. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::captures::Captures, /// }; /// /// let vm = PikeVM::new(r"(?P\d{4})-(?P\d{2})-(?P\d{2})")?; /// let mut cache = vm.create_cache(); /// /// let haystack = "2012-03-14, 2013-01-01 and 2014-07-05"; /// let all: Vec = vm.captures_iter( /// &mut cache, haystack.as_bytes() /// ).collect(); /// // There should be a total of 3 matches. /// assert_eq!(3, all.len()); /// // The year from the second match is '2013'. /// let span = all[1].get_group_by_name("y").unwrap(); /// assert_eq!("2013", &haystack[span]); /// /// # Ok::<(), Box>(()) /// ``` /// /// This example shows that only the last match of a capturing group is /// reported, even if it had to match multiple times for an overall match /// to occur. /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; /// /// let re = PikeVM::new(r"([a-z]){4}")?; /// let mut cache = re.create_cache(); /// let mut caps = re.create_captures(); /// /// let haystack = b"quux"; /// re.captures(&mut cache, haystack, &mut caps); /// assert!(caps.is_match()); /// assert_eq!(Some(Span::from(3..4)), caps.get_group(1)); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone)] pub struct NFA( // We make NFAs reference counted primarily for two reasons. 
First is that // the NFA type itself is quite large (at least 0.5KB), and so it makes // sense to put it on the heap by default anyway. Second is that, for Arc // specifically, this enables cheap clones. This tends to be useful because // several structures (the backtracker, the Pike VM, the hybrid NFA/DFA) // all want to hang on to an NFA for use during search time. We could // provide the NFA at search time via a function argument, but this makes // for an unnecessarily annoying API. Instead, we just let each structure // share ownership of the NFA. Using a deep clone would not be smart, since // the NFA can use quite a bit of heap space. Arc, ); impl NFA { /// Parse the given regular expression using a default configuration and /// build an NFA from it. /// /// If you want a non-default configuration, then use the NFA /// [`Compiler`] with a [`Config`]. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new(r"foo[0-9]+")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// let expected = Some(Match::must(0, 0..8)); /// re.captures(&mut cache, b"foo12345", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result { NFA::compiler().build(pattern) } /// Parse the given regular expressions using a default configuration and /// build a multi-NFA from them. /// /// If you want a non-default configuration, then use the NFA /// [`Compiler`] with a [`Config`]. 
/// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+"])?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// let expected = Some(Match::must(1, 0..3)); /// re.captures(&mut cache, b"foo12345bar", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new_many>(patterns: &[P]) -> Result { NFA::compiler().build_many(patterns) } /// Returns an NFA with a single regex pattern that always matches at every /// position. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; /// /// let re = PikeVM::new_from_nfa(NFA::always_match())?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// let expected = Some(Match::must(0, 0..0)); /// re.captures(&mut cache, b"", &mut caps); /// assert_eq!(expected, caps.get_match()); /// re.captures(&mut cache, b"foo", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn always_match() -> NFA { // We could use NFA::new("") here and we'd get the same semantics, but // hand-assembling the NFA (as below) does the same thing with a fewer // number of states. It also avoids needing the 'syntax' feature // enabled. // // Technically all we need is the "match" state, but we add the // "capture" states so that the PikeVM can use this NFA. // // The unwraps below are OK because we add so few states that they will // never exhaust any default limits in any environment. 
let mut builder = Builder::new(); let pid = builder.start_pattern().unwrap(); assert_eq!(pid.as_usize(), 0); let start_id = builder.add_capture_start(StateID::ZERO, 0, None).unwrap(); let end_id = builder.add_capture_end(StateID::ZERO, 0).unwrap(); let match_id = builder.add_match().unwrap(); builder.patch(start_id, end_id).unwrap(); builder.patch(end_id, match_id).unwrap(); let pid = builder.finish_pattern(start_id).unwrap(); assert_eq!(pid.as_usize(), 0); builder.build(start_id, start_id).unwrap() } /// Returns an NFA that never matches at any position. /// /// This is a convenience routine for creating an NFA with zero patterns. /// /// # Example /// /// ``` /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; /// /// let re = PikeVM::new_from_nfa(NFA::never_match())?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, b"", &mut caps); /// assert!(!caps.is_match()); /// re.captures(&mut cache, b"foo", &mut caps); /// assert!(!caps.is_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn never_match() -> NFA { // This always succeeds because it only requires one NFA state, which // will never exhaust any (default) limits. let mut builder = Builder::new(); let sid = builder.add_fail().unwrap(); builder.build(sid, sid).unwrap() } /// Return a default configuration for an `NFA`. /// /// This is a convenience routine to avoid needing to import the `Config` /// type when customizing the construction of an NFA. /// /// # Example /// /// This example shows how to build an NFA with a small size limit that /// results in a compilation error for any regex that tries to use more /// heap memory than the configured limit. /// /// ``` /// use regex_automata::nfa::thompson::{NFA, pikevm::PikeVM}; /// /// let result = PikeVM::builder() /// .thompson(NFA::config().nfa_size_limit(Some(1_000))) /// // Remember, \w is Unicode-aware by default and thus huge. 
///     .build(r"\w+");
/// assert!(result.is_err());
/// ```
#[cfg(feature = "syntax")]
pub fn config() -> Config {
    Config::new()
}

/// Create a new [`Compiler`] for configuring how an `NFA` is built.
///
/// This exists so that common cases can reach the compiler through `NFA`
/// without importing the [`Compiler`] type directly.
///
/// # Example
///
/// This example shows how to build an NFA that is permitted to match
/// invalid UTF-8. Without the additional syntax configuration here,
/// compilation of `(?-u:.)` would fail because it is permitted to match
/// invalid UTF-8.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     util::syntax,
///     Match,
/// };
///
/// let re = PikeVM::builder()
///     .syntax(syntax::Config::new().utf8(false))
///     .build(r"[a-z]+(?-u:.)")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// let expected = Some(Match::must(0, 1..5));
/// re.captures(&mut cache, b"\xFFabc\xFF", &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "syntax")]
pub fn compiler() -> Compiler {
    Compiler::new()
}

/// Iterate over the identifier of every pattern in this NFA.
///
/// Identifiers are handed out sequentially beginning at zero, following
/// the order in which the patterns were given to the [`NFA::new_many`]
/// constructor. The iterator yields exactly [`NFA::pattern_len`] items.
///
/// # Example
///
/// ```
/// use regex_automata::{nfa::thompson::NFA, PatternID};
///
/// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
/// let pids: Vec<PatternID> = nfa.patterns().collect();
/// assert_eq!(pids, vec![
///     PatternID::must(0),
///     PatternID::must(1),
///     PatternID::must(2),
/// ]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn patterns(&self) -> PatternIter<'_> {
    let it = PatternID::iter(self.pattern_len());
    PatternIter { it, _marker: core::marker::PhantomData }
}

/// Returns the total number of regex patterns in this NFA.
///
/// This may return zero if the NFA was constructed with no patterns.
In /// this case, the NFA can never produce a match for any input. /// /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because /// NFA construction will fail if too many patterns are added. /// /// It is always true that `nfa.patterns().count() == nfa.pattern_len()`. /// /// # Example /// /// ``` /// use regex_automata::nfa::thompson::NFA; /// /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; /// assert_eq!(3, nfa.pattern_len()); /// /// let nfa = NFA::never_match(); /// assert_eq!(0, nfa.pattern_len()); /// /// let nfa = NFA::always_match(); /// assert_eq!(1, nfa.pattern_len()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn pattern_len(&self) -> usize { self.0.start_pattern.len() } /// Return the state identifier of the initial anchored state of this NFA. /// /// The returned identifier is guaranteed to be a valid index into the /// slice returned by [`NFA::states`], and is also a valid argument to /// [`NFA::state`]. /// /// # Example /// /// This example shows a somewhat contrived example where we can easily /// predict the anchored starting state. /// /// ``` /// use regex_automata::nfa::thompson::{NFA, State, WhichCaptures}; /// /// let nfa = NFA::compiler() /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("a")?; /// let state = nfa.state(nfa.start_anchored()); /// match *state { /// State::ByteRange { trans } => { /// assert_eq!(b'a', trans.start); /// assert_eq!(b'a', trans.end); /// } /// _ => unreachable!("unexpected state"), /// } /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn start_anchored(&self) -> StateID { self.0.start_anchored } /// Return the state identifier of the initial unanchored state of this /// NFA. /// /// This is equivalent to the identifier returned by /// [`NFA::start_anchored`] when the NFA has no unanchored starting state. 
///
/// The returned identifier is guaranteed to be a valid index into the
/// slice returned by [`NFA::states`], and is also a valid argument to
/// [`NFA::state`].
///
/// # Example
///
/// This example shows that the anchored and unanchored starting states
/// are equivalent when an anchored NFA is built.
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::new("^a")?;
/// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn start_unanchored(&self) -> StateID {
    self.0.start_unanchored
}

/// Return the state identifier of the initial anchored state for the given
/// pattern, or `None` if there is no pattern corresponding to the given
/// identifier.
///
/// If one uses the starting state for a particular pattern, then the only
/// match that can be returned is for the corresponding pattern.
///
/// The returned identifier is guaranteed to be a valid index into the
/// slice returned by [`NFA::states`], and is also a valid argument to
/// [`NFA::state`].
///
/// # Missing patterns
///
/// If the pattern doesn't exist in this NFA, then this returns `None`
/// (it neither panics nor reports an error). This occurs when
/// `pid.as_usize() >= nfa.pattern_len()`.
///
/// # Example
///
/// This example shows that the anchored starting state of each pattern
/// is distinct from the anchored starting state of the NFA as a whole,
/// and that asking for a pattern that isn't in the NFA yields `None`.
///
/// ```
/// use regex_automata::{nfa::thompson::NFA, PatternID};
///
/// let nfa = NFA::new_many(&["^a", "^b"])?;
/// // The anchored and unanchored states for the entire NFA are the same,
/// // since all of the patterns are anchored.
/// assert_eq!(nfa.start_anchored(), nfa.start_unanchored());
/// // But the anchored starting states for each pattern are distinct,
/// // because these starting states can only lead to matches for the
/// // corresponding pattern.
/// let anchored = Some(nfa.start_anchored());
/// assert_ne!(anchored, nfa.start_pattern(PatternID::must(0)));
/// assert_ne!(anchored, nfa.start_pattern(PatternID::must(1)));
/// // Requesting a pattern not in the NFA will result in None:
/// assert_eq!(None, nfa.start_pattern(PatternID::must(2)));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn start_pattern(&self, pid: PatternID) -> Option<StateID> {
    // A checked lookup, so an out-of-range pattern ID produces `None`
    // instead of an indexing panic.
    self.0.start_pattern.get(pid.as_usize()).copied()
}

/// Get the byte class set for this NFA.
///
/// A byte class set is a partitioning of this NFA's alphabet into
/// equivalence classes. Any two bytes in the same equivalence class are
/// guaranteed to never discriminate between a match or a non-match. (The
/// partitioning may not be minimal.)
///
/// Byte classes are used internally by this crate when building DFAs.
/// Namely, among other optimizations, they enable a space optimization
/// where the DFA's internal alphabet is defined over the equivalence
/// classes of bytes instead of all possible byte values. The former is
/// often quite a bit smaller than the latter, which permits the DFA to use
/// less space for its transition table.
#[inline]
pub(crate) fn byte_class_set(&self) -> &ByteClassSet {
    &self.0.byte_class_set
}

/// Get the byte classes for this NFA.
///
/// Byte classes represent a partitioning of this NFA's alphabet into
/// equivalence classes. Any two bytes in the same equivalence class are
/// guaranteed to never discriminate between a match or a non-match. (The
/// partitioning may not be minimal.)
///
/// Byte classes are used internally by this crate when building DFAs.
/// Namely, among other optimizations, they enable a space optimization
/// where the DFA's internal alphabet is defined over the equivalence
/// classes of bytes instead of all possible byte values. The former is
/// often quite a bit smaller than the latter, which permits the DFA to use
/// less space for its transition table.
///
/// # Example
///
/// This example shows how to query the class of various bytes.
///
/// ```
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa = NFA::new("[a-z]+")?;
/// let classes = nfa.byte_classes();
/// // 'a' and 'z' are in the same class for this regex.
/// assert_eq!(classes.get(b'a'), classes.get(b'z'));
/// // But 'a' and 'A' are not.
/// assert_ne!(classes.get(b'a'), classes.get(b'A'));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn byte_classes(&self) -> &ByteClasses {
    let Self(inner) = self;
    &inner.byte_classes
}

/// Look up a single NFA state by its identifier.
///
/// This is shorthand for `nfa.states()[id]`.
///
/// # Panics
///
/// This panics when the given identifier does not reference a valid state.
/// That is, when `id.as_usize() >= nfa.states().len()`.
///
/// # Example
///
/// The anchored state for a pattern will typically correspond to a
/// capturing state for that pattern. (Although, this is not an API
/// guarantee!)
///
/// ```
/// use regex_automata::{nfa::thompson::{NFA, State}, PatternID};
///
/// let nfa = NFA::new("a")?;
/// let state = nfa.state(nfa.start_pattern(PatternID::ZERO).unwrap());
/// match *state {
///     State::Capture { slot, .. } => {
///         assert_eq!(0, slot.as_usize());
///     }
///     _ => unreachable!("unexpected state"),
/// }
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn state(&self, id: StateID) -> &State {
    let all_states = self.states();
    &all_states[id]
}

/// Returns a slice of all states in this NFA.
///
/// The slice returned is indexed by `StateID`. This provides a convenient
/// way to access states while following transitions among those states.
///
/// # Example
///
/// This demonstrates that disabling UTF-8 mode can shrink the size of the
/// NFA considerably in some cases, especially when using Unicode character
/// classes.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::nfa::thompson::NFA;
///
/// let nfa_unicode = NFA::new(r"\w")?;
/// let nfa_ascii = NFA::new(r"(?-u)\w")?;
/// // Yes, a factor of 45 difference. No lie.
/// assert!(40 * nfa_ascii.states().len() < nfa_unicode.states().len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn states(&self) -> &[State] {
    &self.0.states
}

/// Returns the capturing group info for this NFA.
///
/// The [`GroupInfo`] provides a way to map to and from capture index
/// and capture name for each pattern. It also provides a mapping from
/// each of the capturing groups in every pattern to their corresponding
/// slot offsets encoded in [`State::Capture`] states.
///
/// Note that `GroupInfo` uses reference counting internally, such that
/// cloning a `GroupInfo` is very cheap.
///
/// # Example
///
/// This example shows how to get a list of all capture group names for
/// a particular pattern.
///
/// ```
/// use regex_automata::{nfa::thompson::NFA, PatternID};
///
/// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
/// // The first is the implicit group that is always unnamed. The next
/// // 5 groups are the explicit groups found in the concrete syntax above.
/// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
/// let got: Vec<Option<&str>> =
///     nfa.group_info().pattern_names(PatternID::ZERO).collect();
/// assert_eq!(expected, got);
///
/// // Using an invalid pattern ID will result in nothing yielded.
/// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
/// assert_eq!(0, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn group_info(&self) -> &GroupInfo {
    // The inner accessor already hands back a reference, so borrowing its
    // result again would only add a needless extra layer of indirection.
    self.0.group_info()
}

/// Returns true if and only if this NFA has at least one
/// [`Capture`](State::Capture) in its sequence of states.
///
/// This is useful as a way to perform a quick test before attempting
/// something that does or does not require capture states.
For example, /// some regex engines (like the PikeVM) require capture states in order to /// work at all. /// /// # Example /// /// This example shows a few different NFAs and whether they have captures /// or not. /// /// ``` /// use regex_automata::nfa::thompson::{NFA, WhichCaptures}; /// /// // Obviously has capture states. /// let nfa = NFA::new("(a)")?; /// assert!(nfa.has_capture()); /// /// // Less obviously has capture states, because every pattern has at /// // least one anonymous capture group corresponding to the match for the /// // entire pattern. /// let nfa = NFA::new("a")?; /// assert!(nfa.has_capture()); /// /// // Other than hand building your own NFA, this is the only way to build /// // an NFA without capturing groups. In general, you should only do this /// // if you don't intend to use any of the NFA-oriented regex engines. /// // Overall, capturing groups don't have many downsides. Although they /// // can add a bit of noise to simple NFAs, so it can be nice to disable /// // them for debugging purposes. /// // /// // Notice that 'has_capture' is false here even when we have an /// // explicit capture group in the pattern. /// let nfa = NFA::compiler() /// .configure(NFA::config().which_captures(WhichCaptures::None)) /// .build("(a)")?; /// assert!(!nfa.has_capture()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn has_capture(&self) -> bool { self.0.has_capture } /// Returns true if and only if this NFA can match the empty string. /// When it returns false, all possible matches are guaranteed to have a /// non-zero length. /// /// This is useful as cheap way to know whether code needs to handle the /// case of a zero length match. This is particularly important when UTF-8 /// modes are enabled, as when UTF-8 mode is enabled, empty matches that /// split a codepoint must never be reported. 
This extra handling can /// sometimes be costly, and since regexes matching an empty string are /// somewhat rare, it can be beneficial to treat such regexes specially. /// /// # Example /// /// This example shows a few different NFAs and whether they match the /// empty string or not. Notice the empty string isn't merely a matter /// of a string of length literally `0`, but rather, whether a match can /// occur between specific pairs of bytes. /// /// ``` /// use regex_automata::{nfa::thompson::NFA, util::syntax}; /// /// // The empty regex matches the empty string. /// let nfa = NFA::new("")?; /// assert!(nfa.has_empty(), "empty matches empty"); /// // The '+' repetition operator requires at least one match, and so /// // does not match the empty string. /// let nfa = NFA::new("a+")?; /// assert!(!nfa.has_empty(), "+ does not match empty"); /// // But the '*' repetition operator does. /// let nfa = NFA::new("a*")?; /// assert!(nfa.has_empty(), "* does match empty"); /// // And wrapping '+' in an operator that can match an empty string also /// // causes it to match the empty string too. /// let nfa = NFA::new("(a+)*")?; /// assert!(nfa.has_empty(), "+ inside of * matches empty"); /// /// // If a regex is just made of a look-around assertion, even if the /// // assertion requires some kind of non-empty string around it (such as /// // \b), then it is still treated as if it matches the empty string. /// // Namely, if a match occurs of just a look-around assertion, then the /// // match returned is empty. /// let nfa = NFA::compiler() /// .syntax(syntax::Config::new().utf8(false)) /// .build(r"^$\A\z\b\B(?-u:\b\B)")?; /// assert!(nfa.has_empty(), "assertions match empty"); /// // Even when an assertion is wrapped in a '+', it still matches the /// // empty string. 
/// let nfa = NFA::new(r"\b+")?; /// assert!(nfa.has_empty(), "+ of an assertion matches empty"); /// /// // An alternation with even one branch that can match the empty string /// // is also said to match the empty string overall. /// let nfa = NFA::new("foo|(bar)?|quux")?; /// assert!(nfa.has_empty(), "alternations can match empty"); /// /// // An NFA that matches nothing does not match the empty string. /// let nfa = NFA::new("[a&&b]")?; /// assert!(!nfa.has_empty(), "never matching means not matching empty"); /// // But if it's wrapped in something that doesn't require a match at /// // all, then it can match the empty string! /// let nfa = NFA::new("[a&&b]*")?; /// assert!(nfa.has_empty(), "* on never-match still matches empty"); /// // Since a '+' requires a match, using it on something that can never /// // match will itself produce a regex that can never match anything, /// // and thus does not match the empty string. /// let nfa = NFA::new("[a&&b]+")?; /// assert!(!nfa.has_empty(), "+ on never-match still matches nothing"); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn has_empty(&self) -> bool { self.0.has_empty } /// Whether UTF-8 mode is enabled for this NFA or not. /// /// When UTF-8 mode is enabled, all matches reported by a regex engine /// derived from this NFA are guaranteed to correspond to spans of valid /// UTF-8. This includes zero-width matches. For example, the regex engine /// must guarantee that the empty regex will not match at the positions /// between code units in the UTF-8 encoding of a single codepoint. /// /// See [`Config::utf8`] for more information. /// /// This is enabled by default. /// /// # Example /// /// This example shows how UTF-8 mode can impact the match spans that may /// be reported in certain cases. 
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{self, pikevm::PikeVM},
///     Match, Input,
/// };
///
/// let re = PikeVM::new("")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// // UTF-8 mode is enabled by default.
/// let mut input = Input::new("☃");
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(Some(Match::must(0, 0..0)), caps.get_match());
///
/// // Even though an empty regex matches at 1..1, our next match is
/// // 3..3 because 1..1 and 2..2 split the snowman codepoint (which is
/// // three bytes long).
/// input.set_start(1);
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
///
/// // But if we disable UTF-8, then we'll get matches at 1..1 and 2..2:
/// let re = PikeVM::builder()
///     .thompson(thompson::Config::new().utf8(false))
///     .build("")?;
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(Some(Match::must(0, 1..1)), caps.get_match());
///
/// input.set_start(2);
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(Some(Match::must(0, 2..2)), caps.get_match());
///
/// input.set_start(3);
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(Some(Match::must(0, 3..3)), caps.get_match());
///
/// input.set_start(4);
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(None, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn is_utf8(&self) -> bool {
    let Self(inner) = self;
    inner.utf8
}

/// Returns true when this NFA is meant to be matched in reverse.
///
/// Generally speaking, when this is true, it means the NFA is supposed to
/// be used in conjunction with moving backwards through the haystack. That
/// is, from a higher memory address to a lower memory address.
///
/// It is often the case that lower level routines dealing with an NFA
/// don't need to care about whether it is "meant" to be matched in reverse
/// or not. However, there are some specific cases where it matters.
For /// example, the implementation of CRLF-aware `^` and `$` line anchors /// needs to know whether the search is in the forward or reverse /// direction. In the forward direction, neither `^` nor `$` should match /// when a `\r` has been seen previously and a `\n` is next. However, in /// the reverse direction, neither `^` nor `$` should match when a `\n` /// has been seen previously and a `\r` is next. This fundamentally changes /// how the state machine is constructed, and thus needs to be altered /// based on the direction of the search. /// /// This is automatically set when using a [`Compiler`] with a configuration /// where [`Config::reverse`] is enabled. If you're building your own NFA /// by hand via a [`Builder`] #[inline] pub fn is_reverse(&self) -> bool { self.0.reverse } /// Returns true if and only if all starting states for this NFA correspond /// to the beginning of an anchored search. /// /// Typically, an NFA will have both an anchored and an unanchored starting /// state. Namely, because it tends to be useful to have both and the cost /// of having an unanchored starting state is almost zero (for an NFA). /// However, if all patterns in the NFA are themselves anchored, then even /// the unanchored starting state will correspond to an anchored search /// since the pattern doesn't permit anything else. /// /// # Example /// /// This example shows a few different scenarios where this method's /// return value varies. /// /// ``` /// use regex_automata::nfa::thompson::NFA; /// /// // The unanchored starting state permits matching this pattern anywhere /// // in a haystack, instead of just at the beginning. /// let nfa = NFA::new("a")?; /// assert!(!nfa.is_always_start_anchored()); /// /// // In this case, the pattern is itself anchored, so there is no way /// // to run an unanchored search. 
/// let nfa = NFA::new("^a")?;
/// assert!(nfa.is_always_start_anchored());
///
/// // When multiline mode is enabled, '^' can match at the start of a line
/// // in addition to the start of a haystack, so an unanchored search is
/// // actually possible.
/// let nfa = NFA::new("(?m)^a")?;
/// assert!(!nfa.is_always_start_anchored());
///
/// // Weird cases also work. A pattern is only considered anchored if all
/// // matches may only occur at the start of a haystack.
/// let nfa = NFA::new("(^a)|a")?;
/// assert!(!nfa.is_always_start_anchored());
///
/// // When multiple patterns are present, if they are all anchored, then
/// // the NFA is always anchored too.
/// let nfa = NFA::new_many(&["^a", "^b", "^c"])?;
/// assert!(nfa.is_always_start_anchored());
///
/// // But if one pattern is unanchored, then the NFA must permit an
/// // unanchored search.
/// let nfa = NFA::new_many(&["^a", "b", "^c"])?;
/// assert!(!nfa.is_always_start_anchored());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn is_always_start_anchored(&self) -> bool {
    // The NFA is "always anchored" exactly when its two starting states
    // are one and the same state.
    let anchored = self.start_anchored();
    let unanchored = self.start_unanchored();
    anchored == unanchored
}

/// Returns the look-around matcher associated with this NFA.
///
/// A look-around matcher determines how to match look-around assertions.
/// In particular, some assertions are configurable. For example, the
/// `(?m:^)` and `(?m:$)` assertions can have their line terminator changed
/// from the default of `\n` to any other byte.
///
/// If the NFA was built using a [`Compiler`], then this matcher
/// can be set via the [`Config::look_matcher`] configuration
/// knob. Otherwise, if you've built an NFA by hand, it is set via
/// [`Builder::set_look_matcher`].
///
/// # Example
///
/// This shows how to change the line terminator for multi-line assertions.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{self, pikevm::PikeVM},
///     util::look::LookMatcher,
///     Match, Input,
/// };
///
/// let mut lookm = LookMatcher::new();
/// lookm.set_line_terminator(b'\x00');
///
/// let re = PikeVM::builder()
///     .thompson(thompson::Config::new().look_matcher(lookm))
///     .build(r"(?m)^[a-z]+$")?;
/// let mut cache = re.create_cache();
///
/// // Multi-line assertions now use NUL as a terminator.
/// assert_eq!(
///     Some(Match::must(0, 1..4)),
///     re.find(&mut cache, b"\x00abc\x00"),
/// );
/// // ... and \n is no longer recognized as a terminator.
/// assert_eq!(
///     None,
///     re.find(&mut cache, b"\nabc\n"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn look_matcher(&self) -> &LookMatcher {
    let Self(inner) = self;
    &inner.look_matcher
}

/// Returns the union of all look-around assertions used throughout this
/// NFA. When the returned set is empty, it implies that the NFA has no
/// look-around assertions and thus zero conditional epsilon transitions.
///
/// This is useful in some cases enabling optimizations. It is not
/// unusual, for example, for optimizations to be of the form, "for any
/// regex with zero conditional epsilon transitions, do ..." where "..."
/// is some kind of optimization.
///
/// This isn't only helpful for optimizations either. Sometimes look-around
/// assertions are difficult to support. For example, many of the DFAs in
/// this crate don't support Unicode word boundaries or handle them using
/// heuristics. Handling that correctly typically requires some kind of
/// cheap check of whether the NFA has a Unicode word boundary in the first
/// place.
///
/// # Example
///
/// This example shows how this routine varies based on the regex pattern:
///
/// ```
/// use regex_automata::{nfa::thompson::NFA, util::look::Look};
///
/// // No look-around at all.
/// let nfa = NFA::new("a")?; /// assert!(nfa.look_set_any().is_empty()); /// /// // When multiple patterns are present, since this returns the union, /// // it will include look-around assertions that only appear in one /// // pattern. /// let nfa = NFA::new_many(&["a", "b", "a^b", "c"])?; /// assert!(nfa.look_set_any().contains(Look::Start)); /// /// // Some groups of assertions have various shortcuts. For example: /// let nfa = NFA::new(r"(?-u:\b)")?; /// assert!(nfa.look_set_any().contains_word()); /// assert!(!nfa.look_set_any().contains_word_unicode()); /// assert!(nfa.look_set_any().contains_word_ascii()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn look_set_any(&self) -> LookSet { self.0.look_set_any } /// Returns the union of all prefix look-around assertions for every /// pattern in this NFA. When the returned set is empty, it implies none of /// the patterns require moving through a conditional epsilon transition /// before inspecting the first byte in the haystack. /// /// This can be useful for determining what kinds of assertions need to be /// satisfied at the beginning of a search. For example, typically DFAs /// in this crate will build a distinct starting state for each possible /// starting configuration that might result in look-around assertions /// being satisfied differently. However, if the set returned here is /// empty, then you know that the start state is invariant because there /// are no conditional epsilon transitions to consider. /// /// # Example /// /// This example shows how this routine varies based on the regex pattern: /// /// ``` /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; /// /// // No look-around at all. /// let nfa = NFA::new("a")?; /// assert!(nfa.look_set_prefix_any().is_empty()); /// /// // When multiple patterns are present, since this returns the union, /// // it will include look-around assertions that only appear in one /// // pattern. 
But it will only include assertions that are in the prefix /// // of a pattern. For example, this includes '^' but not '$' even though /// // '$' does appear. /// let nfa = NFA::new_many(&["a", "b", "^ab$", "c"])?; /// assert!(nfa.look_set_prefix_any().contains(Look::Start)); /// assert!(!nfa.look_set_prefix_any().contains(Look::End)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn look_set_prefix_any(&self) -> LookSet { self.0.look_set_prefix_any } // FIXME: The `look_set_prefix_all` computation was not correct, and it // seemed a little tricky to fix it. Since I wasn't actually using it for // anything, I just decided to remove it in the run up to the regex 1.9 // release. If you need this, please file an issue. /* /// Returns the intersection of all prefix look-around assertions for every /// pattern in this NFA. When the returned set is empty, it implies at /// least one of the patterns does not require moving through a conditional /// epsilon transition before inspecting the first byte in the haystack. /// Conversely, when the set contains an assertion, it implies that every /// pattern in the NFA also contains that assertion in its prefix. /// /// This can be useful for determining what kinds of assertions need to be /// satisfied at the beginning of a search. For example, if you know that /// [`Look::Start`] is in the prefix intersection set returned here, then /// you know that all searches, regardless of input configuration, will be /// anchored. /// /// # Example /// /// This example shows how this routine varies based on the regex pattern: /// /// ``` /// use regex_automata::{nfa::thompson::NFA, util::look::Look}; /// /// // No look-around at all. /// let nfa = NFA::new("a")?; /// assert!(nfa.look_set_prefix_all().is_empty()); /// /// // When multiple patterns are present, since this returns the /// // intersection, it will only include assertions present in every /// // prefix, and only the prefix. 
/// let nfa = NFA::new_many(&["^a$", "^b$", "$^ab$", "^c$"])?; /// assert!(nfa.look_set_prefix_all().contains(Look::Start)); /// assert!(!nfa.look_set_prefix_all().contains(Look::End)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn look_set_prefix_all(&self) -> LookSet { self.0.look_set_prefix_all } */ /// Returns the memory usage, in bytes, of this NFA. /// /// This does **not** include the stack size used up by this NFA. To /// compute that, use `std::mem::size_of::()`. /// /// # Example /// /// This example shows that large Unicode character classes can use quite /// a bit of memory. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::nfa::thompson::NFA; /// /// let nfa_unicode = NFA::new(r"\w")?; /// let nfa_ascii = NFA::new(r"(?-u:\w)")?; /// /// assert!(10 * nfa_ascii.memory_usage() < nfa_unicode.memory_usage()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn memory_usage(&self) -> usize { use core::mem::size_of; size_of::() // allocated on the heap via Arc + self.0.states.len() * size_of::() + self.0.start_pattern.len() * size_of::() + self.0.group_info.memory_usage() + self.0.memory_extra } } impl fmt::Debug for NFA { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { self.0.fmt(f) } } /// The "inner" part of the NFA. We split this part out so that we can easily /// wrap it in an `Arc` above in the definition of `NFA`. /// /// See builder.rs for the code that actually builds this type. This module /// does provide (internal) mutable methods for adding things to this /// NFA before finalizing it, but the high level construction process is /// controlled by the builder abstraction. (Which is complicated enough to /// get its own module.) #[derive(Default)] pub(super) struct Inner { /// The state sequence. This sequence is guaranteed to be indexable by all /// starting state IDs, and it is also guaranteed to contain at most one /// `Match` state for each pattern compiled into this NFA. 
(A pattern may /// not have a corresponding `Match` state if a `Match` state is impossible /// to reach.) states: Vec, /// The anchored starting state of this NFA. start_anchored: StateID, /// The unanchored starting state of this NFA. start_unanchored: StateID, /// The starting states for each individual pattern. Starting at any /// of these states will result in only an anchored search for the /// corresponding pattern. The vec is indexed by pattern ID. When the NFA /// contains a single regex, then `start_pattern[0]` and `start_anchored` /// are always equivalent. start_pattern: Vec, /// Info about the capturing groups in this NFA. This is responsible for /// mapping groups to slots, mapping groups to names and names to groups. group_info: GroupInfo, /// A representation of equivalence classes over the transitions in this /// NFA. Two bytes in the same equivalence class must not discriminate /// between a match or a non-match. This map can be used to shrink the /// total size of a DFA's transition table with a small match-time cost. /// /// Note that the NFA's transitions are *not* defined in terms of these /// equivalence classes. The NFA's transitions are defined on the original /// byte values. For the most part, this is because they wouldn't really /// help the NFA much since the NFA already uses a sparse representation /// to represent transitions. Byte classes are most effective in a dense /// representation. byte_class_set: ByteClassSet, /// This is generated from `byte_class_set`, and essentially represents the /// same thing but supports different access patterns. Namely, this permits /// looking up the equivalence class of a byte very cheaply. /// /// Ideally we would just store this, but because of annoying code /// structure reasons, we keep both this and `byte_class_set` around for /// now. I think I would prefer that `byte_class_set` were computed in the /// `Builder`, but right now, we compute it as states are added to the /// `NFA`. 
byte_classes: ByteClasses, /// Whether this NFA has a `Capture` state anywhere. has_capture: bool, /// When the empty string is in the language matched by this NFA. has_empty: bool, /// Whether UTF-8 mode is enabled for this NFA. Briefly, this means that /// all non-empty matches produced by this NFA correspond to spans of valid /// UTF-8, and any empty matches produced by this NFA that split a UTF-8 /// encoded codepoint should be filtered out by the corresponding regex /// engine. utf8: bool, /// Whether this NFA is meant to be matched in reverse or not. reverse: bool, /// The matcher to be used for look-around assertions. look_matcher: LookMatcher, /// The union of all look-around assertions that occur anywhere within /// this NFA. If this set is empty, then it means there are precisely zero /// conditional epsilon transitions in the NFA. look_set_any: LookSet, /// The union of all look-around assertions that occur as a zero-length /// prefix for any of the patterns in this NFA. look_set_prefix_any: LookSet, /* /// The intersection of all look-around assertions that occur as a /// zero-length prefix for any of the patterns in this NFA. look_set_prefix_all: LookSet, */ /// Heap memory used indirectly by NFA states and other things (like the /// various capturing group representations above). Since each state /// might use a different amount of heap, we need to keep track of this /// incrementally. memory_extra: usize, } impl Inner { /// Runs any last finalization bits and turns this into a full NFA. pub(super) fn into_nfa(mut self) -> NFA { self.byte_classes = self.byte_class_set.byte_classes(); // Do epsilon closure from the start state of every pattern in order // to compute various properties such as look-around assertions and // whether the empty string can be matched. 
let mut stack = vec![]; let mut seen = SparseSet::new(self.states.len()); for &start_id in self.start_pattern.iter() { stack.push(start_id); seen.clear(); // let mut prefix_all = LookSet::full(); let mut prefix_any = LookSet::empty(); while let Some(sid) = stack.pop() { if !seen.insert(sid) { continue; } match self.states[sid] { State::ByteRange { .. } | State::Dense { .. } | State::Fail => continue, State::Sparse(_) => { // This snippet below will rewrite this sparse state // as a dense state. By doing it here, we apply this // optimization to all hot "sparse" states since these // are the states that are reachable from the start // state via an epsilon closure. // // Unfortunately, this optimization did not seem to // help much in some very limited ad hoc benchmarking. // // I left the 'Dense' state type in place in case we // want to revisit this, but I suspect the real way // to make forward progress is a more fundamental // rearchitecting of how data in the NFA is laid out. // I think we should consider a single contiguous // allocation instead of all this indirection and // potential heap allocations for every state. But this // is a large re-design and will require API breaking // changes. // self.memory_extra -= self.states[sid].memory_usage(); // let trans = DenseTransitions::from_sparse(sparse); // self.states[sid] = State::Dense(trans); // self.memory_extra += self.states[sid].memory_usage(); continue; } State::Match { .. } => self.has_empty = true, State::Look { look, next } => { prefix_any = prefix_any.insert(look); stack.push(next); } State::Union { ref alternates } => { // Order doesn't matter here, since we're just dealing // with look-around sets. But if we do richer analysis // here that needs to care about preference order, then // this should be done in reverse. stack.extend(alternates.iter()); } State::BinaryUnion { alt1, alt2 } => { stack.push(alt2); stack.push(alt1); } State::Capture { next, .. 
} => { stack.push(next); } } } self.look_set_prefix_any = self.look_set_prefix_any.union(prefix_any); } NFA(Arc::new(self)) } /// Returns the capturing group info for this NFA. pub(super) fn group_info(&self) -> &GroupInfo { &self.group_info } /// Add the given state to this NFA after allocating a fresh identifier for /// it. /// /// This panics if too many states are added such that a fresh identifier /// could not be created. (Currently, the only caller of this routine is /// a `Builder`, and it upholds this invariant.) pub(super) fn add(&mut self, state: State) -> StateID { match state { State::ByteRange { ref trans } => { self.byte_class_set.set_range(trans.start, trans.end); } State::Sparse(ref sparse) => { for trans in sparse.transitions.iter() { self.byte_class_set.set_range(trans.start, trans.end); } } State::Dense { .. } => unreachable!(), State::Look { look, .. } => { self.look_matcher .add_to_byteset(look, &mut self.byte_class_set); self.look_set_any = self.look_set_any.insert(look); } State::Capture { .. } => { self.has_capture = true; } State::Union { .. } | State::BinaryUnion { .. } | State::Fail | State::Match { .. } => {} } let id = StateID::new(self.states.len()).unwrap(); self.memory_extra += state.memory_usage(); self.states.push(state); id } /// Set the starting state identifiers for this NFA. /// /// `start_anchored` and `start_unanchored` may be equivalent. When they /// are, then the NFA can only execute anchored searches. This might /// occur, for example, for patterns that are unconditionally anchored. /// e.g., `^foo`. pub(super) fn set_starts( &mut self, start_anchored: StateID, start_unanchored: StateID, start_pattern: &[StateID], ) { self.start_anchored = start_anchored; self.start_unanchored = start_unanchored; self.start_pattern = start_pattern.to_vec(); } /// Sets the UTF-8 mode of this NFA. pub(super) fn set_utf8(&mut self, yes: bool) { self.utf8 = yes; } /// Sets the reverse mode of this NFA. 
pub(super) fn set_reverse(&mut self, yes: bool) {
        self.reverse = yes;
    }

    /// Sets the look-around assertion matcher for this NFA.
    pub(super) fn set_look_matcher(&mut self, m: LookMatcher) {
        self.look_matcher = m;
    }

    /// Set the capturing groups for this NFA.
    ///
    /// The given slice should contain the capturing groups for each pattern,
    /// The capturing groups in turn should correspond to the total number of
    /// capturing groups in the pattern, including the anonymous first capture
    /// group for each pattern. If a capturing group does have a name, then it
    /// should be provided as an `Arc<str>`.
    ///
    /// This returns an error if a corresponding `GroupInfo` could not be
    /// built.
    pub(super) fn set_captures(
        &mut self,
        captures: &[Vec<Option<Arc<str>>>],
    ) -> Result<(), GroupInfoError> {
        self.group_info = GroupInfo::new(
            captures.iter().map(|x| x.iter().map(|y| y.as_ref())),
        )?;
        Ok(())
    }

    /// Remap the transitions in every state of this NFA using the given map.
    /// The given map should be indexed according to state ID namespace used by
    /// the transitions of the states currently in this NFA.
    ///
    /// This is particularly useful to the NFA builder, since it is convenient
    /// to add NFA states in order to produce their final IDs. Then, after all
    /// of the intermediate "empty" states (unconditional epsilon transitions)
    /// have been removed from the builder's representation, we can re-map all
    /// of the transitions in the states already added to their final IDs.
pub(super) fn remap(&mut self, old_to_new: &[StateID]) {
        for state in &mut self.states {
            state.remap(old_to_new);
        }
        self.start_anchored = old_to_new[self.start_anchored];
        self.start_unanchored = old_to_new[self.start_unanchored];
        for id in self.start_pattern.iter_mut() {
            *id = old_to_new[*id];
        }
    }
}

impl fmt::Debug for Inner {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "thompson::NFA(")?;
        for (sid, state) in self.states.iter().with_state_ids() {
            // Mark the anchored start state with '^' and the unanchored start
            // state with '>'. When both are the same state, '^' wins.
            let status = if sid == self.start_anchored {
                '^'
            } else if sid == self.start_unanchored {
                '>'
            } else {
                ' '
            };
            writeln!(f, "{}{:06?}: {:?}", status, sid.as_usize(), state)?;
        }
        // Per-pattern start states are only listed when there is more than
        // one pattern.
        if self.start_pattern.len() > 1 {
            writeln!(f)?;
            for (pid, sid) in self.start_pattern.iter().enumerate() {
                writeln!(f, "START({:06?}): {:?}", pid, sid.as_usize())?;
            }
        }
        writeln!(f)?;
        writeln!(
            f,
            "transition equivalence classes: {:?}",
            self.byte_classes,
        )?;
        writeln!(f, ")")?;
        Ok(())
    }
}

/// A state in an NFA.
///
/// In theory, it can help to conceptualize an `NFA` as a graph consisting of
/// `State`s. Each `State` contains its complete set of outgoing transitions.
///
/// In practice, it can help to conceptualize an `NFA` as a sequence of
/// instructions for a virtual machine. Each `State` says what to do and where
/// to go next.
///
/// Strictly speaking, the practical interpretation is the most correct one,
/// because of the [`Capture`](State::Capture) state. Namely, a `Capture`
/// state always forwards execution to another state unconditionally. Its only
/// purpose is to cause a side effect: the recording of the current input
/// position at a particular location in memory. In this sense, an `NFA`
/// has more power than a theoretical non-deterministic finite automaton.
///
/// For most uses of this crate, it is likely that one may never even need to
/// be aware of this type at all. The main use cases for looking at `State`s
/// directly are if you need to write your own search implementation or if you
/// need to do some kind of analysis on the NFA.
#[derive(Clone, Eq, PartialEq)]
pub enum State {
    /// A state with a single transition that can only be taken if the current
    /// input symbol is in a particular range of bytes.
    ByteRange {
        /// The transition from this state to the next.
        trans: Transition,
    },
    /// A state with possibly many transitions represented in a sparse fashion.
    /// Transitions are non-overlapping and ordered lexicographically by input
    /// range.
    ///
    /// In practice, this is used for encoding UTF-8 automata. Its presence is
    /// primarily an optimization that avoids many additional unconditional
    /// epsilon transitions (via [`Union`](State::Union) states), and thus
    /// decreases the overhead of traversing the NFA. This can improve both
    /// matching time and DFA construction time.
    Sparse(SparseTransitions),
    /// A dense representation of a state with multiple transitions.
    Dense(DenseTransitions),
    /// A conditional epsilon transition satisfied via some sort of
    /// look-around. Look-around is limited to anchor and word boundary
    /// assertions.
    ///
    /// Look-around states are meant to be evaluated while performing epsilon
    /// closure (computing the set of states reachable from a particular state
    /// via only epsilon transitions). If the current position in the haystack
    /// satisfies the look-around assertion, then you're permitted to follow
    /// that epsilon transition.
    Look {
        /// The look-around assertion that must be satisfied before moving
        /// to `next`.
        look: Look,
        /// The state to transition to if the look-around assertion is
        /// satisfied.
        next: StateID,
    },
    /// An alternation such that there exists an epsilon transition to all
    /// states in `alternates`, where matches found via earlier transitions
    /// are preferred over later transitions.
    Union {
        /// An ordered sequence of unconditional epsilon transitions to other
        /// states. Transitions earlier in the sequence are preferred over
        /// transitions later in the sequence.
        alternates: Box<[StateID]>,
    },
    /// An alternation such that there exists precisely two unconditional
    /// epsilon transitions, where matches found via `alt1` are preferred over
    /// matches found via `alt2`.
    ///
    /// This state exists as a common special case of Union where there are
    /// only two alternates. In this case, we don't need any allocations to
    /// represent the state. This saves a bit of memory and also saves an
    /// additional memory access when traversing the NFA.
    BinaryUnion {
        /// An unconditional epsilon transition to another NFA state. This
        /// is preferred over `alt2`.
        alt1: StateID,
        /// An unconditional epsilon transition to another NFA state. Matches
        /// reported via this transition should only be reported if no matches
        /// were found by following `alt1`.
        alt2: StateID,
    },
    /// An empty state that records a capture location.
    ///
    /// From the perspective of finite automata, this is precisely equivalent
    /// to an unconditional epsilon transition, but serves the purpose of
    /// instructing NFA simulations to record additional state when the finite
    /// state machine passes through this epsilon transition.
    ///
    /// `slot` in this context refers to the specific capture group slot
    /// offset that is being recorded. Each capturing group has two slots
    /// corresponding to the start and end of the matching portion of that
    /// group.
    ///
    /// The pattern ID and capture group index are also included in this state
    /// in case they are useful. But mostly, all you'll need is `next` and
    /// `slot`.
    Capture {
        /// The state to transition to, unconditionally.
        next: StateID,
        /// The pattern ID that this capture belongs to.
        pattern_id: PatternID,
        /// The capture group index that this capture belongs to. Capture group
        /// indices are local to each pattern. For example, when capturing
        /// groups are enabled, every pattern has a capture group at index
        /// `0`.
        group_index: SmallIndex,
        /// The slot index for this capture. Every capturing group has two
        /// slots: one for the start haystack offset and one for the end
        /// haystack offset. Unlike capture group indices, slot indices are
        /// global across all patterns in this NFA. That is, each slot belongs
        /// to a single pattern, but there is only one slot at index `i`.
        slot: SmallIndex,
    },
    /// A state that cannot be transitioned out of. This is useful for cases
    /// where you want to prevent matching from occurring. For example, if your
    /// regex parser permits empty character classes, then one could choose
    /// a `Fail` state to represent them. (An empty character class can be
    /// thought of as an empty set. Since nothing is in an empty set, they can
    /// never match anything.)
    Fail,
    /// A match state. There is at least one such occurrence of this state for
    /// each regex that can match that is in this NFA.
    Match {
        /// The matching pattern ID.
        pattern_id: PatternID,
    },
}

impl State {
    /// Returns true if and only if this state contains one or more epsilon
    /// transitions.
    ///
    /// In practice, a state has no outgoing transitions (like `Match`), has
    /// only non-epsilon transitions (like `ByteRange`) or has only epsilon
    /// transitions (like `Union`).
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::{State, Transition},
    ///     util::primitives::{PatternID, StateID, SmallIndex},
    /// };
    ///
    /// // Capture states are epsilon transitions.
    /// let state = State::Capture {
    ///     next: StateID::ZERO,
    ///     pattern_id: PatternID::ZERO,
    ///     group_index: SmallIndex::ZERO,
    ///     slot: SmallIndex::ZERO,
    /// };
    /// assert!(state.is_epsilon());
    ///
    /// // ByteRange states are not.
    /// let state = State::ByteRange {
    ///     trans: Transition { start: b'a', end: b'z', next: StateID::ZERO },
    /// };
    /// assert!(!state.is_epsilon());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn is_epsilon(&self) -> bool {
        match *self {
            State::ByteRange { .. }
            | State::Sparse { .. }
            | State::Dense { .. }
            | State::Fail
            | State::Match { .. } => false,
            State::Look { .. }
            | State::Union { .. }
            | State::BinaryUnion { .. }
            | State::Capture { .. } => true,
        }
    }

    /// Returns the heap memory usage of this NFA state in bytes.
    fn memory_usage(&self) -> usize {
        match *self {
            State::ByteRange { .. }
            | State::Look { .. }
            | State::BinaryUnion { .. }
            | State::Capture { .. }
            | State::Match { .. }
            | State::Fail => 0,
            State::Sparse(SparseTransitions { ref transitions }) => {
                transitions.len() * mem::size_of::<Transition>()
            }
            State::Dense { .. } => 256 * mem::size_of::<StateID>(),
            State::Union { ref alternates } => {
                alternates.len() * mem::size_of::<StateID>()
            }
        }
    }

    /// Remap the transitions in this state using the given map. Namely, the
    /// given map should be indexed according to the transitions currently
    /// in this state.
    ///
    /// This is used during the final phase of the NFA compiler, which turns
    /// its intermediate NFA into the final NFA.
    fn remap(&mut self, remap: &[StateID]) {
        match *self {
            State::ByteRange { ref mut trans } => {
                trans.next = remap[trans.next]
            }
            State::Sparse(SparseTransitions { ref mut transitions }) => {
                for t in transitions.iter_mut() {
                    t.next = remap[t.next];
                }
            }
            State::Dense(DenseTransitions { ref mut transitions }) => {
                for sid in transitions.iter_mut() {
                    *sid = remap[*sid];
                }
            }
            State::Look { ref mut next, .. } => *next = remap[*next],
            State::Union { ref mut alternates } => {
                for alt in alternates.iter_mut() {
                    *alt = remap[*alt];
                }
            }
            State::BinaryUnion { ref mut alt1, ref mut alt2 } => {
                *alt1 = remap[*alt1];
                *alt2 = remap[*alt2];
            }
            State::Capture { ref mut next, .. } => *next = remap[*next],
            State::Fail => {}
            State::Match { .. } => {}
        }
    }
}

impl fmt::Debug for State {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            State::ByteRange { ref trans } => trans.fmt(f),
            State::Sparse(SparseTransitions { ref transitions }) => {
                let rs = transitions
                    .iter()
                    .map(|t| format!("{:?}", t))
                    .collect::<Vec<String>>()
                    .join(", ");
                write!(f, "sparse({})", rs)
            }
            State::Dense(ref dense) => {
                write!(f, "dense(")?;
                for (i, t) in dense.iter().enumerate() {
                    if i > 0 {
                        write!(f, ", ")?;
                    }
                    write!(f, "{:?}", t)?;
                }
                write!(f, ")")
            }
            State::Look { ref look, next } => {
                write!(f, "{:?} => {:?}", look, next.as_usize())
            }
            State::Union { ref alternates } => {
                let alts = alternates
                    .iter()
                    .map(|id| format!("{:?}", id.as_usize()))
                    .collect::<Vec<String>>()
                    .join(", ");
                write!(f, "union({})", alts)
            }
            State::BinaryUnion { alt1, alt2 } => {
                write!(
                    f,
                    "binary-union({}, {})",
                    alt1.as_usize(),
                    alt2.as_usize()
                )
            }
            State::Capture { next, pattern_id, group_index, slot } => {
                write!(
                    f,
                    "capture(pid={:?}, group={:?}, slot={:?}) => {:?}",
                    pattern_id.as_usize(),
                    group_index.as_usize(),
                    slot.as_usize(),
                    next.as_usize(),
                )
            }
            State::Fail => write!(f, "FAIL"),
            State::Match { pattern_id } => {
                write!(f, "MATCH({:?})", pattern_id.as_usize())
            }
        }
    }
}

/// A sequence of transitions used to represent a sparse state.
///
/// This is the primary representation of a [`Sparse`](State::Sparse) state.
/// It corresponds to a sorted sequence of transitions with non-overlapping
/// byte ranges. If the byte at the current position in the haystack matches
/// one of the byte ranges, then the finite state machine should take the
/// corresponding transition.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct SparseTransitions {
    /// The sorted sequence of non-overlapping transitions.
    pub transitions: Box<[Transition]>,
}

impl SparseTransitions {
    /// This follows the matching transition for a particular byte.
/// /// The matching transition is found by looking for a matching byte /// range (there is at most one) corresponding to the position `at` in /// `haystack`. /// /// If `at >= haystack.len()`, then this returns `None`. #[inline] pub fn matches(&self, haystack: &[u8], at: usize) -> Option { haystack.get(at).and_then(|&b| self.matches_byte(b)) } /// This follows the matching transition for any member of the alphabet. /// /// The matching transition is found by looking for a matching byte /// range (there is at most one) corresponding to the position `at` in /// `haystack`. If the given alphabet unit is [`EOI`](alphabet::Unit::eoi), /// then this always returns `None`. #[inline] pub(crate) fn matches_unit( &self, unit: alphabet::Unit, ) -> Option { unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) } /// This follows the matching transition for a particular byte. /// /// The matching transition is found by looking for a matching byte range /// (there is at most one) corresponding to the byte given. #[inline] pub fn matches_byte(&self, byte: u8) -> Option { for t in self.transitions.iter() { if t.start > byte { break; } else if t.matches_byte(byte) { return Some(t.next); } } None /* // This is an alternative implementation that uses binary search. In // some ad hoc experiments, like // // regex-cli find match pikevm -b -p '\b\w+\b' non-ascii-file // // I could not observe any improvement, and in fact, things seemed to // be a bit slower. I can see an improvement in at least one benchmark: // // regex-cli find match pikevm -b -p '\pL{100}' all-codepoints-utf8 // // Where total search time goes from 3.2s to 2.4s when using binary // search. self.transitions .binary_search_by(|t| { if t.end < byte { core::cmp::Ordering::Less } else if t.start > byte { core::cmp::Ordering::Greater } else { core::cmp::Ordering::Equal } }) .ok() .map(|i| self.transitions[i].next) */ } } /// A sequence of transitions used to represent a dense state. 
/// /// This is the primary representation of a [`Dense`](State::Dense) state. It /// provides constant time matching. That is, given a byte in a haystack and /// a `DenseTransitions`, one can determine if the state matches in constant /// time. /// /// This is in contrast to `SparseTransitions`, whose time complexity is /// necessarily bigger than constant time. Also in contrast, `DenseTransitions` /// usually requires (much) more heap memory. #[derive(Clone, Debug, Eq, PartialEq)] pub struct DenseTransitions { /// A dense representation of this state's transitions on the heap. This /// always has length 256. pub transitions: Box<[StateID]>, } impl DenseTransitions { /// This follows the matching transition for a particular byte. /// /// The matching transition is found by looking for a transition that /// doesn't correspond to `StateID::ZERO` for the byte `at` the given /// position in `haystack`. /// /// If `at >= haystack.len()`, then this returns `None`. #[inline] pub fn matches(&self, haystack: &[u8], at: usize) -> Option { haystack.get(at).and_then(|&b| self.matches_byte(b)) } /// This follows the matching transition for any member of the alphabet. /// /// The matching transition is found by looking for a transition that /// doesn't correspond to `StateID::ZERO` for the byte `at` the given /// position in `haystack`. /// /// If `at >= haystack.len()` or if the given alphabet unit is /// [`EOI`](alphabet::Unit::eoi), then this returns `None`. #[inline] pub(crate) fn matches_unit( &self, unit: alphabet::Unit, ) -> Option { unit.as_u8().map_or(None, |byte| self.matches_byte(byte)) } /// This follows the matching transition for a particular byte. /// /// The matching transition is found by looking for a transition that /// doesn't correspond to `StateID::ZERO` for the given `byte`. /// /// If `at >= haystack.len()`, then this returns `None`. 
#[inline] pub fn matches_byte(&self, byte: u8) -> Option { let next = self.transitions[usize::from(byte)]; if next == StateID::ZERO { None } else { Some(next) } } /* /// The dense state optimization isn't currently enabled, so permit a /// little bit of dead code. pub(crate) fn from_sparse(sparse: &SparseTransitions) -> DenseTransitions { let mut dense = vec![StateID::ZERO; 256]; for t in sparse.transitions.iter() { for b in t.start..=t.end { dense[usize::from(b)] = t.next; } } DenseTransitions { transitions: dense.into_boxed_slice() } } */ /// Returns an iterator over all transitions that don't point to /// `StateID::ZERO`. pub(crate) fn iter(&self) -> impl Iterator + '_ { use crate::util::int::Usize; self.transitions .iter() .enumerate() .filter(|&(_, &sid)| sid != StateID::ZERO) .map(|(byte, &next)| Transition { start: byte.as_u8(), end: byte.as_u8(), next, }) } } /// A single transition to another state. /// /// This transition may only be followed if the current byte in the haystack /// falls in the inclusive range of bytes specified. #[derive(Clone, Copy, Eq, Hash, PartialEq)] pub struct Transition { /// The inclusive start of the byte range. pub start: u8, /// The inclusive end of the byte range. pub end: u8, /// The identifier of the state to transition to. pub next: StateID, } impl Transition { /// Returns true if the position `at` in `haystack` falls in this /// transition's range of bytes. /// /// If `at >= haystack.len()`, then this returns `false`. pub fn matches(&self, haystack: &[u8], at: usize) -> bool { haystack.get(at).map_or(false, |&b| self.matches_byte(b)) } /// Returns true if the given alphabet unit falls in this transition's /// range of bytes. If the given unit is [`EOI`](alphabet::Unit::eoi), then /// this returns `false`. pub fn matches_unit(&self, unit: alphabet::Unit) -> bool { unit.as_u8().map_or(false, |byte| self.matches_byte(byte)) } /// Returns true if the given byte falls in this transition's range of /// bytes. 
pub fn matches_byte(&self, byte: u8) -> bool { self.start <= byte && byte <= self.end } } impl fmt::Debug for Transition { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { use crate::util::escape::DebugByte; let Transition { start, end, next } = *self; if self.start == self.end { write!(f, "{:?} => {:?}", DebugByte(start), next.as_usize()) } else { write!( f, "{:?}-{:?} => {:?}", DebugByte(start), DebugByte(end), next.as_usize(), ) } } } /// An iterator over all pattern IDs in an NFA. /// /// This iterator is created by [`NFA::patterns`]. /// /// The lifetime parameter `'a` refers to the lifetime of the NFA from which /// this pattern iterator was created. #[derive(Debug)] pub struct PatternIter<'a> { it: PatternIDIter, /// We explicitly associate a lifetime with this iterator even though we /// don't actually borrow anything from the NFA. We do this for backward /// compatibility purposes. If we ever do need to borrow something from /// the NFA, then we can and just get rid of this marker without breaking /// the public API. _marker: core::marker::PhantomData<&'a ()>, } impl<'a> Iterator for PatternIter<'a> { type Item = PatternID; fn next(&mut self) -> Option { self.it.next() } } #[cfg(all(test, feature = "nfa-pikevm"))] mod tests { use super::*; use crate::{nfa::thompson::pikevm::PikeVM, Input}; // This asserts that an NFA state doesn't have its size changed. It is // *really* easy to accidentally increase the size, and thus potentially // dramatically increase the memory usage of every NFA. // // This assert doesn't mean we absolutely cannot increase the size of an // NFA state. We can. It's just here to make sure we do it knowingly and // intentionally. 
#[test] fn state_has_small_size() { #[cfg(target_pointer_width = "64")] assert_eq!(24, core::mem::size_of::()); #[cfg(target_pointer_width = "32")] assert_eq!(20, core::mem::size_of::()); } #[test] fn always_match() { let re = PikeVM::new_from_nfa(NFA::always_match()).unwrap(); let mut cache = re.create_cache(); let mut caps = re.create_captures(); let mut find = |haystack, start, end| { let input = Input::new(haystack).range(start..end); re.search(&mut cache, &input, &mut caps); caps.get_match().map(|m| m.end()) }; assert_eq!(Some(0), find("", 0, 0)); assert_eq!(Some(0), find("a", 0, 1)); assert_eq!(Some(1), find("a", 1, 1)); assert_eq!(Some(0), find("ab", 0, 2)); assert_eq!(Some(1), find("ab", 1, 2)); assert_eq!(Some(2), find("ab", 2, 2)); } #[test] fn never_match() { let re = PikeVM::new_from_nfa(NFA::never_match()).unwrap(); let mut cache = re.create_cache(); let mut caps = re.create_captures(); let mut find = |haystack, start, end| { let input = Input::new(haystack).range(start..end); re.search(&mut cache, &input, &mut caps); caps.get_match().map(|m| m.end()) }; assert_eq!(None, find("", 0, 0)); assert_eq!(None, find("a", 0, 1)); assert_eq!(None, find("a", 1, 1)); assert_eq!(None, find("ab", 0, 2)); assert_eq!(None, find("ab", 1, 2)); assert_eq!(None, find("ab", 2, 2)); } } regex-automata-0.4.9/src/nfa/thompson/pikevm.rs000064400000000000000000002757171046102023000176470ustar 00000000000000/*! An NFA backed Pike VM for executing regex searches with capturing groups. This module provides a [`PikeVM`] that works by simulating an NFA and resolving all spans of capturing groups that participate in a match. 
*/

#[cfg(feature = "internal-instrument-pikevm")]
use core::cell::RefCell;

use alloc::{vec, vec::Vec};

use crate::{
    nfa::thompson::{self, BuildError, State, NFA},
    util::{
        captures::Captures,
        empty, iter,
        prefilter::Prefilter,
        primitives::{NonMaxUsize, PatternID, SmallIndex, StateID},
        search::{
            Anchored, HalfMatch, Input, Match, MatchKind, PatternSet, Span,
        },
        sparse_set::SparseSet,
    },
};

/// A simple macro for conditionally executing instrumentation logic when
/// the 'trace' log level is enabled. This is a compile-time no-op when the
/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that
/// this makes it easier to avoid doing extra work when instrumentation isn't
/// enabled.
///
/// This macro accepts a closure of type `|&mut Counters|`. The closure can
/// then increment counters (or whatever) in accordance with what one wants
/// to track.
macro_rules! instrument {
    ($fun:expr) => {
        #[cfg(feature = "internal-instrument-pikevm")]
        {
            let fun: &mut dyn FnMut(&mut Counters) = &mut $fun;
            COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut()));
        }
    };
}

#[cfg(feature = "internal-instrument-pikevm")]
std::thread_local! {
    /// Effectively global state used to keep track of instrumentation
    /// counters. The "proper" way to do this is to thread it through the
    /// PikeVM, but it makes the code quite icky. Since this is just a
    /// debugging feature, we're content to relegate it to thread local
    /// state. When instrumentation is enabled, the counters are reset at the
    /// beginning of every search and printed (with the 'trace' log level) at
    /// the end of every search.
    static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
}

/// The configuration used for building a [`PikeVM`].
///
/// A PikeVM configuration is a simple data object that is typically used with
/// [`Builder::configure`]. It can be cheaply cloned.
///
/// A default configuration can be created either with `Config::new`, or
/// perhaps more conveniently, with [`PikeVM::config`].
#[derive(Clone, Debug, Default)]
pub struct Config {
    // `None` means "not set"; `get_match_kind` then falls back to
    // `MatchKind::LeftmostFirst`.
    match_kind: Option<MatchKind>,
    // The outer `Option` records whether the option was set at all, and the
    // inner one whether a prefilter is actually present.
    pre: Option<Option<Prefilter>>,
}

impl Config {
    /// Return a new default PikeVM configuration.
    pub fn new() -> Config {
        Config::default()
    }

    /// Set the desired match semantics.
    ///
    /// The default is [`MatchKind::LeftmostFirst`], which corresponds to the
    /// match semantics of Perl-like regex engines. That is, when multiple
    /// patterns would match at the same leftmost position, the pattern that
    /// appears first in the concrete syntax is chosen.
    ///
    /// Currently, the only other kind of match semantics supported is
    /// [`MatchKind::All`]. This corresponds to "classical DFA" construction
    /// where all possible matches are visited in the NFA by the `PikeVM`.
    ///
    /// Typically, `All` is used when one wants to execute an overlapping
    /// search and `LeftmostFirst` otherwise. In particular, it rarely makes
    /// sense to use `All` with the various "leftmost" find routines, since the
    /// leftmost routines depend on the `LeftmostFirst` automata construction
    /// strategy. Specifically, `LeftmostFirst` results in the `PikeVM`
    /// simulating dead states as a way to terminate the search and report a
    /// match. `LeftmostFirst` also supports non-greedy matches using this
    /// strategy whereas `All` does not.
    pub fn match_kind(mut self, kind: MatchKind) -> Config {
        self.match_kind = Some(kind);
        self
    }

    /// Set a prefilter to be used whenever a start state is entered.
    ///
    /// A [`Prefilter`] in this context is meant to accelerate searches by
    /// looking for literal prefixes that every match for the corresponding
    /// pattern (or patterns) must start with. Once a prefilter produces a
    /// match, the underlying search routine continues on to try and confirm
    /// the match.
    ///
    /// Be warned that setting a prefilter does not guarantee that the search
    /// will be faster.
While it's usually a good bet, if the prefilter /// produces a lot of false positive candidates (i.e., positions matched /// by the prefilter but not by the regex), then the overall result can /// be slower than if you had just executed the regex engine without any /// prefilters. /// /// By default no prefilter is set. /// /// # Example /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::prefilter::Prefilter, /// Input, Match, MatchKind, /// }; /// /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "bar"]); /// let re = PikeVM::builder() /// .configure(PikeVM::config().prefilter(pre)) /// .build(r"(foo|bar)[a-z]+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("foo1 barfox bar"); /// assert_eq!(Some(Match::must(0, 5..11)), re.find(&mut cache, input)); /// /// # Ok::<(), Box>(()) /// ``` /// /// Be warned though that an incorrect prefilter can lead to incorrect /// results! /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::prefilter::Prefilter, /// Input, HalfMatch, MatchKind, /// }; /// /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["foo", "car"]); /// let re = PikeVM::builder() /// .configure(PikeVM::config().prefilter(pre)) /// .build(r"(foo|bar)[a-z]+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("foo1 barfox bar"); /// // No match reported even though there clearly is one! /// assert_eq!(None, re.find(&mut cache, input)); /// /// # Ok::<(), Box>(()) /// ``` pub fn prefilter(mut self, pre: Option) -> Config { self.pre = Some(pre); self } /// Returns the match semantics set in this configuration. pub fn get_match_kind(&self) -> MatchKind { self.match_kind.unwrap_or(MatchKind::LeftmostFirst) } /// Returns the prefilter set in this configuration, if one at all. 
pub fn get_prefilter(&self) -> Option<&Prefilter> { self.pre.as_ref().unwrap_or(&None).as_ref() } /// Overwrite the default configuration such that the options in `o` are /// always used. If an option in `o` is not set, then the corresponding /// option in `self` is used. If it's not set in `self` either, then it /// remains not set. pub(crate) fn overwrite(&self, o: Config) -> Config { Config { match_kind: o.match_kind.or(self.match_kind), pre: o.pre.or_else(|| self.pre.clone()), } } } /// A builder for a `PikeVM`. /// /// This builder permits configuring options for the syntax of a pattern, /// the NFA construction and the `PikeVM` construction. This builder is /// different from a general purpose regex builder in that it permits fine /// grain configuration of the construction process. The trade off for this is /// complexity, and the possibility of setting a configuration that might not /// make sense. For example, there are two different UTF-8 modes: /// /// * [`util::syntax::Config::utf8`](crate::util::syntax::Config::utf8) /// controls whether the pattern itself can contain sub-expressions that match /// invalid UTF-8. /// * [`thompson::Config::utf8`] controls whether empty matches that split a /// Unicode codepoint are reported or not. /// /// Generally speaking, callers will want to either enable all of these or /// disable all of these. /// /// # Example /// /// This example shows how to disable UTF-8 mode in the syntax and the regex /// itself. This is generally what you want for matching on arbitrary bytes. 
/// /// ``` /// use regex_automata::{ /// nfa::thompson::{self, pikevm::PikeVM}, /// util::syntax, /// Match, /// }; /// /// let re = PikeVM::builder() /// .syntax(syntax::Config::new().utf8(false)) /// .thompson(thompson::Config::new().utf8(false)) /// .build(r"foo(?-u:[^b])ar.*")?; /// let mut cache = re.create_cache(); /// /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n"; /// let expected = Some(Match::must(0, 1..9)); /// let got = re.find_iter(&mut cache, haystack).next(); /// assert_eq!(expected, got); /// // Notice that `(?-u:[^b])` matches invalid UTF-8, /// // but the subsequent `.*` does not! Disabling UTF-8 /// // on the syntax permits this. /// // /// // N.B. This example does not show the impact of /// // disabling UTF-8 mode on a PikeVM Config, since that /// // only impacts regexes that can produce matches of /// // length 0. /// assert_eq!(b"foo\xFFarzz", &haystack[got.unwrap().range()]); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Builder { config: Config, #[cfg(feature = "syntax")] thompson: thompson::Compiler, } impl Builder { /// Create a new PikeVM builder with its default configuration. pub fn new() -> Builder { Builder { config: Config::default(), #[cfg(feature = "syntax")] thompson: thompson::Compiler::new(), } } /// Build a `PikeVM` from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. #[cfg(feature = "syntax")] pub fn build(&self, pattern: &str) -> Result { self.build_many(&[pattern]) } /// Build a `PikeVM` from the given patterns. #[cfg(feature = "syntax")] pub fn build_many>( &self, patterns: &[P], ) -> Result { let nfa = self.thompson.build_many(patterns)?; self.build_from_nfa(nfa) } /// Build a `PikeVM` directly from its NFA. /// /// Note that when using this method, any configuration that applies to the /// construction of the NFA itself will of course be ignored, since the NFA /// given here is already built. 
pub fn build_from_nfa(&self, nfa: NFA) -> Result { nfa.look_set_any().available().map_err(BuildError::word)?; Ok(PikeVM { config: self.config.clone(), nfa }) } /// Apply the given `PikeVM` configuration options to this builder. pub fn configure(&mut self, config: Config) -> &mut Builder { self.config = self.config.overwrite(config); self } /// Set the syntax configuration for this builder using /// [`syntax::Config`](crate::util::syntax::Config). /// /// This permits setting things like case insensitivity, Unicode and multi /// line mode. /// /// These settings only apply when constructing a PikeVM directly from a /// pattern. #[cfg(feature = "syntax")] pub fn syntax( &mut self, config: crate::util::syntax::Config, ) -> &mut Builder { self.thompson.syntax(config); self } /// Set the Thompson NFA configuration for this builder using /// [`nfa::thompson::Config`](crate::nfa::thompson::Config). /// /// This permits setting things like if additional time should be spent /// shrinking the size of the NFA. /// /// These settings only apply when constructing a PikeVM directly from a /// pattern. #[cfg(feature = "syntax")] pub fn thompson(&mut self, config: thompson::Config) -> &mut Builder { self.thompson.configure(config); self } } /// A virtual machine for executing regex searches with capturing groups. /// /// # Infallible APIs /// /// Unlike most other regex engines in this crate, a `PikeVM` never returns an /// error at search time. It supports all [`Anchored`] configurations, never /// quits and works on haystacks of arbitrary length. /// /// There are two caveats to mention though: /// /// * If an invalid pattern ID is given to a search via [`Anchored::Pattern`], /// then the PikeVM will report "no match." This is consistent with all other /// regex engines in this crate. 
/// * When using [`PikeVM::which_overlapping_matches`] with a [`PatternSet`]
/// that has insufficient capacity to store all valid pattern IDs, then if a
/// match occurs for a `PatternID` that cannot be inserted, it is silently
/// dropped as if it did not match.
///
/// # Advice
///
/// The `PikeVM` is generally the most "powerful" regex engine in this crate.
/// "Powerful" in this context means that it can handle any regular expression
/// that is parseable by `regex-syntax` and any size haystack. Regrettably,
/// the `PikeVM` is also simultaneously often the _slowest_ regex engine in
/// practice. This results in an annoying situation where one generally tries
/// to pick any other regex engine (or perhaps none at all) before being
/// forced to fall back to a `PikeVM`.
///
/// For example, a common strategy for dealing with capturing groups is to
/// actually look for the overall match of the regex using a faster regex
/// engine, like a [lazy DFA](crate::hybrid::regex::Regex). Once the overall
/// match is found, one can then run the `PikeVM` on just the match span to
/// find the spans of the capturing groups. In this way, the faster regex
/// engine does the majority of the work, while the `PikeVM` only lends its
/// power in a more limited role.
///
/// Unfortunately, this isn't always possible because the faster regex engines
/// don't support all of the regex features in `regex-syntax`. This notably
/// includes (and is currently limited to) Unicode word boundaries. So if
/// your pattern has Unicode word boundaries, you typically can't use a
/// DFA-based regex engine at all (unless you [enable heuristic support for
/// it](crate::hybrid::dfa::Config::unicode_word_boundary)). (The [one-pass
/// DFA](crate::dfa::onepass::DFA) can handle Unicode word boundaries for
/// anchored searches only, but in a cruel sort of joke, many Unicode features
/// tend to result in making the regex _not_ one-pass.)
/// /// # Example /// /// This example shows that the `PikeVM` implements Unicode word boundaries /// correctly by default. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new(r"\b\w+\b")?; /// let mut cache = re.create_cache(); /// /// let mut it = re.find_iter(&mut cache, "Шерлок Холмс"); /// assert_eq!(Some(Match::must(0, 0..12)), it.next()); /// assert_eq!(Some(Match::must(0, 13..23)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct PikeVM { config: Config, nfa: NFA, } impl PikeVM { /// Parse the given regular expression using the default configuration and /// return the corresponding `PikeVM`. /// /// If you want a non-default configuration, then use the [`Builder`] to /// set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new("foo[0-9]+bar")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Match::must(0, 3..14)), /// re.find_iter(&mut cache, "zzzfoo12345barzzz").next(), /// ); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new(pattern: &str) -> Result { PikeVM::builder().build(pattern) } /// Like `new`, but parses multiple patterns into a single "multi regex." /// This similarly uses the default regex configuration. 
/// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new_many(&["[a-z]+", "[0-9]+"])?; /// let mut cache = re.create_cache(); /// /// let mut it = re.find_iter(&mut cache, "abc 1 foo 4567 0 quux"); /// assert_eq!(Some(Match::must(0, 0..3)), it.next()); /// assert_eq!(Some(Match::must(1, 4..5)), it.next()); /// assert_eq!(Some(Match::must(0, 6..9)), it.next()); /// assert_eq!(Some(Match::must(1, 10..14)), it.next()); /// assert_eq!(Some(Match::must(1, 15..16)), it.next()); /// assert_eq!(Some(Match::must(0, 17..21)), it.next()); /// assert_eq!(None, it.next()); /// # Ok::<(), Box>(()) /// ``` #[cfg(feature = "syntax")] pub fn new_many>( patterns: &[P], ) -> Result { PikeVM::builder().build_many(patterns) } /// Like `new`, but builds a PikeVM directly from an NFA. This is useful /// if you already have an NFA, or even if you hand-assembled the NFA. /// /// # Example /// /// This shows how to hand assemble a regular expression via its HIR, /// compile an NFA from it and build a PikeVM from the NFA. 
/// /// ``` /// use regex_automata::{nfa::thompson::{NFA, pikevm::PikeVM}, Match}; /// use regex_syntax::hir::{Hir, Class, ClassBytes, ClassBytesRange}; /// /// let hir = Hir::class(Class::Bytes(ClassBytes::new(vec![ /// ClassBytesRange::new(b'0', b'9'), /// ClassBytesRange::new(b'A', b'Z'), /// ClassBytesRange::new(b'_', b'_'), /// ClassBytesRange::new(b'a', b'z'), /// ]))); /// /// let config = NFA::config().nfa_size_limit(Some(1_000)); /// let nfa = NFA::compiler().configure(config).build_from_hir(&hir)?; /// /// let re = PikeVM::new_from_nfa(nfa)?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let expected = Some(Match::must(0, 3..4)); /// re.captures(&mut cache, "!@#A#@!", &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` pub fn new_from_nfa(nfa: NFA) -> Result { PikeVM::builder().build_from_nfa(nfa) } /// Create a new `PikeVM` that matches every input. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::always_match()?; /// let mut cache = re.create_cache(); /// /// let expected = Match::must(0, 0..0); /// assert_eq!(Some(expected), re.find_iter(&mut cache, "").next()); /// assert_eq!(Some(expected), re.find_iter(&mut cache, "foo").next()); /// # Ok::<(), Box>(()) /// ``` pub fn always_match() -> Result { let nfa = thompson::NFA::always_match(); PikeVM::new_from_nfa(nfa) } /// Create a new `PikeVM` that never matches any input. /// /// # Example /// /// ``` /// use regex_automata::nfa::thompson::pikevm::PikeVM; /// /// let re = PikeVM::never_match()?; /// let mut cache = re.create_cache(); /// /// assert_eq!(None, re.find_iter(&mut cache, "").next()); /// assert_eq!(None, re.find_iter(&mut cache, "foo").next()); /// # Ok::<(), Box>(()) /// ``` pub fn never_match() -> Result { let nfa = thompson::NFA::never_match(); PikeVM::new_from_nfa(nfa) } /// Return a default configuration for a `PikeVM`. 
    ///
    /// This is a convenience routine to avoid needing to import the `Config`
    /// type when customizing the construction of a `PikeVM`.
    ///
    /// # Example
    ///
    /// This example shows how to disable UTF-8 mode. When UTF-8 mode is
    /// disabled, zero-width matches that split a codepoint are allowed.
    /// Otherwise they are never reported.
    ///
    /// In the code below, notice that `""` is permitted to match positions
    /// that split the encoding of a codepoint.
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::{self, pikevm::PikeVM}, Match};
    ///
    /// let re = PikeVM::builder()
    ///     .thompson(thompson::Config::new().utf8(false))
    ///     .build(r"")?;
    /// let mut cache = re.create_cache();
    ///
    /// let haystack = "a☃z";
    /// let mut it = re.find_iter(&mut cache, haystack);
    /// assert_eq!(Some(Match::must(0, 0..0)), it.next());
    /// assert_eq!(Some(Match::must(0, 1..1)), it.next());
    /// assert_eq!(Some(Match::must(0, 2..2)), it.next());
    /// assert_eq!(Some(Match::must(0, 3..3)), it.next());
    /// assert_eq!(Some(Match::must(0, 4..4)), it.next());
    /// assert_eq!(Some(Match::must(0, 5..5)), it.next());
    /// assert_eq!(None, it.next());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn config() -> Config {
        Config::new()
    }

    /// Return a builder for configuring the construction of a `PikeVM`.
    ///
    /// This is a convenience routine to avoid needing to import the
    /// [`Builder`] type in common cases.
    ///
    /// # Example
    ///
    /// This example shows how to use the builder to disable UTF-8 mode
    /// everywhere.
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::{self, pikevm::PikeVM},
    ///     util::syntax,
    ///     Match,
    /// };
    ///
    /// let re = PikeVM::builder()
    ///     .syntax(syntax::Config::new().utf8(false))
    ///     .thompson(thompson::Config::new().utf8(false))
    ///     .build(r"foo(?-u:[^b])ar.*")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    ///
    /// let haystack = b"\xFEfoo\xFFarzz\xE2\x98\xFF\n";
    /// let expected = Some(Match::must(0, 1..9));
    /// re.captures(&mut cache, haystack, &mut caps);
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn builder() -> Builder {
        Builder::new()
    }

    /// Create a new empty set of capturing groups that is guaranteed to be
    /// valid for the search APIs on this `PikeVM`.
    ///
    /// A `Captures` value created for a specific `PikeVM` cannot be used with
    /// any other `PikeVM`.
    ///
    /// This is a convenience function for [`Captures::all`]. See the
    /// [`Captures`] documentation for an explanation of its alternative
    /// constructors that permit the `PikeVM` to do less work during a search,
    /// and thus might make it faster.
    pub fn create_captures(&self) -> Captures {
        Captures::all(self.get_nfa().group_info().clone())
    }

    /// Create a new cache for this `PikeVM`.
    ///
    /// The cache returned should only be used for searches for this
    /// `PikeVM`. If you want to reuse the cache for another `PikeVM`, then
    /// you must call [`Cache::reset`] with that `PikeVM` (or, equivalently,
    /// [`PikeVM::reset_cache`]).
    pub fn create_cache(&self) -> Cache {
        Cache::new(self)
    }

    /// Reset the given cache such that it can be used for searching with
    /// this `PikeVM` (and only this `PikeVM`).
    ///
    /// A cache reset permits reusing memory already allocated in this cache
    /// with a different `PikeVM`.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different `PikeVM`.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
    ///
    /// let re1 = PikeVM::new(r"\w")?;
    /// let re2 = PikeVM::new(r"\W")?;
    ///
    /// let mut cache = re1.create_cache();
    /// assert_eq!(
    ///     Some(Match::must(0, 0..2)),
    ///     re1.find_iter(&mut cache, "Δ").next(),
    /// );
    ///
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the PikeVM we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 're1' is also not
    /// // allowed.
    /// re2.reset_cache(&mut cache);
    /// assert_eq!(
    ///     Some(Match::must(0, 0..3)),
    ///     re2.find_iter(&mut cache, "☃").next(),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset_cache(&self, cache: &mut Cache) {
        cache.reset(self);
    }

    /// Returns the total number of patterns compiled into this `PikeVM`.
    ///
    /// In the case of a `PikeVM` that contains no patterns, this returns `0`.
    ///
    /// # Example
    ///
    /// This example shows the pattern length for a `PikeVM` that never
    /// matches:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
    ///
    /// let re = PikeVM::never_match()?;
    /// assert_eq!(re.pattern_len(), 0);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And another example for a `PikeVM` that matches at every position:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
    ///
    /// let re = PikeVM::always_match()?;
    /// assert_eq!(re.pattern_len(), 1);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And finally, a `PikeVM` that was constructed from multiple patterns:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
    ///
    /// let re = PikeVM::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?;
    /// assert_eq!(re.pattern_len(), 3);
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn pattern_len(&self) -> usize {
        self.nfa.pattern_len()
    }

    /// Return the config for this `PikeVM`.
    #[inline]
    pub fn get_config(&self) -> &Config {
        &self.config
    }

    /// Returns a reference to the underlying NFA.
    #[inline]
    pub fn get_nfa(&self) -> &NFA {
        &self.nfa
    }
}

impl PikeVM {
    /// Returns true if and only if this `PikeVM` matches the given haystack.
    ///
    /// This routine may short circuit if it knows that scanning future
    /// input will never lead to a different result. In particular, if the
    /// underlying NFA enters a match state, then this routine will return
    /// `true` immediately without inspecting any future input. (Consider how
    /// this might make a difference given the regex `a+` on the haystack
    /// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
    /// but routines like `find` need to continue searching because `+` is
    /// greedy by default.)
    ///
    /// # Example
    ///
    /// This shows basic usage:
    ///
    /// ```
    /// use regex_automata::nfa::thompson::pikevm::PikeVM;
    ///
    /// let re = PikeVM::new("foo[0-9]+bar")?;
    /// let mut cache = re.create_cache();
    ///
    /// assert!(re.is_match(&mut cache, "foo12345bar"));
    /// assert!(!re.is_match(&mut cache, "foobar"));
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: consistency with search APIs
    ///
    /// `is_match` is guaranteed to return `true` whenever `find` returns a
    /// match.
This includes searches that are executed entirely within a /// codepoint: /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input}; /// /// let re = PikeVM::new("a*")?; /// let mut cache = re.create_cache(); /// /// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2))); /// # Ok::<(), Box>(()) /// ``` /// /// Notice that when UTF-8 mode is disabled, then the above reports a /// match because the restriction against zero-width matches that split a /// codepoint has been lifted: /// /// ``` /// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input}; /// /// let re = PikeVM::builder() /// .thompson(NFA::config().utf8(false)) /// .build("a*")?; /// let mut cache = re.create_cache(); /// /// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2))); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn is_match<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> bool { let input = input.into().earliest(true); self.search_slots(cache, &input, &mut []).is_some() } /// Executes a leftmost forward search and returns a `Match` if one exists. /// /// This routine only includes the overall match span. To get access to the /// individual spans of each capturing group, use [`PikeVM::captures`]. /// /// # Example /// /// Leftmost first match semantics corresponds to the match with the /// smallest starting offset, but where the end offset is determined by /// preferring earlier branches in the original regular expression. For /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` /// will match `Samwise` in `Samwise`. /// /// Generally speaking, the "leftmost first" match is how most backtracking /// regular expressions tend to work. This is in contrast to POSIX-style /// regular expressions that yield "leftmost longest" matches. Namely, /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using /// leftmost longest semantics. (This crate does not currently support /// leftmost longest semantics.) 
/// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// let expected = Match::must(0, 0..8); /// assert_eq!(Some(expected), re.find(&mut cache, "foo12345")); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over later parts. /// let re = PikeVM::new("abc|a")?; /// let mut cache = re.create_cache(); /// let expected = Match::must(0, 0..3); /// assert_eq!(Some(expected), re.find(&mut cache, "abc")); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find<'h, I: Into>>( &self, cache: &mut Cache, input: I, ) -> Option { let input = input.into(); if self.get_nfa().pattern_len() == 1 { let mut slots = [None, None]; let pid = self.search_slots(cache, &input, &mut slots)?; let start = slots[0]?.get(); let end = slots[1]?.get(); return Some(Match::new(pid, Span { start, end })); } let ginfo = self.get_nfa().group_info(); let slots_len = ginfo.implicit_slot_len(); let mut slots = vec![None; slots_len]; let pid = self.search_slots(cache, &input, &mut slots)?; let start = slots[pid.as_usize() * 2]?.get(); let end = slots[pid.as_usize() * 2 + 1]?.get(); Some(Match::new(pid, Span { start, end })) } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided [`Captures`] /// value. If no match was found, then [`Captures::is_match`] is guaranteed /// to return `false`. 
/// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; /// /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "2010-03-14", &mut caps); /// assert!(caps.is_match()); /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn captures<'h, I: Into>>( &self, cache: &mut Cache, input: I, caps: &mut Captures, ) { self.search(cache, &input.into(), caps) } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re = PikeVM::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// /// let text = "foo1 foo12 foo123"; /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); /// assert_eq!(matches, vec![ /// Match::must(0, 0..4), /// Match::must(0, 5..10), /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find_iter<'r, 'c, 'h, I: Into>>( &'r self, cache: &'c mut Cache, input: I, ) -> FindMatches<'r, 'c, 'h> { let caps = Captures::matches(self.get_nfa().group_info().clone()); let it = iter::Searcher::new(input.into()); FindMatches { re: self, cache, caps, it } } /// Returns an iterator over all non-overlapping `Captures` values. If no /// match exists, then the iterator yields no elements. /// /// This yields the same matches as [`PikeVM::find_iter`], but it includes /// the spans of all capturing groups that participate in each match.
/// /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for /// how to correctly iterate over all matches in a haystack while avoiding /// the creation of a new `Captures` value for every match. (Which you are /// forced to do with an `Iterator`.) /// /// # Example /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; /// /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; /// let mut cache = re.create_cache(); /// /// let text = "foo1 foo12 foo123"; /// let matches: Vec<Span> = re /// .captures_iter(&mut cache, text) /// // The unwrap is OK since 'numbers' matches if the pattern matches. /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) /// .collect(); /// assert_eq!(matches, vec![ /// Span::from(3..4), /// Span::from(8..10), /// Span::from(14..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn captures_iter<'r, 'c, 'h, I: Into>>( &'r self, cache: &'c mut Cache, input: I, ) -> CapturesMatches<'r, 'c, 'h> { let caps = self.create_captures(); let it = iter::Searcher::new(input.into()); CapturesMatches { re: self, cache, caps, it } } } impl PikeVM { /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided [`Captures`] /// value. If no match was found, then [`Captures::is_match`] is guaranteed /// to return `false`. /// /// This is like [`PikeVM::captures`], but it accepts a concrete `&Input` /// instead of an `Into<Input>`. /// /// # Example: specific pattern search /// /// This example shows how to build a multi-PikeVM that permits searching /// for specific patterns.
/// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Anchored, Match, PatternID, Input, /// }; /// /// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "foo123"; /// /// // Since we are using the default leftmost-first match and both /// // patterns match at the same starting position, only the first pattern /// // will be returned in this case when doing a search for any of the /// // patterns. /// let expected = Some(Match::must(0, 0..6)); /// re.search(&mut cache, &Input::new(haystack), &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// // But if we want to check whether some other pattern matches, then we /// // can provide its pattern ID. /// let expected = Some(Match::must(1, 0..6)); /// let input = Input::new(haystack) /// .anchored(Anchored::Pattern(PatternID::must(1))); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specifying the bounds of a search /// /// This example shows how providing the bounds of a search can produce /// different results than simply sub-slicing the haystack. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; /// /// let re = PikeVM::new(r"\b[0-9]{3}\b")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "foo123bar"; /// /// // Since we sub-slice the haystack, the search doesn't know about /// // the larger context and assumes that `123` is surrounded by word /// // boundaries. And of course, the match position is reported relative /// // to the sub-slice as well, which means we get `0..3` instead of /// // `3..6`. 
/// let expected = Some(Match::must(0, 0..3)); /// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. (And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let expected = None; /// let input = Input::new(haystack).range(3..6); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search( &self, cache: &mut Cache, input: &Input<'_>, caps: &mut Captures, ) { caps.set_pattern(None); let pid = self.search_slots(cache, input, caps.slots_mut()); caps.set_pattern(pid); } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided `slots`, and /// returns the matching pattern ID. The contents of the slots for patterns /// other than the matching pattern are unspecified. If no match was found, /// then `None` is returned and the contents of `slots` is unspecified. /// /// This is like [`PikeVM::search`], but it accepts a raw slots slice /// instead of a `Captures` value. This is useful in contexts where you /// don't want or need to allocate a `Captures`. /// /// It is legal to pass _any_ number of slots to this routine. If the regex /// engine would otherwise write a slot offset that doesn't fit in the /// provided slice, then it is simply skipped. In general though, there are /// usually three slice lengths you might want to use: /// /// * An empty slice, if you only care about which pattern matched. /// * A slice with /// [`pattern_len() * 2`](crate::nfa::thompson::NFA::pattern_len) /// slots, if you only care about the overall match spans for each matching /// pattern. 
/// * A slice with /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which /// permits recording match offsets for every capturing group in every /// pattern. /// /// # Example /// /// This example shows how to find the overall match offsets in a /// multi-pattern search without allocating a `Captures` value. Indeed, we /// can put our slots right on the stack. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID, Input}; /// /// let re = PikeVM::new_many(&[ /// r"\pL+", /// r"\d+", /// ])?; /// let mut cache = re.create_cache(); /// let input = Input::new("!@#123"); /// /// // We only care about the overall match offsets here, so we just /// // allocate two slots for each pattern. Each slot records the start /// // and end of the match. /// let mut slots = [None; 4]; /// let pid = re.search_slots(&mut cache, &input, &mut slots); /// assert_eq!(Some(PatternID::must(1)), pid); /// /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. /// // See 'GroupInfo' for more details on the mapping between groups and /// // slot indices. /// let slot_start = pid.unwrap().as_usize() * 2; /// let slot_end = slot_start + 1; /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_slots( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); if !utf8empty { let hm = self.search_slots_imp(cache, input, slots)?; return Some(hm.pattern()); } // There is an unfortunate special case where if the regex can // match the empty string and UTF-8 mode is enabled, the search // implementation requires that the slots have at least as much space // to report the bounds of any match. 
This is so zero-width matches // that split a codepoint can be filtered out. // // Note that if utf8empty is true, we specialize the case for when // the number of patterns is 1. In that case, we can just use a stack // allocation. Otherwise we resort to a heap allocation, which we // convince ourselves we're fine with due to the pathological nature of // this case. let min = self.get_nfa().group_info().implicit_slot_len(); if slots.len() >= min { let hm = self.search_slots_imp(cache, input, slots)?; return Some(hm.pattern()); } if self.get_nfa().pattern_len() == 1 { let mut enough = [None, None]; let got = self.search_slots_imp(cache, input, &mut enough); // This is OK because we know `enough` is strictly bigger than // `slots`, otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); return got.map(|hm| hm.pattern()); } let mut enough = vec![None; min]; let got = self.search_slots_imp(cache, input, &mut enough); // This is OK because we know `enough` is strictly bigger than `slots`, // otherwise this special case isn't reached. slots.copy_from_slice(&enough[..slots.len()]); got.map(|hm| hm.pattern()) } /// This is the actual implementation of `search_slots` that /// doesn't account for the special case when 1) the NFA has UTF-8 mode /// enabled, 2) the NFA can match the empty string and 3) the caller has /// provided an insufficient number of slots to record match offsets. #[inline(never)] fn search_slots_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); let hm = match self.search_imp(cache, input, slots) { None => return None, Some(hm) if !utf8empty => return Some(hm), Some(hm) => hm, }; empty::skip_splits_fwd(input, hm, hm.offset(), |input| { Ok(self .search_imp(cache, input, slots) .map(|hm| (hm, hm.offset()))) }) // OK because the PikeVM never errors.
.unwrap() } /// Writes the set of patterns that match anywhere in the given search /// configuration to `patset`. If multiple patterns match at the same /// position and this `PikeVM` was configured with [`MatchKind::All`] /// semantics, then all matching patterns are written to the given set. /// /// Unless all of the patterns in this `PikeVM` are anchored, then /// generally speaking, this will visit every byte in the haystack. /// /// This search routine *does not* clear the pattern set. This gives some /// flexibility to the caller (e.g., running multiple searches with the /// same pattern set), but does make the API bug-prone if you're reusing /// the same pattern set for multiple searches but intended them to be /// independent. /// /// If a pattern ID matched but the given `PatternSet` does not have /// sufficient capacity to store it, then it is not inserted and silently /// dropped. /// /// # Example /// /// This example shows how to find all matching patterns in a haystack, /// even when some patterns match at the same position as other patterns. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Input, MatchKind, PatternSet, /// }; /// /// let patterns = &[ /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", /// ]; /// let re = PikeVM::builder() /// .configure(PikeVM::config().match_kind(MatchKind::All)) /// .build_many(patterns)?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("foobar"); /// let mut patset = PatternSet::new(re.pattern_len()); /// re.which_overlapping_matches(&mut cache, &input, &mut patset); /// let expected = vec![0, 2, 3, 4, 6]; /// let got: Vec = patset.iter().map(|p| p.as_usize()).collect(); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn which_overlapping_matches( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { self.which_overlapping_imp(cache, input, patset) } } impl PikeVM { /// The implementation of standard leftmost search. /// /// Capturing group spans are written to `slots`, but only if requested. /// `slots` can be any length. Any slot in the NFA that is activated but /// which is out of bounds for the given `slots` is ignored. fn search_imp( &self, cache: &mut Cache, input: &Input<'_>, slots: &mut [Option], ) -> Option { cache.setup_search(slots.len()); if input.is_done() { return None; } // Why do we even care about this? Well, in our 'Captures' // representation, we use usize::MAX as a sentinel to indicate "no // match." This isn't problematic so long as our haystack doesn't have // a maximal length. Byte slices are guaranteed by Rust to have a // length that fits into isize, and so this assert should always pass. // But we put it here to make our assumption explicit. 
assert!( input.haystack().len() < core::usize::MAX, "byte slice lengths must be less than usize MAX", ); instrument!(|c| c.reset(&self.nfa)); // Whether we want to visit all match states instead of emulating the // 'leftmost' semantics of typical backtracking regex engines. let allmatches = self.config.get_match_kind().continue_past_first_match(); let (anchored, start_id) = match self.start_config(input) { None => return None, Some(config) => config, }; let pre = if anchored { None } else { self.get_config().get_prefilter() }; let Cache { ref mut stack, ref mut curr, ref mut next } = cache; let mut hm = None; // Yes, our search doesn't end at input.end(), but includes it. This // is necessary because matches are delayed by one byte, just like // how the DFA engines work. The delay is used to handle look-behind // assertions. In the case of the PikeVM, the delay is implemented // by not considering a match to exist until it is visited in // 'steps'. Technically, we know a match exists in the previous // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA // determinization. We don't mark a DFA state as a match state if it // contains an NFA match state, but rather, whether the DFA state was // generated by a transition from a DFA state that contains an NFA // match state.) let mut at = input.start(); while at <= input.end() { // If we have no states left to visit, then there are some cases // where we know we can quit early or even skip ahead. if curr.set.is_empty() { // We have a match and we haven't been instructed to continue // on even after finding a match, so we can quit. if hm.is_some() && !allmatches { break; } // If we're running an anchored search and we've advanced // beyond the start position with no other states to try, then // we will never observe a match and thus can stop. 
if anchored && at > input.start() { break; } // If there are no states left to explore at this position and we // know we can't terminate early, then we are effectively at // the starting state of the NFA. If we fell through here, // we'd end up adding our '(?s-u:.)*?' prefix and it would be // the only thing in 'curr'. So we might as well just skip // ahead until we find something that we know might advance us // forward. if let Some(ref pre) = pre { let span = Span::from(at..input.end()); match pre.find(input.haystack(), span) { None => break, Some(ref span) => at = span.start, } } } // Instead of using the NFA's unanchored start state, we actually // always use its anchored starting state. As a result, when doing // an unanchored search, we need to simulate our own '(?s-u:.)*?' // prefix, to permit a match to appear anywhere. // // Now, we don't *have* to do things this way. We could use the // NFA's unanchored starting state and do one 'epsilon_closure' // call from that starting state before the main loop here. And // that is just as correct. However, it turns out to be slower // than our approach here because it slightly increases the cost // of processing each byte by requiring us to visit more NFA // states to deal with the additional NFA states in the unanchored // prefix. By simulating it explicitly here, we lower those costs // substantially. The cost is itself small, but it adds up for // large haystacks. // // In order to simulate the '(?s-u:.)*?' prefix---which is not // greedy---we are careful not to perform an epsilon closure on // the start state if we already have a match. Namely, if we // did otherwise, we would never reach a terminating condition // because there would always be additional states to process. // In effect, the exclusion of running 'epsilon_closure' when // we have a match corresponds to the "dead" states we have in // our DFA regex engines.
Namely, in a DFA, match states merely // instruct the search execution to record the current offset as // the most recently seen match. It is the dead state that actually // indicates when to stop the search (other than EOF or quit // states). // // However, when 'allmatches' is true, the caller has asked us to // leave in every possible match state. This tends not to make a // whole lot of sense in unanchored searches, because it means the // search really cannot terminate until EOF. And often, in that // case, you wind up skipping over a bunch of matches and are left // with the "last" match. Arguably, it just doesn't make a lot of // sense to run a 'leftmost' search (which is what this routine is) // with 'allmatches' set to true. But the DFAs support it and this // matches their behavior. (Generally, 'allmatches' is useful for // overlapping searches or leftmost anchored searches to find the // longest possible match by ignoring match priority.) // // Additionally, when we're running an anchored search, this // epsilon closure should only be computed at the beginning of the // search. If we re-computed it at every position, we would be // simulating an unanchored search when we were tasked to perform // an anchored search. if (!hm.is_some() || allmatches) && (!anchored || at == input.start()) { // Since we are adding to the 'curr' active states and since // this is for the start ID, we use a slots slice that is // guaranteed to have the right length but where every element // is absent. This is exactly what we want, because this // epsilon closure is responsible for simulating an unanchored // '(?s-u:.)*?' prefix. It is specifically outside of any // capturing groups, and thus, using slots that are always // absent is correct.
// // Note though that we can't just use '&mut []' here, since // this epsilon closure may traverse through 'Captures' epsilon // transitions, and thus must be able to write offsets to the // slots given which are later copied to slot values in 'curr'. let slots = next.slot_table.all_absent(); self.epsilon_closure(stack, slots, curr, input, at, start_id); } if let Some(pid) = self.nexts(stack, curr, next, input, at, slots) { hm = Some(HalfMatch::new(pid, at)); } // Unless the caller asked us to return early, we need to mush on // to see if we can extend our match. (But note that 'nexts' will // quit right after seeing a match when match_kind==LeftmostFirst, // as is consistent with leftmost-first match priority.) if input.get_earliest() && hm.is_some() { break; } core::mem::swap(curr, next); next.set.clear(); at += 1; } instrument!(|c| c.eprint(&self.nfa)); hm } /// The implementation for the 'which_overlapping_matches' API. Basically, /// we do a single scan through the entire haystack (unless our regex /// or search is anchored) and record every pattern that matched. In /// particular, when MatchKind::All is used, this supports overlapping /// matches. So if we have the regexes 'sam' and 'samwise', they will /// *both* be reported in the pattern set when searching the haystack /// 'samwise'. fn which_overlapping_imp( &self, cache: &mut Cache, input: &Input<'_>, patset: &mut PatternSet, ) { // NOTE: This is effectively a copy of 'search_imp' above, but with no // captures support and instead writes patterns that matched directly // to 'patset'. See that routine for better commentary about what's // going on in this routine. We probably could unify the routines using // generics or more helper routines, but I'm not sure it's worth it. // // NOTE: We somewhat go out of our way here to support things like // 'input.get_earliest()' and 'leftmost-first' match semantics. 
Neither // of those seem particularly relevant to this routine, but they are // both supported by the DFA analogs of this routine by construction // and composition, so it seems like good sense to have the PikeVM // match that behavior. cache.setup_search(0); if input.is_done() { return; } assert!( input.haystack().len() < core::usize::MAX, "byte slice lengths must be less than usize MAX", ); instrument!(|c| c.reset(&self.nfa)); let allmatches = self.config.get_match_kind().continue_past_first_match(); let (anchored, start_id) = match self.start_config(input) { None => return, Some(config) => config, }; let Cache { ref mut stack, ref mut curr, ref mut next } = cache; for at in input.start()..=input.end() { let any_matches = !patset.is_empty(); if curr.set.is_empty() { if any_matches && !allmatches { break; } if anchored && at > input.start() { break; } } if !any_matches || allmatches { let slots = &mut []; self.epsilon_closure(stack, slots, curr, input, at, start_id); } self.nexts_overlapping(stack, curr, next, input, at, patset); // If we found a match and filled our set, then there is no more // additional info that we can provide. Thus, we can quit. We also // quit if the caller asked us to stop at the earliest point that // we know a match exists. if patset.is_full() || input.get_earliest() { break; } core::mem::swap(curr, next); next.set.clear(); } instrument!(|c| c.eprint(&self.nfa)); } /// Process the active states in 'curr' to find the states (written to /// 'next') we should process for the next byte in the haystack. /// /// 'stack' is used to perform a depth first traversal of the NFA when /// computing an epsilon closure. /// /// When a match is found, the slots for that match state (in 'curr') are /// copied to 'slots'. Moreover, once a match is seen, processing for 'curr' /// stops (unless the PikeVM was configured with MatchKind::All semantics).
#[cfg_attr(feature = "perf-inline", inline(always))] fn nexts( &self, stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, input: &Input<'_>, at: usize, slots: &mut [Option], ) -> Option { instrument!(|c| c.record_state_set(&curr.set)); let mut pid = None; let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { pid = match self.next(stack, slot_table, next, input, at, sid) { None => continue, Some(pid) => Some(pid), }; slots.copy_from_slice(slot_table.for_state(sid)); if !self.config.get_match_kind().continue_past_first_match() { break; } } pid } /// Like 'nexts', but for the overlapping case. This doesn't write any /// slots, and instead just writes which pattern matched in 'patset'. #[cfg_attr(feature = "perf-inline", inline(always))] fn nexts_overlapping( &self, stack: &mut Vec, curr: &mut ActiveStates, next: &mut ActiveStates, input: &Input<'_>, at: usize, patset: &mut PatternSet, ) { instrument!(|c| c.record_state_set(&curr.set)); let utf8empty = self.get_nfa().has_empty() && self.get_nfa().is_utf8(); let ActiveStates { ref set, ref mut slot_table } = *curr; for sid in set.iter() { let pid = match self.next(stack, slot_table, next, input, at, sid) { None => continue, Some(pid) => pid, }; // This handles the case of finding a zero-width match that splits // a codepoint. Namely, if we're in UTF-8 mode AND we know we can // match the empty string, then the only valid way of getting to // this point with an offset that splits a codepoint is when we // have an empty match. Such matches, in UTF-8 mode, must not be // reported. So we just skip them here and pretend as if we did // not see a match. 
if utf8empty && !input.is_char_boundary(at) { continue; } let _ = patset.try_insert(pid); if !self.config.get_match_kind().continue_past_first_match() { break; } } } /// Starting from 'sid', if the position 'at' in the 'input' haystack has a /// transition defined out of 'sid', then add the state transitioned to and /// its epsilon closure to the 'next' set of states to explore. /// /// 'stack' is used by the epsilon closure computation to perform a depth /// first traversal of the NFA. /// /// 'curr_slot_table' should be the table of slots for the current set of /// states being explored. If there is a transition out of 'sid', then /// sid's row in the slot table is used to perform the epsilon closure. #[cfg_attr(feature = "perf-inline", inline(always))] fn next( &self, stack: &mut Vec, curr_slot_table: &mut SlotTable, next: &mut ActiveStates, input: &Input<'_>, at: usize, sid: StateID, ) -> Option { instrument!(|c| c.record_step(sid)); match *self.nfa.state(sid) { State::Fail | State::Look { .. } | State::Union { .. } | State::BinaryUnion { .. } | State::Capture { .. } => None, State::ByteRange { ref trans } => { if trans.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); // OK because 'at <= haystack.len() < usize::MAX', so // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( stack, slots, next, input, at, trans.next, ); } None } State::Sparse(ref sparse) => { if let Some(next_sid) = sparse.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); // OK because 'at <= haystack.len() < usize::MAX', so // adding 1 will never wrap. let at = at.wrapping_add(1); self.epsilon_closure( stack, slots, next, input, at, next_sid, ); } None } State::Dense(ref dense) => { if let Some(next_sid) = dense.matches(input.haystack(), at) { let slots = curr_slot_table.for_state(sid); // OK because 'at <= haystack.len() < usize::MAX', so // adding 1 will never wrap. 
let at = at.wrapping_add(1); self.epsilon_closure( stack, slots, next, input, at, next_sid, ); } None } State::Match { pattern_id } => Some(pattern_id), } } /// Compute the epsilon closure of 'sid', writing the closure into 'next' /// while copying slot values from 'curr_slots' into corresponding states /// in 'next'. 'curr_slots' should be the slot values corresponding to /// 'sid'. /// /// The given 'stack' is used to perform a depth first traversal of the /// NFA by recursively following all epsilon transitions out of 'sid'. /// Conditional epsilon transitions are followed if and only if they are /// satisfied for the position 'at' in the 'input' haystack. /// /// While this routine may write to 'curr_slots', once it returns, any /// writes are undone and the original values (even if absent) are /// restored. #[cfg_attr(feature = "perf-inline", inline(always))] fn epsilon_closure( &self, stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, input: &Input<'_>, at: usize, sid: StateID, ) { instrument!(|c| { c.record_closure(sid); c.record_stack_push(sid); }); stack.push(FollowEpsilon::Explore(sid)); while let Some(frame) = stack.pop() { match frame { FollowEpsilon::RestoreCapture { slot, offset: pos } => { curr_slots[slot] = pos; } FollowEpsilon::Explore(sid) => { self.epsilon_closure_explore( stack, curr_slots, next, input, at, sid, ); } } } } /// Explore all of the epsilon transitions out of 'sid'. This is mostly /// split out from 'epsilon_closure' in order to clearly delineate /// the actual work of computing an epsilon closure from the stack /// book-keeping. /// /// This will push any additional explorations needed on to 'stack'. /// /// 'curr_slots' should refer to the slots for the currently active NFA /// state. That is, the current state we are stepping through. 
These /// slots are mutated in place as new 'Captures' states are traversed /// during epsilon closure, but the slots are restored to their original /// values once the full epsilon closure is completed. The ultimate use of /// 'curr_slots' is to copy them to the corresponding 'next_slots', so that /// the capturing group spans are forwarded from the currently active state /// to the next. /// /// 'next' refers to the next set of active states. Computing an epsilon /// closure may increase the next set of active states. /// /// 'input' refers to the caller's input configuration and 'at' refers to /// the current position in the haystack. These are used to check whether /// conditional epsilon transitions (like look-around) are satisfied at /// the current position. If they aren't, then the epsilon closure won't /// include them. #[cfg_attr(feature = "perf-inline", inline(always))] fn epsilon_closure_explore( &self, stack: &mut Vec, curr_slots: &mut [Option], next: &mut ActiveStates, input: &Input<'_>, at: usize, mut sid: StateID, ) { // We can avoid pushing some state IDs on to our stack in precisely // the cases where a 'push(x)' would be immediately followed by a 'x // = pop()'. This is achieved by this outer-loop. We simply set 'sid' // to be the next state ID we want to explore once we're done with // our initial exploration. In practice, this avoids a lot of stack // thrashing. loop { instrument!(|c| c.record_set_insert(sid)); // Record this state as part of our next set of active states. If // we've already explored it, then no need to do it again. if !next.set.insert(sid) { return; } match *self.nfa.state(sid) { State::Fail | State::Match { .. } | State::ByteRange { .. } | State::Sparse { .. } | State::Dense { .. } => { next.slot_table.for_state(sid).copy_from_slice(curr_slots); return; } State::Look { look, next } => { // OK because we don't permit building a searcher with a // Unicode word boundary if the requisite Unicode data is // unavailable. 
if !self.nfa.look_matcher().matches_inline( look, input.haystack(), at, ) { return; } sid = next; } State::Union { ref alternates } => { sid = match alternates.get(0) { None => return, Some(&sid) => sid, }; instrument!(|c| { for &alt in &alternates[1..] { c.record_stack_push(alt); } }); stack.extend( alternates[1..] .iter() .copied() .rev() .map(FollowEpsilon::Explore), ); } State::BinaryUnion { alt1, alt2 } => { sid = alt1; instrument!(|c| c.record_stack_push(sid)); stack.push(FollowEpsilon::Explore(alt2)); } State::Capture { next, slot, .. } => { // There's no need to do anything with slots that // ultimately won't be copied into the caller-provided // 'Captures' value. So we just skip dealing with them at // all. if slot.as_usize() < curr_slots.len() { instrument!(|c| c.record_stack_push(sid)); stack.push(FollowEpsilon::RestoreCapture { slot, offset: curr_slots[slot], }); // OK because length of a slice must fit into an isize. curr_slots[slot] = Some(NonMaxUsize::new(at).unwrap()); } sid = next; } } } } /// Return the starting configuration of a PikeVM search. /// /// The "start config" is basically whether the search should be anchored /// or not and the NFA state ID at which to begin the search. The state ID /// returned always corresponds to an anchored starting state even when the /// search is unanchored. This is because the PikeVM search loop deals with /// unanchored searches with an explicit epsilon closure out of the start /// state. /// /// This routine accounts for both the caller's `Input` configuration /// and the pattern itself. For example, even if the caller asks for an /// unanchored search, if the pattern itself is anchored, then this will /// always return 'true' because implementing an unanchored search in that /// case would be incorrect. /// /// Similarly, if the caller requests an anchored search for a particular /// pattern, then the starting state ID returned will reflect that. 
/// /// If a pattern ID is given in the input configuration that is not in /// this regex, then `None` is returned. fn start_config(&self, input: &Input<'_>) -> Option<(bool, StateID)> { match input.get_anchored() { // Only way we're unanchored is if both the caller asked for an // unanchored search *and* the pattern is itself not anchored. Anchored::No => Some(( self.nfa.is_always_start_anchored(), self.nfa.start_anchored(), )), Anchored::Yes => Some((true, self.nfa.start_anchored())), Anchored::Pattern(pid) => { Some((true, self.nfa.start_pattern(pid)?)) } } } } /// An iterator over all non-overlapping matches for a particular search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the PikeVM. /// * `'c` represents the lifetime of the PikeVM's cache. /// * `'h` represents the lifetime of the haystack being searched. /// /// This iterator can be created with the [`PikeVM::find_iter`] method. #[derive(Debug)] pub struct FindMatches<'r, 'c, 'h> { re: &'r PikeVM, cache: &'c mut Cache, caps: Captures, it: iter::Searcher<'h>, } impl<'r, 'c, 'h> Iterator for FindMatches<'r, 'c, 'h> { type Item = Match; #[inline] fn next(&mut self) -> Option { // Splitting 'self' apart seems necessary to appease borrowck. let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = *self; // 'advance' converts errors into panics, which is OK here because // the PikeVM can never return an error. it.advance(|input| { re.search(cache, input, caps); Ok(caps.get_match()) }) } } /// An iterator over all non-overlapping leftmost matches, with their capturing /// groups, for a particular search. /// /// The iterator yields a [`Captures`] value until no more matches could be /// found. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the PikeVM. /// * `'c` represents the lifetime of the PikeVM's cache. 
/// * `'h` represents the lifetime of the haystack being searched. /// /// This iterator can be created with the [`PikeVM::captures_iter`] method. #[derive(Debug)] pub struct CapturesMatches<'r, 'c, 'h> { re: &'r PikeVM, cache: &'c mut Cache, caps: Captures, it: iter::Searcher<'h>, } impl<'r, 'c, 'h> Iterator for CapturesMatches<'r, 'c, 'h> { type Item = Captures; #[inline] fn next(&mut self) -> Option { // Splitting 'self' apart seems necessary to appease borrowck. let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = *self; // 'advance' converts errors into panics, which is OK here because // the PikeVM can never return an error. it.advance(|input| { re.search(cache, input, caps); Ok(caps.get_match()) }); if caps.is_match() { Some(caps.clone()) } else { None } } } /// A cache represents mutable state that a [`PikeVM`] requires during a /// search. /// /// For a given [`PikeVM`], its corresponding cache may be created either via /// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in /// every way, except the former does not require explicitly importing `Cache`. /// /// A particular `Cache` is coupled with the [`PikeVM`] from which it /// was created. It may only be used with that `PikeVM`. A cache and its /// allocations may be re-purposed via [`Cache::reset`], in which case, it can /// only be used with the new `PikeVM` (and not the old one). #[derive(Clone, Debug)] pub struct Cache { /// Stack used while computing epsilon closure. This effectively lets us /// move what is more naturally expressed through recursion to a stack /// on the heap. stack: Vec, /// The current active states being explored for the current byte in the /// haystack. curr: ActiveStates, /// The next set of states we're building that will be explored for the /// next byte in the haystack. next: ActiveStates, } impl Cache { /// Create a new [`PikeVM`] cache. 
/// /// A potentially more convenient routine to create a cache is /// [`PikeVM::create_cache`], as it does not require also importing the /// `Cache` type. /// /// If you want to reuse the returned `Cache` with some other `PikeVM`, /// then you must call [`Cache::reset`] with the desired `PikeVM`. pub fn new(re: &PikeVM) -> Cache { Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re), } } /// Reset this cache such that it can be used for searching with a /// different [`PikeVM`]. /// /// A cache reset permits reusing memory already allocated in this cache /// with a different `PikeVM`. /// /// # Example /// /// This shows how to re-purpose a cache for use with a different `PikeVM`. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match}; /// /// let re1 = PikeVM::new(r"\w")?; /// let re2 = PikeVM::new(r"\W")?; /// /// let mut cache = re1.create_cache(); /// assert_eq!( /// Some(Match::must(0, 0..2)), /// re1.find_iter(&mut cache, "Δ").next(), /// ); /// /// // Using 'cache' with re2 is not allowed. It may result in panics or /// // incorrect results. In order to re-purpose the cache, we must reset /// // it with the PikeVM we'd like to use it with. /// // /// // Similarly, after this reset, using the cache with 're1' is also not /// // allowed. /// cache.reset(&re2); /// assert_eq!( /// Some(Match::must(0, 0..3)), /// re2.find_iter(&mut cache, "☃").next(), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn reset(&mut self, re: &PikeVM) { self.curr.reset(re); self.next.reset(re); } /// Returns the heap memory usage, in bytes, of this cache. /// /// This does **not** include the stack size used up by this cache. To /// compute that, use `std::mem::size_of::()`. pub fn memory_usage(&self) -> usize { use core::mem::size_of; (self.stack.len() * size_of::()) + self.curr.memory_usage() + self.next.memory_usage() } /// Clears this cache. 
This should be called at the start of every search /// to ensure we start with a clean slate. /// /// This also sets the length of the capturing groups used in the current /// search. This permits an optimization where by 'SlotTable::for_state' /// only returns the number of slots equivalent to the number of slots /// given in the 'Captures' value. This may be less than the total number /// of possible slots, e.g., when one only wants to track overall match /// offsets. This in turn permits less copying of capturing group spans /// in the PikeVM. fn setup_search(&mut self, captures_slot_len: usize) { self.stack.clear(); self.curr.setup_search(captures_slot_len); self.next.setup_search(captures_slot_len); } } /// A set of active states used to "simulate" the execution of an NFA via the /// PikeVM. /// /// There are two sets of these used during NFA simulation. One set corresponds /// to the "current" set of states being traversed for the current position /// in a haystack. The other set corresponds to the "next" set of states being /// built, which will become the new "current" set for the next position in the /// haystack. These two sets correspond to CLIST and NLIST in Thompson's /// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387 /// /// In addition to representing a set of NFA states, this also maintains slot /// values for each state. These slot values are what turn the NFA simulation /// into the "Pike VM." Namely, they track capturing group values for each /// state. During the computation of epsilon closure, we copy slot values from /// states in the "current" set to the "next" set. Eventually, once a match /// is found, the slot values for that match state are what we write to the /// caller provided 'Captures' value. #[derive(Clone, Debug)] struct ActiveStates { /// The set of active NFA states. This set preserves insertion order, which /// is critical for simulating the match semantics of backtracking regex /// engines. 
set: SparseSet, /// The slots for every NFA state, where each slot stores a (possibly /// absent) offset. Every capturing group has two slots. One for a start /// offset and one for an end offset. slot_table: SlotTable, } impl ActiveStates { /// Create a new set of active states for the given PikeVM. The active /// states returned may only be used with the given PikeVM. (Use 'reset' /// to re-purpose the allocation for a different PikeVM.) fn new(re: &PikeVM) -> ActiveStates { let mut active = ActiveStates { set: SparseSet::new(0), slot_table: SlotTable::new(), }; active.reset(re); active } /// Reset this set of active states such that it can be used with the given /// PikeVM (and only that PikeVM). fn reset(&mut self, re: &PikeVM) { self.set.resize(re.get_nfa().states().len()); self.slot_table.reset(re); } /// Return the heap memory usage, in bytes, used by this set of active /// states. /// /// This does not include the stack size of this value. fn memory_usage(&self) -> usize { self.set.memory_usage() + self.slot_table.memory_usage() } /// Setup this set of active states for a new search. The given slot /// length should be the number of slots in a caller provided 'Captures' /// (and may be zero). fn setup_search(&mut self, captures_slot_len: usize) { self.set.clear(); self.slot_table.setup_search(captures_slot_len); } } /// A table of slots, where each row represent a state in an NFA. Thus, the /// table has room for storing slots for every single state in an NFA. /// /// This table is represented with a single contiguous allocation. In general, /// the notion of "capturing group" doesn't really exist at this level of /// abstraction, hence the name "slot" instead. (Indeed, every capturing group /// maps to a pair of slots, one for the start offset and one for the end /// offset.) Slots are indexed by the 'Captures' NFA state. /// /// N.B. Not every state actually needs a row of slots. 
Namely, states that /// only have epsilon transitions currently never have anything written to /// their rows in this table. Thus, the table is somewhat wasteful in its heap /// usage. However, it is important to maintain fast random access by state /// ID, which means one giant table tends to work well. RE2 takes a different /// approach here and allocates each row as its own reference counted thing. /// I explored such a strategy at one point here, but couldn't get it to work /// well using entirely safe code. (To the ambitious reader: I encourage you to /// re-litigate that experiment.) I very much wanted to stick to safe code, but /// could be convinced otherwise if there was a solid argument and the safety /// was encapsulated well. #[derive(Clone, Debug)] struct SlotTable { /// The actual table of offsets. table: Vec>, /// The number of slots per state, i.e., the table's stride or the length /// of each row. slots_per_state: usize, /// The number of slots in the caller-provided 'Captures' value for the /// current search. Setting this to 'slots_per_state' is always correct, /// but may be wasteful. slots_for_captures: usize, } impl SlotTable { /// Create a new slot table. /// /// One should call 'reset' with the corresponding PikeVM before use. fn new() -> SlotTable { SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 } } /// Reset this slot table such that it can be used with the given PikeVM /// (and only that PikeVM). fn reset(&mut self, re: &PikeVM) { let nfa = re.get_nfa(); self.slots_per_state = nfa.group_info().slot_len(); // This is always correct, but may be reduced for a particular search // if a 'Captures' has fewer slots, e.g., none at all or only slots // for tracking the overall match instead of all slots for every // group. 
self.slots_for_captures = core::cmp::max( self.slots_per_state, nfa.pattern_len().checked_mul(2).unwrap(), ); let len = nfa .states() .len() .checked_mul(self.slots_per_state) // Add space to account for scratch space used during a search. .and_then(|x| x.checked_add(self.slots_for_captures)) // It seems like this could actually panic on legitimate inputs on // 32-bit targets, and very likely to panic on 16-bit. Should we // somehow convert this to an error? What about something similar // for the lazy DFA cache? If you're tripping this assert, please // file a bug. .expect("slot table length doesn't overflow"); // This happens about as often as a regex is compiled, so it probably // should be at debug level, but I found it quite distracting and not // particularly useful. trace!( "resizing PikeVM active states table to {} entries \ (slots_per_state={})", len, self.slots_per_state, ); self.table.resize(len, None); } /// Return the heap memory usage, in bytes, used by this slot table. /// /// This does not include the stack size of this value. fn memory_usage(&self) -> usize { self.table.len() * core::mem::size_of::>() } /// Perform any per-search setup for this slot table. /// /// In particular, this sets the length of the number of slots used in the /// 'Captures' given by the caller (if any at all). This number may be /// smaller than the total number of slots available, e.g., when the caller /// is only interested in tracking the overall match and not the spans of /// every matching capturing group. Only tracking the overall match can /// save a substantial amount of time copying capturing spans during a /// search. fn setup_search(&mut self, captures_slot_len: usize) { self.slots_for_captures = captures_slot_len; } /// Return a mutable slice of the slots for the given state. /// /// Note that the length of the slice returned may be less than the total /// number of slots available for this state. 
In particular, the length /// always matches the number of slots indicated via 'setup_search'. fn for_state(&mut self, sid: StateID) -> &mut [Option] { let i = sid.as_usize() * self.slots_per_state; &mut self.table[i..i + self.slots_for_captures] } /// Return a slice of slots of appropriate length where every slot offset /// is guaranteed to be absent. This is useful in cases where you need to /// compute an epsilon closure outside of the user supplied regex, and thus /// never want it to have any capturing slots set. fn all_absent(&mut self) -> &mut [Option] { let i = self.table.len() - self.slots_for_captures; &mut self.table[i..i + self.slots_for_captures] } } /// Represents a stack frame for use while computing an epsilon closure. /// /// (An "epsilon closure" refers to the set of reachable NFA states from a /// single state without consuming any input. That is, the set of all epsilon /// transitions not only from that single state, but from every other state /// reachable by an epsilon transition as well. This is why it's called a /// "closure." Computing an epsilon closure is also done during DFA /// determinization! Compare and contrast the epsilon closure here in this /// PikeVM and the one used for determinization in crate::util::determinize.) /// /// Computing the epsilon closure in a Thompson NFA proceeds via a depth /// first traversal over all epsilon transitions from a particular state. /// (A depth first traversal is important because it emulates the same priority /// of matches that is typically found in backtracking regex engines.) This /// depth first traversal is naturally expressed using recursion, but to avoid /// a call stack size proportional to the size of a regex, we put our stack on /// the heap instead. /// /// This stack thus consists of call frames. The typical call frame is /// `Explore`, which instructs epsilon closure to explore the epsilon /// transitions from that state. 
(Subsequent epsilon transitions are then /// pushed on to the stack as more `Explore` frames.) If the state ID being /// explored has no epsilon transitions, then the capturing group slots are /// copied from the original state that sparked the epsilon closure (from the /// 'step' routine) to the state ID being explored. This way, capturing group /// slots are forwarded from the previous state to the next. /// /// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to /// set the position for a particular slot back to some particular offset. This /// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will /// set the offset of the slot indicated in `Capture` to the current offset, /// and then push the old offset on to the stack as a `RestoreCapture` frame. /// Thus, the new offset is only used until the epsilon closure reverts back to /// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon /// transition its "scope" to only states that come "after" it during depth /// first traversal. #[derive(Clone, Debug)] enum FollowEpsilon { /// Explore the epsilon transitions from a state ID. Explore(StateID), /// Reset the given `slot` to the given `offset` (which might be `None`). RestoreCapture { slot: SmallIndex, offset: Option }, } /// A set of counters that "instruments" a PikeVM search. To enable this, you /// must enable the 'internal-instrument-pikevm' feature. Then run your Rust /// program with RUST_LOG=regex_automata::nfa::thompson::pikevm=trace set in /// the environment. The metrics collected will be dumped automatically for /// every search executed by the PikeVM. /// /// NOTE: When 'internal-instrument-pikevm' is enabled, it will likely cause an /// absolute decrease in wall-clock performance, even if the 'trace' log level /// isn't enabled. (Although, we do try to avoid extra costs when 'trace' isn't /// enabled.) 
The main point of instrumentation is to get counts of various /// events that occur during the PikeVM's execution. /// /// This is a somewhat hacked together collection of metrics that are useful /// to gather from a PikeVM search. In particular, it lets us scrutinize the /// performance profile of a search beyond what general purpose profiling tools /// give us. Namely, we orient the profiling data around the specific states of /// the NFA. /// /// In other words, this lets us see which parts of the NFA graph are most /// frequently activated. This then provides direction for optimization /// opportunities. /// /// The really sad part about this is that it absolutely clutters up the PikeVM /// implementation. :'( Another approach would be to just manually add this /// code in whenever I want this kind of profiling data, but it's complicated /// and tedious enough that I went with this approach... for now. /// /// When instrumentation is enabled (which also turns on 'logging'), then a /// `Counters` is initialized for every search and `trace`'d just before the /// search returns to the caller. /// /// Tip: When debugging performance problems with the PikeVM, it's best to try /// to work with an NFA that is as small as possible. Otherwise the state graph /// is likely to be too big to digest. #[cfg(feature = "internal-instrument-pikevm")] #[derive(Clone, Debug)] struct Counters { /// The number of times the NFA is in a particular permutation of states. state_sets: alloc::collections::BTreeMap, u64>, /// The number of times 'step' is called for a particular state ID (which /// indexes this array). steps: Vec, /// The number of times an epsilon closure was computed for a state. closures: Vec, /// The number of times a particular state ID is pushed on to a stack while /// computing an epsilon closure. stack_pushes: Vec, /// The number of times a particular state ID is inserted into a sparse set /// while computing an epsilon closure. 
set_inserts: Vec, } #[cfg(feature = "internal-instrument-pikevm")] impl Counters { fn empty() -> Counters { Counters { state_sets: alloc::collections::BTreeMap::new(), steps: vec![], closures: vec![], stack_pushes: vec![], set_inserts: vec![], } } fn reset(&mut self, nfa: &NFA) { let len = nfa.states().len(); self.state_sets.clear(); self.steps.clear(); self.steps.resize(len, 0); self.closures.clear(); self.closures.resize(len, 0); self.stack_pushes.clear(); self.stack_pushes.resize(len, 0); self.set_inserts.clear(); self.set_inserts.resize(len, 0); } fn eprint(&self, nfa: &NFA) { trace!("===== START PikeVM Instrumentation Output ====="); // We take the top-K most occurring state sets. Otherwise the output // is likely to be overwhelming. And we probably only care about the // most frequently occurring ones anyway. const LIMIT: usize = 20; let mut set_counts = self.state_sets.iter().collect::, &u64)>>(); set_counts.sort_by_key(|(_, &count)| core::cmp::Reverse(count)); trace!("## PikeVM frequency of state sets (top {})", LIMIT); for (set, count) in set_counts.iter().take(LIMIT) { trace!("{:?}: {}", set, count); } if set_counts.len() > LIMIT { trace!( "... 
{} sets omitted (out of {} total)", set_counts.len() - LIMIT, set_counts.len(), ); } trace!(""); trace!("## PikeVM total frequency of events"); trace!( "steps: {}, closures: {}, stack-pushes: {}, set-inserts: {}", self.steps.iter().copied().sum::(), self.closures.iter().copied().sum::(), self.stack_pushes.iter().copied().sum::(), self.set_inserts.iter().copied().sum::(), ); trace!(""); trace!("## PikeVM frequency of events broken down by state"); for sid in 0..self.steps.len() { trace!( "{:06}: steps: {}, closures: {}, \ stack-pushes: {}, set-inserts: {}", sid, self.steps[sid], self.closures[sid], self.stack_pushes[sid], self.set_inserts[sid], ); } trace!(""); trace!("## NFA debug display"); trace!("{:?}", nfa); trace!("===== END PikeVM Instrumentation Output ====="); } fn record_state_set(&mut self, set: &SparseSet) { let set = set.iter().collect::>(); *self.state_sets.entry(set).or_insert(0) += 1; } fn record_step(&mut self, sid: StateID) { self.steps[sid] += 1; } fn record_closure(&mut self, sid: StateID) { self.closures[sid] += 1; } fn record_stack_push(&mut self, sid: StateID) { self.stack_pushes[sid] += 1; } fn record_set_insert(&mut self, sid: StateID) { self.set_inserts[sid] += 1; } } regex-automata-0.4.9/src/nfa/thompson/range_trie.rs000064400000000000000000001215571046102023000204630ustar 00000000000000/* I've called the primary data structure in this module a "range trie." As far as I can tell, there is no prior art on a data structure like this, however, it's likely someone somewhere has built something like it. Searching for "range trie" turns up the paper "Range Tries for Scalable Address Lookup," but it does not appear relevant. The range trie is just like a trie in that it is a special case of a deterministic finite state machine. It has states and each state has a set of transitions to other states. It is acyclic, and, like a normal trie, it makes no attempt to reuse common suffixes among its elements. 
The key difference between a normal trie and a range trie below is that a
range trie operates on *contiguous sequences* of bytes instead of singleton
bytes. One could say that our alphabet is ranges of bytes instead of bytes
themselves, except a key part of range trie construction is splitting ranges
apart to ensure there is at most one transition that can be taken for any byte
in a given state.

I've tried to explain the details of how the range trie works below, so for
now, we are left with trying to understand what problem we're trying to solve.
Which is itself fairly involved!

At the highest level, here's what we want to do. We want to convert a sequence
of Unicode codepoints into a finite state machine whose transitions are over
*bytes* and *not* Unicode codepoints. We want this because it makes said finite
state machines much smaller and much faster to execute. As a simple example,
consider a byte oriented automaton for all Unicode scalar values (0x00 through
0x10FFFF, not including surrogate codepoints):

  [00-7F]
  [C2-DF][80-BF]
  [E0-E0][A0-BF][80-BF]
  [E1-EC][80-BF][80-BF]
  [ED-ED][80-9F][80-BF]
  [EE-EF][80-BF][80-BF]
  [F0-F0][90-BF][80-BF][80-BF]
  [F1-F3][80-BF][80-BF][80-BF]
  [F4-F4][80-8F][80-BF][80-BF]

(These byte ranges are generated via the regex-syntax::utf8 module, which was
based on Russ Cox's code in RE2, which was in turn based on Ken Thompson's
implementation of the same idea in his Plan9 implementation of grep.)

It should be fairly straight-forward to see how one could compile this into
a DFA. The sequences are sorted and non-overlapping. Essentially, you could
build a trie from this fairly easily. The problem comes when your initial range
(in this case, 0x00-0x10FFFF) isn't so nice. For example, the class represented
by '\w' contains only a tenth of the codepoints that 0x00-0x10FFFF contains,
but if we were to write out the byte based ranges as we did above, the list
would stretch to 892 entries!
This turns into quite a large NFA with a few thousand states. Turning this
beast into a DFA takes quite a bit of time. We are thus left with trying to
trim down the number of states we produce as early as possible.

One approach (used by RE2 and still by the regex crate, at time of writing) is
to try to find common suffixes while building NFA states for the above and
reuse them. This is very cheap to do and one can control precisely how much
extra memory you want to use for the cache.

Another approach, however, is to reuse an algorithm for constructing a
*minimal* DFA from a sorted sequence of inputs. I don't want to go into the
full details here, but I explain it in more depth in my blog post on FSTs[1].
Note that the algorithm was not invented by me, but was published in a paper
by Daciuk et al. in 2000 called "Incremental Construction of Minimal Acyclic
Finite-State Automata"[2]. Like the suffix cache approach above, it is also
possible to control the amount of extra memory one uses, although this usually
comes with the cost of sacrificing true minimality. (But it's typically close
enough with a reasonably sized cache of states.)

The catch is that Daciuk's algorithm only works if you add your keys in
lexicographic ascending order. In our case, since we're dealing with ranges, we
also need the additional requirement that ranges are either equivalent or do
not overlap at all. For example, if one were given the following byte ranges:

  [BC-BF][80-BF]
  [BC-BF][90-BF]

Then Daciuk's algorithm would not work, since there is nothing to handle the
fact that the ranges overlap. They would need to be split apart. Thankfully,
Thompson's algorithm for producing byte ranges for Unicode codepoint ranges
meets both of our requirements. (A proof for this eludes me, but it appears
true.)

... however, we would also like to be able to compile UTF-8 automata in
reverse.
We want this because in order to find the starting location of a match
using a DFA, we need to run a second DFA---a reversed version of the forward
DFA---backwards to discover the match location. Unfortunately, if we reverse
our byte sequences for 0x00-0x10FFFF, we get sequences that can overlap, even
if they are sorted:

  [00-7F]
  [80-BF][80-9F][ED-ED]
  [80-BF][80-BF][80-8F][F4-F4]
  [80-BF][80-BF][80-BF][F1-F3]
  [80-BF][80-BF][90-BF][F0-F0]
  [80-BF][80-BF][E1-EC]
  [80-BF][80-BF][EE-EF]
  [80-BF][A0-BF][E0-E0]
  [80-BF][C2-DF]

For example, '[80-BF][80-BF][EE-EF]' and '[80-BF][A0-BF][E0-E0]' have
overlapping ranges between '[80-BF]' and '[A0-BF]'. Thus, there is no
simple way to apply Daciuk's algorithm.

And thus, the range trie was born. The range trie's only purpose is to take
sequences of byte ranges like the ones above, collect them into a trie and then
spit them out in a sorted fashion with no overlapping ranges. For example,
0x00-0x10FFFF gets translated to:

  [0-7F]
  [80-BF][80-9F][80-8F][F1-F3]
  [80-BF][80-9F][80-8F][F4]
  [80-BF][80-9F][90-BF][F0]
  [80-BF][80-9F][90-BF][F1-F3]
  [80-BF][80-9F][E1-EC]
  [80-BF][80-9F][ED]
  [80-BF][80-9F][EE-EF]
  [80-BF][A0-BF][80-8F][F1-F3]
  [80-BF][A0-BF][80-8F][F4]
  [80-BF][A0-BF][90-BF][F0]
  [80-BF][A0-BF][90-BF][F1-F3]
  [80-BF][A0-BF][E0]
  [80-BF][A0-BF][E1-EC]
  [80-BF][A0-BF][EE-EF]
  [80-BF][C2-DF]

We've thus satisfied our requirements for running Daciuk's algorithm. All
sequences of ranges are sorted, and any corresponding ranges are either
exactly equivalent or non-overlapping.

In effect, a range trie is building a DFA from a sequence of arbitrary byte
ranges. But it uses an algorithm custom tailored to its input, so it is not as
costly as traditional DFA construction. While it is still quite a bit more
costly than the forward case (which only needs Daciuk's algorithm), it winds
up saving a substantial amount of time if one is doing a full DFA powerset
construction later by virtue of producing a much much smaller NFA.
[1] - https://blog.burntsushi.net/transducers/ [2] - https://www.mitpressjournals.org/doi/pdfplus/10.1162/089120100561601 */ use core::{cell::RefCell, fmt, mem, ops::RangeInclusive}; use alloc::{format, string::String, vec, vec::Vec}; use regex_syntax::utf8::Utf8Range; use crate::util::primitives::StateID; /// There is only one final state in this trie. Every sequence of byte ranges /// added shares the same final state. const FINAL: StateID = StateID::ZERO; /// The root state of the trie. const ROOT: StateID = StateID::new_unchecked(1); /// A range trie represents an ordered set of sequences of bytes. /// /// A range trie accepts as input a sequence of byte ranges and merges /// them into the existing set such that the trie can produce a sorted /// non-overlapping sequence of byte ranges. The sequence emitted corresponds /// precisely to the sequence of bytes matched by the given keys, although the /// byte ranges themselves may be split at different boundaries. /// /// The order complexity of this data structure seems difficult to analyze. /// If the size of a byte is held as a constant, then insertion is clearly /// O(n) where n is the number of byte ranges in the input key. However, if /// k=256 is our alphabet size, then insertion could be O(k^2 * n). In /// particular it seems possible for pathological inputs to cause insertion /// to do a lot of work. However, for what we use this data structure for, /// there should be no pathological inputs since the ultimate source is always /// a sorted set of Unicode scalar value ranges. /// /// Internally, this trie is setup like a finite state machine. Note though /// that it is acyclic. #[derive(Clone)] pub struct RangeTrie { /// The states in this trie. The first is always the shared final state. /// The second is always the root state. Otherwise, there is no /// particular order. states: Vec, /// A free-list of states. When a range trie is cleared, all of its states /// are added to this list. 
Creating a new state reuses states from this /// list before allocating a new one. free: Vec, /// A stack for traversing this trie to yield sequences of byte ranges in /// lexicographic order. iter_stack: RefCell>, /// A buffer that stores the current sequence during iteration. iter_ranges: RefCell>, /// A stack used for traversing the trie in order to (deeply) duplicate /// a state. States are recursively duplicated when ranges are split. dupe_stack: Vec, /// A stack used for traversing the trie during insertion of a new /// sequence of byte ranges. insert_stack: Vec, } /// A single state in this trie. #[derive(Clone)] struct State { /// A sorted sequence of non-overlapping transitions to other states. Each /// transition corresponds to a single range of bytes. transitions: Vec, } /// A transition is a single range of bytes. If a particular byte is in this /// range, then the corresponding machine may transition to the state pointed /// to by `next_id`. #[derive(Clone)] struct Transition { /// The byte range. range: Utf8Range, /// The next state to transition to. next_id: StateID, } impl RangeTrie { /// Create a new empty range trie. pub fn new() -> RangeTrie { let mut trie = RangeTrie { states: vec![], free: vec![], iter_stack: RefCell::new(vec![]), iter_ranges: RefCell::new(vec![]), dupe_stack: vec![], insert_stack: vec![], }; trie.clear(); trie } /// Clear this range trie such that it is empty. Clearing a range trie /// and reusing it can beneficial because this may reuse allocations. pub fn clear(&mut self) { self.free.extend(self.states.drain(..)); self.add_empty(); // final self.add_empty(); // root } /// Iterate over all of the sequences of byte ranges in this trie, and /// call the provided function for each sequence. Iteration occurs in /// lexicographic order. 
pub fn iter Result<(), E>>( &self, mut f: F, ) -> Result<(), E> { let mut stack = self.iter_stack.borrow_mut(); stack.clear(); let mut ranges = self.iter_ranges.borrow_mut(); ranges.clear(); // We do iteration in a way that permits us to use a single buffer // for our keys. We iterate in a depth first fashion, while being // careful to expand our frontier as we move deeper in the trie. stack.push(NextIter { state_id: ROOT, tidx: 0 }); while let Some(NextIter { mut state_id, mut tidx }) = stack.pop() { // This could be implemented more simply without an inner loop // here, but at the cost of more stack pushes. loop { let state = self.state(state_id); // If we've visited all transitions in this state, then pop // back to the parent state. if tidx >= state.transitions.len() { ranges.pop(); break; } let t = &state.transitions[tidx]; ranges.push(t.range); if t.next_id == FINAL { f(&ranges)?; ranges.pop(); tidx += 1; } else { // Expand our frontier. Once we come back to this state // via the stack, start in on the next transition. stack.push(NextIter { state_id, tidx: tidx + 1 }); // Otherwise, move to the first transition of the next // state. state_id = t.next_id; tidx = 0; } } } Ok(()) } /// Inserts a new sequence of ranges into this trie. /// /// The sequence given must be non-empty and must not have a length /// exceeding 4. pub fn insert(&mut self, ranges: &[Utf8Range]) { assert!(!ranges.is_empty()); assert!(ranges.len() <= 4); let mut stack = mem::replace(&mut self.insert_stack, vec![]); stack.clear(); stack.push(NextInsert::new(ROOT, ranges)); while let Some(next) = stack.pop() { let (state_id, ranges) = (next.state_id(), next.ranges()); assert!(!ranges.is_empty()); let (mut new, rest) = (ranges[0], &ranges[1..]); // i corresponds to the position of the existing transition on // which we are operating. 
Typically, the result is to remove the // transition and replace it with two or more new transitions // corresponding to the partitions generated by splitting the // 'new' with the ith transition's range. let mut i = self.state(state_id).find(new); // In this case, there is no overlap *and* the new range is greater // than all existing ranges. So we can just add it to the end. if i == self.state(state_id).transitions.len() { let next_id = NextInsert::push(self, &mut stack, rest); self.add_transition(state_id, new, next_id); continue; } // The need for this loop is a bit subtle, buf basically, after // we've handled the partitions from our initial split, it's // possible that there will be a partition leftover that overlaps // with a subsequent transition. If so, then we have to repeat // the split process again with the leftovers and that subsequent // transition. 'OUTER: loop { let old = self.state(state_id).transitions[i].clone(); let split = match Split::new(old.range, new) { Some(split) => split, None => { let next_id = NextInsert::push(self, &mut stack, rest); self.add_transition_at(i, state_id, new, next_id); continue; } }; let splits = split.as_slice(); // If we only have one partition, then the ranges must be // equivalent. There's nothing to do here for this state, so // just move on to the next one. if splits.len() == 1 { // ... but only if we have anything left to do. if !rest.is_empty() { stack.push(NextInsert::new(old.next_id, rest)); } break; } // At this point, we know that 'split' is non-empty and there // must be some overlap AND that the two ranges are not // equivalent. Therefore, the existing range MUST be removed // and split up somehow. Instead of actually doing the removal // and then a subsequent insertion---with all the memory // shuffling that entails---we simply overwrite the transition // at position `i` for the first new transition we want to // insert. After that, we're forced to do expensive inserts. 
let mut first = true; let mut add_trans = |trie: &mut RangeTrie, pos, from, range, to| { if first { trie.set_transition_at(pos, from, range, to); first = false; } else { trie.add_transition_at(pos, from, range, to); } }; for (j, &srange) in splits.iter().enumerate() { match srange { SplitRange::Old(r) => { // Deep clone the state pointed to by the ith // transition. This is always necessary since 'old' // is always coupled with at least a 'both' // partition. We don't want any new changes made // via the 'both' partition to impact the part of // the transition that doesn't overlap with the // new range. let dup_id = self.duplicate(old.next_id); add_trans(self, i, state_id, r, dup_id); } SplitRange::New(r) => { // This is a bit subtle, but if this happens to be // the last partition in our split, it is possible // that this overlaps with a subsequent transition. // If it does, then we must repeat the whole // splitting process over again with `r` and the // subsequent transition. { let trans = &self.state(state_id).transitions; if j + 1 == splits.len() && i < trans.len() && intersects(r, trans[i].range) { new = r; continue 'OUTER; } } // ... otherwise, setup exploration for a new // empty state and add a brand new transition for // this new range. let next_id = NextInsert::push(self, &mut stack, rest); add_trans(self, i, state_id, r, next_id); } SplitRange::Both(r) => { // Continue adding the remaining ranges on this // path and update the transition with the new // range. if !rest.is_empty() { stack.push(NextInsert::new(old.next_id, rest)); } add_trans(self, i, state_id, r, old.next_id); } } i += 1; } // If we've reached this point, then we know that there are // no subsequent transitions with any overlap. Therefore, we // can stop processing this range and move on to the next one. 
break; } } self.insert_stack = stack; } pub fn add_empty(&mut self) -> StateID { let id = match StateID::try_from(self.states.len()) { Ok(id) => id, Err(_) => { // This generally should not happen since a range trie is // only ever used to compile a single sequence of Unicode // scalar values. If we ever got to this point, we would, at // *minimum*, be using 96GB in just the range trie alone. panic!("too many sequences added to range trie"); } }; // If we have some free states available, then use them to avoid // more allocations. if let Some(mut state) = self.free.pop() { state.clear(); self.states.push(state); } else { self.states.push(State { transitions: vec![] }); } id } /// Performs a deep clone of the given state and returns the duplicate's /// state ID. /// /// A "deep clone" in this context means that the state given along with /// recursively all states that it points to are copied. Once complete, /// the given state ID and the returned state ID share nothing. /// /// This is useful during range trie insertion when a new range overlaps /// with an existing range that is bigger than the new one. The part /// of the existing range that does *not* overlap with the new one is /// duplicated so that adding the new range to the overlap doesn't disturb /// the non-overlapping portion. /// /// There's one exception: if old_id is the final state, then it is not /// duplicated and the same final state is returned. This is because all /// final states in this trie are equivalent. fn duplicate(&mut self, old_id: StateID) -> StateID { if old_id == FINAL { return FINAL; } let mut stack = mem::replace(&mut self.dupe_stack, vec![]); stack.clear(); let new_id = self.add_empty(); // old_id is the state we're cloning and new_id is the ID of the // duplicated state for old_id. 
stack.push(NextDupe { old_id, new_id }); while let Some(NextDupe { old_id, new_id }) = stack.pop() { for i in 0..self.state(old_id).transitions.len() { let t = self.state(old_id).transitions[i].clone(); if t.next_id == FINAL { // All final states are the same, so there's no need to // duplicate it. self.add_transition(new_id, t.range, FINAL); continue; } let new_child_id = self.add_empty(); self.add_transition(new_id, t.range, new_child_id); stack.push(NextDupe { old_id: t.next_id, new_id: new_child_id, }); } } self.dupe_stack = stack; new_id } /// Adds the given transition to the given state. /// /// Callers must ensure that all previous transitions in this state /// are lexicographically smaller than the given range. fn add_transition( &mut self, from_id: StateID, range: Utf8Range, next_id: StateID, ) { self.state_mut(from_id) .transitions .push(Transition { range, next_id }); } /// Like `add_transition`, except this inserts the transition just before /// the ith transition. fn add_transition_at( &mut self, i: usize, from_id: StateID, range: Utf8Range, next_id: StateID, ) { self.state_mut(from_id) .transitions .insert(i, Transition { range, next_id }); } /// Overwrites the transition at position i with the given transition. fn set_transition_at( &mut self, i: usize, from_id: StateID, range: Utf8Range, next_id: StateID, ) { self.state_mut(from_id).transitions[i] = Transition { range, next_id }; } /// Return an immutable borrow for the state with the given ID. fn state(&self, id: StateID) -> &State { &self.states[id] } /// Return a mutable borrow for the state with the given ID. fn state_mut(&mut self, id: StateID) -> &mut State { &mut self.states[id] } } impl State { /// Find the position at which the given range should be inserted in this /// state. /// /// The position returned is always in the inclusive range /// [0, transitions.len()]. 
If 'transitions.len()' is returned, then the /// given range overlaps with no other range in this state *and* is greater /// than all of them. /// /// For all other possible positions, the given range either overlaps /// with the transition at that position or is otherwise less than it /// with no overlap (and is greater than the previous transition). In the /// former case, careful attention must be paid to inserting this range /// as a new transition. In the latter case, the range can be inserted as /// a new transition at the given position without disrupting any other /// transitions. fn find(&self, range: Utf8Range) -> usize { /// Returns the position `i` at which `pred(xs[i])` first returns true /// such that for all `j >= i`, `pred(xs[j]) == true`. If `pred` never /// returns true, then `xs.len()` is returned. /// /// We roll our own binary search because it doesn't seem like the /// standard library's binary search can be used here. Namely, if /// there is an overlapping range, then we want to find the first such /// occurrence, but there may be many. Or at least, it's not quite /// clear to me how to do it. fn binary_search(xs: &[T], mut pred: F) -> usize where F: FnMut(&T) -> bool, { let (mut left, mut right) = (0, xs.len()); while left < right { // Overflow is impossible because xs.len() <= 256. let mid = (left + right) / 2; if pred(&xs[mid]) { right = mid; } else { left = mid + 1; } } left } // Benchmarks suggest that binary search is just a bit faster than // straight linear search. Specifically when using the debug tool: // // hyperfine "regex-cli debug thompson -qr --captures none '\w{90} ecurB'" binary_search(&self.transitions, |t| range.start <= t.range.end) } /// Clear this state such that it has zero transitions. fn clear(&mut self) { self.transitions.clear(); } } /// The next state to process during duplication. #[derive(Clone, Debug)] struct NextDupe { /// The state we want to duplicate. 
old_id: StateID, /// The ID of the new state that is a duplicate of old_id. new_id: StateID, } /// The next state (and its corresponding transition) that we want to visit /// during iteration in lexicographic order. #[derive(Clone, Debug)] struct NextIter { state_id: StateID, tidx: usize, } /// The next state to process during insertion and any remaining ranges that we /// want to add for a particular sequence of ranges. The first such instance /// is always the root state along with all ranges given. #[derive(Clone, Debug)] struct NextInsert { /// The next state to begin inserting ranges. This state should be the /// state at which `ranges[0]` should be inserted. state_id: StateID, /// The ranges to insert. We used a fixed-size array here to avoid an /// allocation. ranges: [Utf8Range; 4], /// The number of valid ranges in the above array. len: u8, } impl NextInsert { /// Create the next item to visit. The given state ID should correspond /// to the state at which the first range in the given slice should be /// inserted. The slice given must not be empty and it must be no longer /// than 4. fn new(state_id: StateID, ranges: &[Utf8Range]) -> NextInsert { let len = ranges.len(); assert!(len > 0); assert!(len <= 4); let mut tmp = [Utf8Range { start: 0, end: 0 }; 4]; tmp[..len].copy_from_slice(ranges); NextInsert { state_id, ranges: tmp, len: u8::try_from(len).unwrap() } } /// Push a new empty state to visit along with any remaining ranges that /// still need to be inserted. The ID of the new empty state is returned. /// /// If ranges is empty, then no new state is created and FINAL is returned. fn push( trie: &mut RangeTrie, stack: &mut Vec, ranges: &[Utf8Range], ) -> StateID { if ranges.is_empty() { FINAL } else { let next_id = trie.add_empty(); stack.push(NextInsert::new(next_id, ranges)); next_id } } /// Return the ID of the state to visit. fn state_id(&self) -> StateID { self.state_id } /// Return the remaining ranges to insert. 
fn ranges(&self) -> &[Utf8Range] {
        // `len` is a `u8`, so widening it to `usize` can never fail.
        &self.ranges[..usize::from(self.len)]
    }
}

/// Split represents a partitioning of two ranges into one or more ranges. This
/// is the secret sauce that makes a range trie work, as it's what tells us
/// how to deal with two overlapping but unequal ranges during insertion.
///
/// Essentially, either two ranges overlap or they don't. If they don't, then
/// handling insertion is easy: just insert the new range into its
/// lexicographically correct position. Since it does not overlap with anything
/// else, no other transitions are impacted by the new range.
///
/// If they do overlap though, there are generally three possible cases to
/// handle:
///
/// 1. The part where the two ranges actually overlap. i.e., The intersection.
/// 2. The part of the existing range that is not in the new range.
/// 3. The part of the new range that is not in the old range.
///
/// (1) is guaranteed to always occur since all overlapping ranges have a
/// non-empty intersection. If the two ranges are not equivalent, then at
/// least one of (2) or (3) is guaranteed to occur as well. In some cases,
/// e.g., `[0-4]` and `[4-9]`, all three cases will occur.
///
/// This `Split` type is responsible for providing (1), (2) and (3) for any
/// possible pair of byte ranges.
///
/// As for insertion, for the overlap in (1), the remaining ranges to insert
/// should be added by following the corresponding transition. However, this
/// should only be done for the overlapping parts of the range. If there was
/// a part of the existing range that was not in the new range, then that
/// existing part must be split off from the transition and duplicated. The
/// remaining parts of the overlap can then be added to using the new ranges
/// without disturbing the existing range.
///
/// Handling the case for the part of a new range that is not in an existing
/// range is seemingly easy. Just treat it as if it were a non-overlapping
/// range.
/// The problem here is that if this new non-overlapping range occurs
/// after both (1) and (2), then it's possible that it can overlap with the
/// next transition in the current state. If it does, then the whole process
/// must be repeated!
///
/// # Details of the 3 cases
///
/// The following details the various cases that are implemented in code
/// below. It's plausible that the number of cases is not actually minimal,
/// but it's important for this code to remain at least somewhat readable.
///
/// Given [a,b] and [x,y], where a <= b, x <= y, b < 256 and y < 256, we define
/// the following distinct relationships where at least one must apply. The
/// order of these matters, since multiple can match. The first to match
/// applies.
///
///   1. b < x <=> [a,b] < [x,y]
///   2. y < a <=> [x,y] < [a,b]
///
/// In the case of (1) and (2), these are the only cases where there is no
/// overlap. Or otherwise, the intersection of [a,b] and [x,y] is empty. In
/// order to compute the intersection, one can do [max(a,x), min(b,y)]. The
/// intersection in all of the following cases is non-empty.
///
///    3. a = x && b = y <=> [a,b] == [x,y]
///    4. a = x && b < y <=> [x,y] right-extends [a,b]
///    5. b = y && a > x <=> [x,y] left-extends [a,b]
///    6. x = a && y < b <=> [a,b] right-extends [x,y]
///    7. y = b && x > a <=> [a,b] left-extends [x,y]
///    8. a > x && b < y <=> [x,y] covers [a,b]
///    9. x > a && y < b <=> [a,b] covers [x,y]
///   10. b = x && a < y <=> [a,b] is left-adjacent to [x,y]
///   11. y = a && x < b <=> [x,y] is left-adjacent to [a,b]
///   12. b > x && b < y <=> [a,b] left-overlaps [x,y]
///   13. y > a && y < b <=> [x,y] left-overlaps [a,b]
///
/// In cases 3-13, we can form rules that partition the ranges into a
/// non-overlapping ordered sequence of ranges:
///
///    3. [a,b]
///    4. [a,b], [b+1,y]
///    5. [x,a-1], [a,b]
///    6. [x,y], [y+1,b]
///    7. [a,x-1], [x,y]
///    8. [x,a-1], [a,b], [b+1,y]
///    9. [a,x-1], [x,y], [y+1,b]
///   10. [a,b-1], [b,b], [b+1,y]
///   11.
///       [x,y-1], [y,y], [y+1,b]
///   12. [a,x-1], [x,b], [b+1,y]
///   13. [x,a-1], [a,y], [y+1,b]
///
/// In the code below, we go a step further and identify each of the above
/// outputs as belonging either to the overlap of the two ranges or to one
/// of [a,b] or [x,y] exclusively.
#[derive(Clone, Debug, Eq, PartialEq)]
struct Split {
    /// The partitions, in ascending order. Only the first `len` entries are
    /// meaningful.
    partitions: [SplitRange; 3],
    /// The number of valid entries in `partitions`.
    len: usize,
}

/// A tagged range indicating how it was derived from a pair of ranges.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum SplitRange {
    /// A range that is only in the old (existing) range.
    Old(Utf8Range),
    /// A range that is only in the new range.
    New(Utf8Range),
    /// A range that is in both the old and the new range.
    Both(Utf8Range),
}

impl Split {
    /// Create a partitioning of the given ranges.
    ///
    /// Here, `o` is the old (existing) range and `n` is the new range.
    ///
    /// If the given ranges have an empty intersection, then None is returned.
    fn new(o: Utf8Range, n: Utf8Range) -> Option<Split> {
        let range = |r: RangeInclusive<u8>| Utf8Range {
            start: *r.start(),
            end: *r.end(),
        };
        let old = |r| SplitRange::Old(range(r));
        let new = |r| SplitRange::New(range(r));
        let both = |r| SplitRange::Both(range(r));

        // Use same names as the comment above to make it easier to compare.
        let (a, b, x, y) = (o.start, o.end, n.start, n.end);

        if b < x || y < a {
            // case 1, case 2
            None
        } else if a == x && b == y {
            // case 3
            Some(Split::parts1(both(a..=b)))
        } else if a == x && b < y {
            // case 4
            Some(Split::parts2(both(a..=b), new(b + 1..=y)))
        } else if b == y && a > x {
            // case 5
            Some(Split::parts2(new(x..=a - 1), both(a..=b)))
        } else if x == a && y < b {
            // case 6
            Some(Split::parts2(both(x..=y), old(y + 1..=b)))
        } else if y == b && x > a {
            // case 7
            Some(Split::parts2(old(a..=x - 1), both(x..=y)))
        } else if a > x && b < y {
            // case 8
            Some(Split::parts3(new(x..=a - 1), both(a..=b), new(b + 1..=y)))
        } else if x > a && y < b {
            // case 9
            Some(Split::parts3(old(a..=x - 1), both(x..=y), old(y + 1..=b)))
        } else if b == x && a < y {
            // case 10
            Some(Split::parts3(old(a..=b - 1), both(b..=b), new(b + 1..=y)))
        } else if y == a && x < b {
            // case 11
            Some(Split::parts3(new(x..=y - 1), both(y..=y), old(y + 1..=b)))
        } else if b > x && b < y {
            // case 12
            Some(Split::parts3(old(a..=x - 1), both(x..=b), new(b + 1..=y)))
        } else if y > a && y < b {
            // case 13
            Some(Split::parts3(new(x..=a - 1), both(a..=y), old(y + 1..=b)))
        } else {
            unreachable!()
        }
    }

    /// Create a new split with a single partition. This only occurs when two
    /// ranges are equivalent.
    fn parts1(r1: SplitRange) -> Split {
        // This value doesn't matter since it is never accessed.
        let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
        Split { partitions: [r1, nada, nada], len: 1 }
    }

    /// Create a new split with two partitions.
    fn parts2(r1: SplitRange, r2: SplitRange) -> Split {
        // This value doesn't matter since it is never accessed.
        let nada = SplitRange::Old(Utf8Range { start: 0, end: 0 });
        Split { partitions: [r1, r2, nada], len: 2 }
    }

    /// Create a new split with three partitions.
    fn parts3(r1: SplitRange, r2: SplitRange, r3: SplitRange) -> Split {
        Split { partitions: [r1, r2, r3], len: 3 }
    }

    /// Return the partitions in this split as a slice.
fn as_slice(&self) -> &[SplitRange] {
        // Only the first `len` partitions are meaningful; the remainder is
        // filler.
        &self.partitions[..self.len]
    }
}

impl fmt::Debug for RangeTrie {
    /// Writes one state per line, marking the final state with `*`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        writeln!(f, "")?;
        for (i, state) in self.states.iter().enumerate() {
            let status = if i == FINAL.as_usize() { '*' } else { ' ' };
            writeln!(f, "{}{:06}: {:?}", status, i, state)?;
        }
        Ok(())
    }
}

impl fmt::Debug for State {
    /// Writes the transitions of this state as a comma separated list.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let rs = self
            .transitions
            .iter()
            .map(|t| format!("{:?}", t))
            .collect::<Vec<String>>()
            .join(", ");
        write!(f, "{}", rs)
    }
}

impl fmt::Debug for Transition {
    /// Writes `XX => ID` for a single byte range and `XX-YY => ID` otherwise,
    /// with all values in hexadecimal.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        if self.range.start == self.range.end {
            write!(
                f,
                "{:02X} => {:02X}",
                self.range.start,
                self.next_id.as_usize(),
            )
        } else {
            write!(
                f,
                "{:02X}-{:02X} => {:02X}",
                self.range.start,
                self.range.end,
                self.next_id.as_usize(),
            )
        }
    }
}

/// Returns true if and only if the given ranges intersect.
fn intersects(r1: Utf8Range, r2: Utf8Range) -> bool {
    !(r1.end < r2.start || r2.end < r1.start)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn r(range: RangeInclusive<u8>) -> Utf8Range {
        Utf8Range { start: *range.start(), end: *range.end() }
    }

    fn split_maybe(
        old: RangeInclusive<u8>,
        new: RangeInclusive<u8>,
    ) -> Option<Split> {
        Split::new(r(old), r(new))
    }

    fn split(
        old: RangeInclusive<u8>,
        new: RangeInclusive<u8>,
    ) -> Vec<SplitRange> {
        split_maybe(old, new).unwrap().as_slice().to_vec()
    }

    #[test]
    fn no_splits() {
        // case 1
        assert_eq!(None, split_maybe(0..=1, 2..=3));
        // case 2
        assert_eq!(None, split_maybe(2..=3, 0..=1));
    }

    #[test]
    fn splits() {
        let range = |r: RangeInclusive<u8>| Utf8Range {
            start: *r.start(),
            end: *r.end(),
        };
        let old = |r| SplitRange::Old(range(r));
        let new = |r| SplitRange::New(range(r));
        let both = |r| SplitRange::Both(range(r));

        // case 3
        assert_eq!(split(0..=0, 0..=0), vec![both(0..=0)]);
        assert_eq!(split(9..=9, 9..=9), vec![both(9..=9)]);

        // case 4
        assert_eq!(split(0..=5, 0..=6), vec![both(0..=5), new(6..=6)]);
        assert_eq!(split(0..=5, 0..=8), vec![both(0..=5), new(6..=8)]);
        assert_eq!(split(5..=5, 5..=8), vec![both(5..=5), new(6..=8)]);

        // case 5
        assert_eq!(split(1..=5, 0..=5), vec![new(0..=0), both(1..=5)]);
        assert_eq!(split(3..=5, 0..=5), vec![new(0..=2), both(3..=5)]);
        assert_eq!(split(5..=5, 0..=5), vec![new(0..=4), both(5..=5)]);

        // case 6
        assert_eq!(split(0..=6, 0..=5), vec![both(0..=5), old(6..=6)]);
        assert_eq!(split(0..=8, 0..=5), vec![both(0..=5), old(6..=8)]);
        assert_eq!(split(5..=8, 5..=5), vec![both(5..=5), old(6..=8)]);

        // case 7
        assert_eq!(split(0..=5, 1..=5), vec![old(0..=0), both(1..=5)]);
        assert_eq!(split(0..=5, 3..=5), vec![old(0..=2), both(3..=5)]);
        assert_eq!(split(0..=5, 5..=5), vec![old(0..=4), both(5..=5)]);

        // case 8
        assert_eq!(
            split(3..=6, 2..=7),
            vec![new(2..=2), both(3..=6), new(7..=7)],
        );
        assert_eq!(
            split(3..=6, 1..=8),
            vec![new(1..=2), both(3..=6), new(7..=8)],
        );

        // case 9
        assert_eq!(
            split(2..=7, 3..=6),
            vec![old(2..=2), both(3..=6), old(7..=7)],
        );
        assert_eq!(
            split(1..=8, 3..=6),
            vec![old(1..=2), both(3..=6), old(7..=8)],
        );

        // case 10
        assert_eq!(
            split(3..=6, 6..=7),
            vec![old(3..=5), both(6..=6), new(7..=7)],
        );
        assert_eq!(
            split(3..=6, 6..=8),
            vec![old(3..=5), both(6..=6), new(7..=8)],
        );
        assert_eq!(
            split(5..=6, 6..=7),
            vec![old(5..=5), both(6..=6), new(7..=7)],
        );

        // case 11
        assert_eq!(
            split(6..=7, 3..=6),
            vec![new(3..=5), both(6..=6), old(7..=7)],
        );
        assert_eq!(
            split(6..=8, 3..=6),
            vec![new(3..=5), both(6..=6), old(7..=8)],
        );
        assert_eq!(
            split(6..=7, 5..=6),
            vec![new(5..=5), both(6..=6), old(7..=7)],
        );

        // case 12
        assert_eq!(
            split(3..=7, 5..=9),
            vec![old(3..=4), both(5..=7), new(8..=9)],
        );
        assert_eq!(
            split(3..=5, 4..=6),
            vec![old(3..=3), both(4..=5), new(6..=6)],
        );

        // case 13
        assert_eq!(
            split(5..=9, 3..=7),
            vec![new(3..=4), both(5..=7), old(8..=9)],
        );
        assert_eq!(
            split(4..=6, 3..=5),
            vec![new(3..=3), both(4..=5), old(6..=6)],
        );
    }

    // Arguably there should be more tests here, but in practice, this data
    // structure is well covered by the huge number of regex tests.
} regex-automata-0.4.9/src/util/alphabet.rs000064400000000000000000001174011046102023000164570ustar 00000000000000/*! This module provides APIs for dealing with the alphabets of finite state machines. There are two principal types in this module, [`ByteClasses`] and [`Unit`]. The former defines the alphabet of a finite state machine while the latter represents an element of that alphabet. To a first approximation, the alphabet of all automata in this crate is just a `u8`. Namely, every distinct byte value. All 256 of them. In practice, this can be quite wasteful when building a transition table for a DFA, since it requires storing a state identifier for each element in the alphabet. Instead, we collapse the alphabet of an automaton down into equivalence classes, where every byte in the same equivalence class never discriminates between a match or a non-match from any other byte in the same class. For example, in the regex `[a-z]+`, then you could consider it having an alphabet consisting of two equivalence classes: `a-z` and everything else. In terms of the transitions on an automaton, it doesn't actually require representing every distinct byte. Just the equivalence classes. The downside of equivalence classes is that, of course, searching a haystack deals with individual byte values. Those byte values need to be mapped to their corresponding equivalence class. This is what `ByteClasses` does. In practice, doing this for every state transition has negligible impact on modern CPUs. Moreover, it helps make more efficient use of the CPU cache by (possibly considerably) shrinking the size of the transition table. One last hiccup concerns `Unit`. Namely, because of look-around and how the DFAs in this crate work, we need to add a sentinel value to our alphabet of equivalence classes that represents the "end" of a search. We call that sentinel [`Unit::eoi`] or "end of input." 
Thus, a `Unit` is either an equivalence class corresponding to a set of bytes, or it is a special "end of input" sentinel. In general, you should not expect to need either of these types unless you're doing lower level shenanigans with DFAs, or even building your own DFAs. (Although, you don't have to use these types to build your own DFAs of course.) For example, if you're walking a DFA's state graph, it's probably useful to make use of [`ByteClasses`] to visit each element in the DFA's alphabet instead of just visiting every distinct `u8` value. The latter isn't necessarily wrong, but it could be potentially very wasteful. */ use crate::util::{ escape::DebugByte, wire::{self, DeserializeError, SerializeError}, }; /// Unit represents a single unit of haystack for DFA based regex engines. /// /// It is not expected for consumers of this crate to need to use this type /// unless they are implementing their own DFA. And even then, it's not /// required: implementors may use other techniques to handle haystack units. /// /// Typically, a single unit of haystack for a DFA would be a single byte. /// However, for the DFAs in this crate, matches are delayed by a single byte /// in order to handle look-ahead assertions (`\b`, `$` and `\z`). Thus, once /// we have consumed the haystack, we must run the DFA through one additional /// transition using a unit that indicates the haystack has ended. /// /// There is no way to represent a sentinel with a `u8` since all possible /// values *may* be valid haystack units to a DFA, therefore this type /// explicitly adds room for a sentinel value. /// /// The sentinel EOI value is always its own equivalence class and is /// ultimately represented by adding 1 to the maximum equivalence class value. 
/// So for example, the regex `^[a-z]+$` might be split into the following
/// equivalence classes:
///
/// ```text
/// 0 => [\x00-`]
/// 1 => [a-z]
/// 2 => [{-\xFF]
/// 3 => [EOI]
/// ```
///
/// Where EOI is the special sentinel value that is always in its own
/// singleton equivalence class.
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
pub struct Unit(UnitKind);

/// The internal representation of a [`Unit`]: either a byte (or byte
/// equivalence class) or the "end of input" sentinel.
#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
enum UnitKind {
    /// Represents a byte value, or more typically, an equivalence class
    /// represented as a byte value.
    U8(u8),
    /// Represents the "end of input" sentinel. We regrettably use a `u16`
    /// here since the maximum sentinel value is `256`. Thankfully, we don't
    /// actually store a `Unit` anywhere, so this extra space shouldn't be too
    /// bad.
    EOI(u16),
}

impl Unit {
    /// Create a new haystack unit from a byte value.
    ///
    /// All possible byte values are legal. However, when creating a haystack
    /// unit for a specific DFA, one should be careful to only construct units
    /// that are in that DFA's alphabet. Namely, one way to compact a DFA's
    /// in-memory representation is to collapse the set of all possible byte
    /// values into a smaller set of equivalence classes. If a DFA
    /// uses equivalence classes instead of byte values, then the byte given
    /// here should be the equivalence class.
    pub fn u8(byte: u8) -> Unit {
        Unit(UnitKind::U8(byte))
    }

    /// Create a new "end of input" haystack unit.
    ///
    /// The value given is the sentinel value used by this unit to represent
    /// the "end of input." The value should be the total number of equivalence
    /// classes in the corresponding alphabet. Its maximum value is `256`,
    /// which occurs when every byte is its own equivalence class.
    ///
    /// # Panics
    ///
    /// This panics when `num_byte_equiv_classes` is greater than `256`.
pub fn eoi(num_byte_equiv_classes: usize) -> Unit { assert!( num_byte_equiv_classes <= 256, "max number of byte-based equivalent classes is 256, but got {}", num_byte_equiv_classes, ); Unit(UnitKind::EOI(u16::try_from(num_byte_equiv_classes).unwrap())) } /// If this unit is not an "end of input" sentinel, then returns its /// underlying byte value. Otherwise return `None`. pub fn as_u8(self) -> Option { match self.0 { UnitKind::U8(b) => Some(b), UnitKind::EOI(_) => None, } } /// If this unit is an "end of input" sentinel, then return the underlying /// sentinel value that was given to [`Unit::eoi`]. Otherwise return /// `None`. pub fn as_eoi(self) -> Option { match self.0 { UnitKind::U8(_) => None, UnitKind::EOI(sentinel) => Some(sentinel), } } /// Return this unit as a `usize`, regardless of whether it is a byte value /// or an "end of input" sentinel. In the latter case, the underlying /// sentinel value given to [`Unit::eoi`] is returned. pub fn as_usize(self) -> usize { match self.0 { UnitKind::U8(b) => usize::from(b), UnitKind::EOI(eoi) => usize::from(eoi), } } /// Returns true if and only of this unit is a byte value equivalent to the /// byte given. This always returns false when this is an "end of input" /// sentinel. pub fn is_byte(self, byte: u8) -> bool { self.as_u8().map_or(false, |b| b == byte) } /// Returns true when this unit represents an "end of input" sentinel. pub fn is_eoi(self) -> bool { self.as_eoi().is_some() } /// Returns true when this unit corresponds to an ASCII word byte. /// /// This always returns false when this unit represents an "end of input" /// sentinel. pub fn is_word_byte(self) -> bool { self.as_u8().map_or(false, crate::util::utf8::is_word_byte) } } impl core::fmt::Debug for Unit { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { match self.0 { UnitKind::U8(b) => write!(f, "{:?}", DebugByte(b)), UnitKind::EOI(_) => write!(f, "EOI"), } } } /// A representation of byte oriented equivalence classes. 
/// /// This is used in a DFA to reduce the size of the transition table. This can /// have a particularly large impact not only on the total size of a dense DFA, /// but also on compile times. /// /// The essential idea here is that the alphabet of a DFA is shrunk from the /// usual 256 distinct byte values down to a set of equivalence classes. The /// guarantee you get is that any byte belonging to the same equivalence class /// can be treated as if it were any other byte in the same class, and the /// result of a search wouldn't change. /// /// # Example /// /// This example shows how to get byte classes from an /// [`NFA`](crate::nfa::thompson::NFA) and ask for the class of various bytes. /// /// ``` /// use regex_automata::nfa::thompson::NFA; /// /// let nfa = NFA::new("[a-z]+")?; /// let classes = nfa.byte_classes(); /// // 'a' and 'z' are in the same class for this regex. /// assert_eq!(classes.get(b'a'), classes.get(b'z')); /// // But 'a' and 'A' are not. /// assert_ne!(classes.get(b'a'), classes.get(b'A')); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Copy)] pub struct ByteClasses([u8; 256]); impl ByteClasses { /// Creates a new set of equivalence classes where all bytes are mapped to /// the same class. #[inline] pub fn empty() -> ByteClasses { ByteClasses([0; 256]) } /// Creates a new set of equivalence classes where each byte belongs to /// its own equivalence class. #[inline] pub fn singletons() -> ByteClasses { let mut classes = ByteClasses::empty(); for b in 0..=255 { classes.set(b, b); } classes } /// Deserializes a byte class map from the given slice. If the slice is of /// insufficient length or otherwise contains an impossible mapping, then /// an error is returned. Upon success, the number of bytes read along with /// the map are returned. The number of bytes read is always a multiple of /// 8. 
pub(crate) fn from_bytes( slice: &[u8], ) -> Result<(ByteClasses, usize), DeserializeError> { wire::check_slice_len(slice, 256, "byte class map")?; let mut classes = ByteClasses::empty(); for (b, &class) in slice[..256].iter().enumerate() { classes.set(u8::try_from(b).unwrap(), class); } // We specifically don't use 'classes.iter()' here because that // iterator depends on 'classes.alphabet_len()' being correct. But that // is precisely the thing we're trying to verify below! for &b in classes.0.iter() { if usize::from(b) >= classes.alphabet_len() { return Err(DeserializeError::generic( "found equivalence class greater than alphabet len", )); } } Ok((classes, 256)) } /// Writes this byte class map to the given byte buffer. if the given /// buffer is too small, then an error is returned. Upon success, the total /// number of bytes written is returned. The number of bytes written is /// guaranteed to be a multiple of 8. pub(crate) fn write_to( &self, mut dst: &mut [u8], ) -> Result { let nwrite = self.write_to_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small("byte class map")); } for b in 0..=255 { dst[0] = self.get(b); dst = &mut dst[1..]; } Ok(nwrite) } /// Returns the total number of bytes written by `write_to`. pub(crate) fn write_to_len(&self) -> usize { 256 } /// Set the equivalence class for the given byte. #[inline] pub fn set(&mut self, byte: u8, class: u8) { self.0[usize::from(byte)] = class; } /// Get the equivalence class for the given byte. #[inline] pub fn get(&self, byte: u8) -> u8 { self.0[usize::from(byte)] } /// Get the equivalence class for the given haystack unit and return the /// class as a `usize`. #[inline] pub fn get_by_unit(&self, unit: Unit) -> usize { match unit.0 { UnitKind::U8(b) => usize::from(self.get(b)), UnitKind::EOI(b) => usize::from(b), } } /// Create a unit that represents the "end of input" sentinel based on the /// number of equivalence classes. 
#[inline] pub fn eoi(&self) -> Unit { // The alphabet length already includes the EOI sentinel, hence why // we subtract 1. Unit::eoi(self.alphabet_len().checked_sub(1).unwrap()) } /// Return the total number of elements in the alphabet represented by /// these equivalence classes. Equivalently, this returns the total number /// of equivalence classes. #[inline] pub fn alphabet_len(&self) -> usize { // Add one since the number of equivalence classes is one bigger than // the last one. But add another to account for the final EOI class // that isn't explicitly represented. usize::from(self.0[255]) + 1 + 1 } /// Returns the stride, as a base-2 exponent, required for these /// equivalence classes. /// /// The stride is always the smallest power of 2 that is greater than or /// equal to the alphabet length, and the `stride2` returned here is the /// exponent applied to `2` to get the smallest power. This is done so that /// converting between premultiplied state IDs and indices can be done with /// shifts alone, which is much faster than integer division. #[inline] pub fn stride2(&self) -> usize { let zeros = self.alphabet_len().next_power_of_two().trailing_zeros(); usize::try_from(zeros).unwrap() } /// Returns true if and only if every byte in this class maps to its own /// equivalence class. Equivalently, there are 257 equivalence classes /// and each class contains either exactly one byte or corresponds to the /// singleton class containing the "end of input" sentinel. #[inline] pub fn is_singleton(&self) -> bool { self.alphabet_len() == 257 } /// Returns an iterator over all equivalence classes in this set. #[inline] pub fn iter(&self) -> ByteClassIter<'_> { ByteClassIter { classes: self, i: 0 } } /// Returns an iterator over a sequence of representative bytes from each /// equivalence class within the range of bytes given. 
///
    /// When the given range is unbounded on both sides, the iterator yields
    /// exactly N items, where N is equivalent to the number of equivalence
    /// classes. Each item is an arbitrary byte drawn from each equivalence
    /// class.
    ///
    /// This is useful when one is determinizing an NFA and the NFA's alphabet
    /// hasn't been converted to equivalence classes. Picking an arbitrary byte
    /// from each equivalence class then permits a full exploration of the NFA
    /// instead of using every possible byte value and thus potentially saves
    /// quite a lot of redundant work.
    ///
    /// # Example
    ///
    /// This shows an example of what a complete sequence of representatives
    /// might look like from a real example.
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
    ///
    /// let nfa = NFA::new("[a-z]+")?;
    /// let classes = nfa.byte_classes();
    /// let reps: Vec<Unit> = classes.representatives(..).collect();
    /// // Note that the specific byte values yielded are not guaranteed!
    /// let expected = vec![
    ///     Unit::u8(b'\x00'),
    ///     Unit::u8(b'a'),
    ///     Unit::u8(b'{'),
    ///     Unit::eoi(3),
    /// ];
    /// assert_eq!(expected, reps);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// Note though, that you can ask for an arbitrary range of bytes, and only
    /// representatives for that range will be returned:
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit};
    ///
    /// let nfa = NFA::new("[a-z]+")?;
    /// let classes = nfa.byte_classes();
    /// let reps: Vec<Unit> = classes.representatives(b'A'..=b'z').collect();
    /// // Note that the specific byte values yielded are not guaranteed!
    /// let expected = vec![
    ///     Unit::u8(b'A'),
    ///     Unit::u8(b'a'),
    /// ];
    /// assert_eq!(expected, reps);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn representatives<R: core::ops::RangeBounds<u8>>(
        &self,
        range: R,
    ) -> ByteClassRepresentatives<'_> {
        use core::ops::Bound;

        // Normalize the start bound to an inclusive `usize` position.
        let cur_byte = match range.start_bound() {
            Bound::Included(&i) => usize::from(i),
            Bound::Excluded(&i) => usize::from(i).checked_add(1).unwrap(),
            Bound::Unbounded => 0,
        };
        // Normalize the end bound to an exclusive `usize` position. `None`
        // records an unbounded end, which is what permits the iterator to
        // also yield the EOI class.
        let end_byte = match range.end_bound() {
            Bound::Included(&i) => {
                Some(usize::from(i).checked_add(1).unwrap())
            }
            Bound::Excluded(&i) => Some(usize::from(i)),
            Bound::Unbounded => None,
        };
        // The iterator uses `usize::MAX` as its "EOI already reported"
        // sentinel, so it must never be a legitimate starting position.
        assert_ne!(
            cur_byte,
            usize::MAX,
            "start range must be less than usize::MAX",
        );
        ByteClassRepresentatives {
            classes: self,
            cur_byte,
            end_byte,
            last_class: None,
        }
    }

    /// Returns an iterator of the bytes in the given equivalence class.
    ///
    /// This is useful when one needs to know the actual bytes that belong to
    /// an equivalence class. For example, conceptually speaking, accelerating
    /// a DFA state occurs when a state only has a few outgoing transitions.
    /// But in reality, what is required is that there are only a small
    /// number of distinct bytes that can lead to an outgoing transition. The
    /// difference is that any one transition can correspond to an equivalence
    /// class which may contains many bytes. Therefore, DFA state acceleration
    /// considers the actual elements in each equivalence class of each
    /// outgoing transition.
    ///
    /// # Example
    ///
    /// This shows an example of how to get all of the elements in an
    /// equivalence class.
/// /// ``` /// use regex_automata::{nfa::thompson::NFA, util::alphabet::Unit}; /// /// let nfa = NFA::new("[a-z]+")?; /// let classes = nfa.byte_classes(); /// let elements: Vec = classes.elements(Unit::u8(1)).collect(); /// let expected: Vec = (b'a'..=b'z').map(Unit::u8).collect(); /// assert_eq!(expected, elements); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn elements(&self, class: Unit) -> ByteClassElements { ByteClassElements { classes: self, class, byte: 0 } } /// Returns an iterator of byte ranges in the given equivalence class. /// /// That is, a sequence of contiguous ranges are returned. Typically, every /// class maps to a single contiguous range. fn element_ranges(&self, class: Unit) -> ByteClassElementRanges { ByteClassElementRanges { elements: self.elements(class), range: None } } } impl Default for ByteClasses { fn default() -> ByteClasses { ByteClasses::singletons() } } impl core::fmt::Debug for ByteClasses { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { if self.is_singleton() { write!(f, "ByteClasses({{singletons}})") } else { write!(f, "ByteClasses(")?; for (i, class) in self.iter().enumerate() { if i > 0 { write!(f, ", ")?; } write!(f, "{:?} => [", class.as_usize())?; for (start, end) in self.element_ranges(class) { if start == end { write!(f, "{:?}", start)?; } else { write!(f, "{:?}-{:?}", start, end)?; } } write!(f, "]")?; } write!(f, ")") } } } /// An iterator over each equivalence class. /// /// The last element in this iterator always corresponds to [`Unit::eoi`]. /// /// This is created by the [`ByteClasses::iter`] method. /// /// The lifetime `'a` refers to the lifetime of the byte classes that this /// iterator was created from. 
#[derive(Debug)]
pub struct ByteClassIter<'a> {
    // The byte classes being iterated over.
    classes: &'a ByteClasses,
    // The index of the next class to yield.
    i: usize,
}

impl<'a> Iterator for ByteClassIter<'a> {
    type Item = Unit;

    fn next(&mut self) -> Option<Unit> {
        if self.i + 1 == self.classes.alphabet_len() {
            // The final class is always the EOI sentinel.
            self.i += 1;
            Some(self.classes.eoi())
        } else if self.i < self.classes.alphabet_len() {
            let class = u8::try_from(self.i).unwrap();
            self.i += 1;
            Some(Unit::u8(class))
        } else {
            None
        }
    }
}

/// An iterator over representative bytes from each equivalence class.
///
/// This is created by the [`ByteClasses::representatives`] method.
///
/// The lifetime `'a` refers to the lifetime of the byte classes that this
/// iterator was created from.
#[derive(Debug)]
pub struct ByteClassRepresentatives<'a> {
    // The byte classes representatives are drawn from.
    classes: &'a ByteClasses,
    // The next byte position to inspect, or `usize::MAX` once EOI has been
    // reported.
    cur_byte: usize,
    // The exclusive end position, or `None` when the range is unbounded.
    end_byte: Option<usize>,
    // The class of the most recently yielded byte, used to skip the remaining
    // bytes of the same class.
    last_class: Option<u8>,
}

impl<'a> Iterator for ByteClassRepresentatives<'a> {
    type Item = Unit;

    fn next(&mut self) -> Option<Unit> {
        while self.cur_byte < self.end_byte.unwrap_or(256) {
            let byte = u8::try_from(self.cur_byte).unwrap();
            let class = self.classes.get(byte);
            self.cur_byte += 1;

            // Only the first byte seen of each run of equal classes is
            // reported.
            if self.last_class != Some(class) {
                self.last_class = Some(class);
                return Some(Unit::u8(byte));
            }
        }
        if self.cur_byte != usize::MAX && self.end_byte.is_none() {
            // Using usize::MAX as a sentinel is OK because we ban usize::MAX
            // from appearing as a start bound in iterator construction. But
            // why do it this way? Well, we want to return the EOI class
            // whenever the end of the given range is unbounded because EOI
            // isn't really a "byte" per se, so the only way it should be
            // excluded is if there is a bounded end to the range. Therefore,
            // when the end is unbounded, we just need to know whether we've
            // reported EOI or not. When we do, we set cur_byte to a value it
            // can never otherwise be.
            self.cur_byte = usize::MAX;
            return Some(self.classes.eoi());
        }
        None
    }
}

/// An iterator over all elements in an equivalence class.
///
/// This is created by the [`ByteClasses::elements`] method.
///
/// The lifetime `'a` refers to the lifetime of the byte classes that this
/// iterator was created from.
#[derive(Debug)]
pub struct ByteClassElements<'a> {
    // The byte classes being searched.
    classes: &'a ByteClasses,
    // The class whose members are yielded.
    class: Unit,
    // The next position to inspect: 0..=255 are byte values, 256 stands for
    // the EOI element.
    byte: usize,
}

impl<'a> Iterator for ByteClassElements<'a> {
    type Item = Unit;

    fn next(&mut self) -> Option<Unit> {
        while self.byte < 256 {
            let byte = u8::try_from(self.byte).unwrap();
            self.byte += 1;
            if self.class.is_byte(self.classes.get(byte)) {
                return Some(Unit::u8(byte));
            }
        }
        // Position 256 is visited exactly once, after all byte values, and
        // yields the EOI element when the requested class is the EOI class.
        if self.byte < 257 {
            self.byte += 1;
            if self.class.is_eoi() {
                return Some(Unit::eoi(256));
            }
        }
        None
    }
}

/// An iterator over all elements in an equivalence class expressed as a
/// sequence of contiguous ranges.
#[derive(Debug)]
struct ByteClassElementRanges<'a> {
    // The underlying element iterator.
    elements: ByteClassElements<'a>,
    // The range currently being accumulated, if any.
    range: Option<(Unit, Unit)>,
}

impl<'a> Iterator for ByteClassElementRanges<'a> {
    type Item = (Unit, Unit);

    fn next(&mut self) -> Option<(Unit, Unit)> {
        loop {
            let element = match self.elements.next() {
                // No more elements: flush whatever range is pending.
                None => return self.range.take(),
                Some(element) => element,
            };
            match self.range.take() {
                None => {
                    self.range = Some((element, element));
                }
                Some((start, end)) => {
                    // A gap, or the EOI element, ends the pending range. The
                    // new element then starts the next one.
                    if end.as_usize() + 1 != element.as_usize()
                        || element.is_eoi()
                    {
                        self.range = Some((element, element));
                        return Some((start, end));
                    }
                    self.range = Some((start, element));
                }
            }
        }
    }
}

/// A partitioning of bytes into equivalence classes.
///
/// A byte class set keeps track of an *approximation* of equivalence classes
/// of bytes during NFA construction. That is, every byte in an equivalence
/// class cannot discriminate between a match and a non-match.
///
/// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the
/// same equivalence class because it never matters whether an `a` or a `b` is
/// seen, and no combination of `a`s and `b`s in the text can discriminate a
/// match.
///
/// Note though that this does not compute the minimal set of equivalence
/// classes.
For example, in the regex `[ac]+`, both `a` and `c` are in the /// same equivalence class for the same reason that `a` and `b` are in the /// same equivalence class in the aforementioned regex. However, in this /// implementation, `a` and `c` are put into distinct equivalence classes. The /// reason for this is implementation complexity. In the future, we should /// endeavor to compute the minimal equivalence classes since they can have a /// rather large impact on the size of the DFA. (Doing this will likely require /// rethinking how equivalence classes are computed, including changing the /// representation here, which is only able to group contiguous bytes into the /// same equivalence class.) #[cfg(feature = "alloc")] #[derive(Clone, Debug)] pub(crate) struct ByteClassSet(ByteSet); #[cfg(feature = "alloc")] impl Default for ByteClassSet { fn default() -> ByteClassSet { ByteClassSet::empty() } } #[cfg(feature = "alloc")] impl ByteClassSet { /// Create a new set of byte classes where all bytes are part of the same /// equivalence class. pub(crate) fn empty() -> Self { ByteClassSet(ByteSet::empty()) } /// Indicate the range of byte given (inclusive) can discriminate a /// match between it and all other bytes outside of the range. pub(crate) fn set_range(&mut self, start: u8, end: u8) { debug_assert!(start <= end); if start > 0 { self.0.add(start - 1); } self.0.add(end); } /// Add the contiguous ranges in the set given to this byte class set. pub(crate) fn add_set(&mut self, set: &ByteSet) { for (start, end) in set.iter_ranges() { self.set_range(start, end); } } /// Convert this boolean set to a map that maps all byte values to their /// corresponding equivalence class. The last mapping indicates the largest /// equivalence class identifier (which is never bigger than 255). 
pub(crate) fn byte_classes(&self) -> ByteClasses {
        let mut map = ByteClasses::empty();
        let mut current = 0u8;
        for byte in 0u8..=255 {
            map.set(byte, current);
            // A byte in the set is the last member of its class, so the byte
            // after it starts a new class. Nothing follows byte 255, so it
            // never opens another class.
            if byte != 255 && self.0.contains(byte) {
                current = current.checked_add(1).unwrap();
            }
        }
        map
    }
}

/// A small set of bytes that is allocation free and cheap to copy.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub(crate) struct ByteSet {
    bits: BitSet,
}

/// The bit-level storage behind a byte set. It is a separate type so that it
/// can carry its own Debug impl while "ByteSet" still shows up in the output.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
struct BitSet([u128; 2]);

impl ByteSet {
    /// Create a set that contains no bytes.
    pub(crate) fn empty() -> ByteSet {
        ByteSet { bits: BitSet([0, 0]) }
    }

    /// Insert a byte into this set.
    ///
    /// Inserting a byte that is already a member leaves the set unchanged.
    pub(crate) fn add(&mut self, byte: u8) {
        // Bytes 0..=127 live in the first word, 128..=255 in the second.
        let (word, shift) = (usize::from(byte / 128), byte % 128);
        self.bits.0[word] |= 1 << shift;
    }

    /// Delete a byte from this set.
    ///
    /// Deleting a byte that is not a member leaves the set unchanged.
    pub(crate) fn remove(&mut self, byte: u8) {
        let (word, shift) = (usize::from(byte / 128), byte % 128);
        self.bits.0[word] &= !(1 << shift);
    }

    /// Return true if and only if the given byte is a member of this set.
    pub(crate) fn contains(&self, byte: u8) -> bool {
        let (word, shift) = (usize::from(byte / 128), byte % 128);
        (self.bits.0[word] & (1 << shift)) != 0
    }

    /// Return true if and only if every byte in the given inclusive range is
    /// a member of this set.
    pub(crate) fn contains_range(&self, start: u8, end: u8) -> bool {
        for byte in start..=end {
            if !self.contains(byte) {
                return false;
            }
        }
        true
    }

    /// Returns an iterator over every byte in this set.
    pub(crate) fn iter(&self) -> ByteSetIter {
        let set = self;
        ByteSetIter { set, b: 0 }
    }

    /// Returns an iterator over all contiguous ranges of bytes in this set.
pub(crate) fn iter_ranges(&self) -> ByteSetRangeIter {
        ByteSetRangeIter { set: self, b: 0 }
    }

    /// Return true if and only if this set is empty.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn is_empty(&self) -> bool {
        self.bits.0 == [0, 0]
    }

    /// Deserializes a byte set from the given slice. If the slice is of
    /// incorrect length or is otherwise malformed, then an error is returned.
    /// Upon success, the number of bytes read along with the set are returned.
    /// The number of bytes read is always a multiple of 8.
    pub(crate) fn from_bytes(
        slice: &[u8],
    ) -> Result<(ByteSet, usize), DeserializeError> {
        use core::mem::size_of;

        wire::check_slice_len(slice, 2 * size_of::<u128>(), "byte set")?;
        let mut nread = 0;
        let (low, nr) = wire::try_read_u128(slice, "byte set low bucket")?;
        nread += nr;
        // `write_to` stores the high bucket directly after the low one, so it
        // must be read from where the previous read stopped. Reading from the
        // start of `slice` again would yield the low bucket twice and drop
        // every byte >= 128 from the set. The length check above ensures that
        // `slice` holds both buckets.
        let (high, nr) =
            wire::try_read_u128(&slice[nread..], "byte set high bucket")?;
        nread += nr;
        Ok((ByteSet { bits: BitSet([low, high]) }, nread))
    }

    /// Writes this byte set to the given byte buffer. If the given buffer is
    /// too small, then an error is returned. Upon success, the total number of
    /// bytes written is returned. The number of bytes written is guaranteed to
    /// be a multiple of 8.
    // NOTE(review): the bound on `E` is reconstructed from the
    // `E::write_u128` calls below; confirm the trait path against the
    // `wire` module.
    pub(crate) fn write_to<E: wire::Endian>(
        &self,
        dst: &mut [u8],
    ) -> Result<usize, SerializeError> {
        use core::mem::size_of;

        let nwrite = self.write_to_len();
        if dst.len() < nwrite {
            return Err(SerializeError::buffer_too_small("byte set"));
        }
        // Low bucket first, then the high bucket immediately after it.
        let mut nw = 0;
        E::write_u128(self.bits.0[0], &mut dst[nw..]);
        nw += size_of::<u128>();
        E::write_u128(self.bits.0[1], &mut dst[nw..]);
        nw += size_of::<u128>();
        assert_eq!(nwrite, nw, "expected to write certain number of bytes",);
        assert_eq!(
            nw % 8,
            0,
            "expected to write multiple of 8 bytes for byte set",
        );
        Ok(nw)
    }

    /// Returns the total number of bytes written by `write_to`.
pub(crate) fn write_to_len(&self) -> usize {
        // Two `u128` buckets, one for each half of the byte range.
        2 * core::mem::size_of::<u128>()
    }
}

impl core::fmt::Debug for BitSet {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Print the member bytes as a set rather than the raw bit words.
        let mut fmtd = f.debug_set();
        for b in 0u8..=255 {
            if (ByteSet { bits: *self }).contains(b) {
                fmtd.entry(&b);
            }
        }
        fmtd.finish()
    }
}

#[derive(Debug)]
pub(crate) struct ByteSetIter<'a> {
    set: &'a ByteSet,
    // The next byte value to test. A `usize` so that it can step past 255.
    b: usize,
}

impl<'a> Iterator for ByteSetIter<'a> {
    type Item = u8;

    fn next(&mut self) -> Option<u8> {
        while self.b <= 255 {
            let b = u8::try_from(self.b).unwrap();
            self.b += 1;
            if self.set.contains(b) {
                return Some(b);
            }
        }
        None
    }
}

#[derive(Debug)]
pub(crate) struct ByteSetRangeIter<'a> {
    set: &'a ByteSet,
    // The next byte value to test. A `usize` so that it can step past 255.
    b: usize,
}

impl<'a> Iterator for ByteSetRangeIter<'a> {
    type Item = (u8, u8);

    fn next(&mut self) -> Option<(u8, u8)> {
        let asu8 = |n: usize| u8::try_from(n).unwrap();
        while self.b <= 255 {
            let start = asu8(self.b);
            self.b += 1;
            if !self.set.contains(start) {
                continue;
            }

            // Extend the range for as long as consecutive bytes are members.
            let mut end = start;
            while self.b <= 255 && self.set.contains(asu8(self.b)) {
                end = asu8(self.b);
                self.b += 1;
            }
            return Some((start, end));
        }
        None
    }
}

#[cfg(all(test, feature = "alloc"))]
mod tests {
    use alloc::{vec, vec::Vec};

    use super::*;

    #[test]
    fn byte_classes() {
        let mut set = ByteClassSet::empty();
        set.set_range(b'a', b'z');

        let classes = set.byte_classes();
        assert_eq!(classes.get(0), 0);
        assert_eq!(classes.get(1), 0);
        assert_eq!(classes.get(2), 0);
        assert_eq!(classes.get(b'a' - 1), 0);
        assert_eq!(classes.get(b'a'), 1);
        assert_eq!(classes.get(b'm'), 1);
        assert_eq!(classes.get(b'z'), 1);
        assert_eq!(classes.get(b'z' + 1), 2);
        assert_eq!(classes.get(254), 2);
        assert_eq!(classes.get(255), 2);

        let mut set = ByteClassSet::empty();
        set.set_range(0, 2);
        set.set_range(4, 6);
        let classes = set.byte_classes();
        assert_eq!(classes.get(0), 0);
        assert_eq!(classes.get(1), 0);
        assert_eq!(classes.get(2), 0);
        assert_eq!(classes.get(3), 1);
        assert_eq!(classes.get(4), 2);
        assert_eq!(classes.get(5), 2);
        assert_eq!(classes.get(6), 2);
        assert_eq!(classes.get(7), 3);
        assert_eq!(classes.get(255), 3);
    }

    #[test]
    fn full_byte_classes() {
        let mut set = ByteClassSet::empty();
        for b in 0u8..=255 {
            set.set_range(b, b);
        }
        assert_eq!(set.byte_classes().alphabet_len(), 257);
    }

    #[test]
    fn elements_typical() {
        let mut set = ByteClassSet::empty();
        set.set_range(b'b', b'd');
        set.set_range(b'g', b'm');
        set.set_range(b'z', b'z');
        let classes = set.byte_classes();
        // class 0: \x00-a
        // class 1: b-d
        // class 2: e-f
        // class 3: g-m
        // class 4: n-y
        // class 5: z-z
        // class 6: \x7B-\xFF
        // class 7: EOI
        assert_eq!(classes.alphabet_len(), 8);

        let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
        assert_eq!(elements.len(), 98);
        assert_eq!(elements[0], Unit::u8(b'\x00'));
        assert_eq!(elements[97], Unit::u8(b'a'));

        let elements = classes.elements(Unit::u8(1)).collect::<Vec<_>>();
        assert_eq!(
            elements,
            vec![Unit::u8(b'b'), Unit::u8(b'c'), Unit::u8(b'd')],
        );

        let elements = classes.elements(Unit::u8(2)).collect::<Vec<_>>();
        assert_eq!(elements, vec![Unit::u8(b'e'), Unit::u8(b'f')],);

        let elements = classes.elements(Unit::u8(3)).collect::<Vec<_>>();
        assert_eq!(
            elements,
            vec![
                Unit::u8(b'g'),
                Unit::u8(b'h'),
                Unit::u8(b'i'),
                Unit::u8(b'j'),
                Unit::u8(b'k'),
                Unit::u8(b'l'),
                Unit::u8(b'm'),
            ],
        );

        let elements = classes.elements(Unit::u8(4)).collect::<Vec<_>>();
        assert_eq!(elements.len(), 12);
        assert_eq!(elements[0], Unit::u8(b'n'));
        assert_eq!(elements[11], Unit::u8(b'y'));

        let elements = classes.elements(Unit::u8(5)).collect::<Vec<_>>();
        assert_eq!(elements, vec![Unit::u8(b'z')]);

        let elements = classes.elements(Unit::u8(6)).collect::<Vec<_>>();
        assert_eq!(elements.len(), 133);
        assert_eq!(elements[0], Unit::u8(b'\x7B'));
        assert_eq!(elements[132], Unit::u8(b'\xFF'));

        let elements = classes.elements(Unit::eoi(7)).collect::<Vec<_>>();
        assert_eq!(elements, vec![Unit::eoi(256)]);
    }

    #[test]
    fn elements_singletons() {
        let classes = ByteClasses::singletons();
        assert_eq!(classes.alphabet_len(), 257);

        let elements = classes.elements(Unit::u8(b'a')).collect::<Vec<_>>();
        assert_eq!(elements, vec![Unit::u8(b'a')]);

        let elements = classes.elements(Unit::eoi(5)).collect::<Vec<_>>();
        assert_eq!(elements, vec![Unit::eoi(256)]);
    }

    #[test]
    fn elements_empty() {
        let classes = ByteClasses::empty();
        assert_eq!(classes.alphabet_len(), 2);

        let elements = classes.elements(Unit::u8(0)).collect::<Vec<_>>();
        assert_eq!(elements.len(), 256);
        assert_eq!(elements[0], Unit::u8(b'\x00'));
        assert_eq!(elements[255], Unit::u8(b'\xFF'));

        let elements = classes.elements(Unit::eoi(1)).collect::<Vec<_>>();
        assert_eq!(elements, vec![Unit::eoi(256)]);
    }

    #[test]
    fn representatives() {
        let mut set = ByteClassSet::empty();
        set.set_range(b'b', b'd');
        set.set_range(b'g', b'm');
        set.set_range(b'z', b'z');
        let classes = set.byte_classes();

        let got: Vec<Unit> = classes.representatives(..).collect();
        let expected = vec![
            Unit::u8(b'\x00'),
            Unit::u8(b'b'),
            Unit::u8(b'e'),
            Unit::u8(b'g'),
            Unit::u8(b'n'),
            Unit::u8(b'z'),
            Unit::u8(b'\x7B'),
            Unit::eoi(7),
        ];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(..0).collect();
        assert!(got.is_empty());
        let got: Vec<Unit> = classes.representatives(1..1).collect();
        assert!(got.is_empty());
        let got: Vec<Unit> = classes.representatives(255..255).collect();
        assert!(got.is_empty());

        // A weird case that is the only guaranteed way to get an iterator
        // of just the EOI class by excluding all possible byte values.
        let got: Vec<Unit> = classes
            .representatives((
                core::ops::Bound::Excluded(255),
                core::ops::Bound::Unbounded,
            ))
            .collect();
        let expected = vec![Unit::eoi(7)];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(..=255).collect();
        let expected = vec![
            Unit::u8(b'\x00'),
            Unit::u8(b'b'),
            Unit::u8(b'e'),
            Unit::u8(b'g'),
            Unit::u8(b'n'),
            Unit::u8(b'z'),
            Unit::u8(b'\x7B'),
        ];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(b'b'..=b'd').collect();
        let expected = vec![Unit::u8(b'b')];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(b'a'..=b'd').collect();
        let expected = vec![Unit::u8(b'a'), Unit::u8(b'b')];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(b'b'..=b'e').collect();
        let expected = vec![Unit::u8(b'b'), Unit::u8(b'e')];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(b'A'..=b'Z').collect();
        let expected = vec![Unit::u8(b'A')];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(b'A'..=b'z').collect();
        let expected = vec![
            Unit::u8(b'A'),
            Unit::u8(b'b'),
            Unit::u8(b'e'),
            Unit::u8(b'g'),
            Unit::u8(b'n'),
            Unit::u8(b'z'),
        ];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(b'z'..).collect();
        let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B'), Unit::eoi(7)];
        assert_eq!(expected, got);

        let got: Vec<Unit> = classes.representatives(b'z'..=0xFF).collect();
        let expected = vec![Unit::u8(b'z'), Unit::u8(b'\x7B')];
        assert_eq!(expected, got);
    }
}
regex-automata-0.4.9/src/util/captures.rs000064400000000000000000003060041046102023000165240ustar 00000000000000/*!
Provides types for dealing with capturing groups.

Capturing groups refer to sub-patterns of regexes that some regex engines can
report matching offsets for. For example, matching `[a-z]([0-9]+)` against
`a789` would give `a789` as the overall match (for the implicit capturing group
at index `0`) and `789` as the match for the capturing group `([0-9]+)` (an
explicit capturing group at index `1`).
Not all regex engines can report match offsets for capturing groups. Indeed, to a first approximation, regex engines that can report capturing group offsets tend to be quite a bit slower than regex engines that can't. This is because tracking capturing groups at search time usually requires more "power" that in turn adds overhead. Other regex implementations might call capturing groups "submatches." # Overview The main types in this module are: * [`Captures`] records the capturing group offsets found during a search. It provides convenience routines for looking up capturing group offsets by either index or name. * [`GroupInfo`] records the mapping between capturing groups and "slots," where the latter are how capturing groups are recorded during a regex search. This also keeps a mapping from capturing group name to index, and capture group index to name. A `GroupInfo` is used by `Captures` internally to provide a convenient API. It is unlikely that you'll use a `GroupInfo` directly, but for example, if you've compiled an Thompson NFA, then you can use [`thompson::NFA::group_info`](crate::nfa::thompson::NFA::group_info) to get its underlying `GroupInfo`. */ use alloc::{string::String, sync::Arc, vec, vec::Vec}; use crate::util::{ interpolate, primitives::{ NonMaxUsize, PatternID, PatternIDError, PatternIDIter, SmallIndex, }, search::{Match, Span}, }; /// The span offsets of capturing groups after a match has been found. /// /// This type represents the output of regex engines that can report the /// offsets at which capturing groups matches or "submatches" occur. For /// example, the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM). When a match /// occurs, it will at minimum contain the [`PatternID`] of the pattern that /// matched. Depending upon how it was constructed, it may also contain the /// start/end offsets of the entire match of the pattern and the start/end /// offsets of each capturing group that participated in the match. 
/// /// Values of this type are always created for a specific [`GroupInfo`]. It is /// unspecified behavior to use a `Captures` value in a search with any regex /// engine that has a different `GroupInfo` than the one the `Captures` were /// created with. /// /// # Constructors /// /// There are three constructors for this type that control what kind of /// information is available upon a match: /// /// * [`Captures::all`]: Will store overall pattern match offsets in addition /// to the offsets of capturing groups that participated in the match. /// * [`Captures::matches`]: Will store only the overall pattern /// match offsets. The offsets of capturing groups (even ones that participated /// in the match) are not available. /// * [`Captures::empty`]: Will only store the pattern ID that matched. No /// match offsets are available at all. /// /// If you aren't sure which to choose, then pick the first one. The first one /// is what convenience routines like, /// [`PikeVM::create_captures`](crate::nfa::thompson::pikevm::PikeVM::create_captures), /// will use automatically. /// /// The main difference between these choices is performance. Namely, if you /// ask for _less_ information, then the execution of regex search may be able /// to run more quickly. /// /// # Notes /// /// It is worth pointing out that this type is not coupled to any one specific /// regex engine. Instead, its coupling is with [`GroupInfo`], which is the /// thing that is responsible for mapping capturing groups to "slot" offsets. /// Slot offsets are indices into a single sequence of memory at which matching /// haystack offsets for the corresponding group are written by regex engines. 
/// /// # Example /// /// This example shows how to parse a simple date and extract the components of /// the date via capturing groups: /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; /// /// let re = PikeVM::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "2010-03-14", &mut caps); /// assert!(caps.is_match()); /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: named capturing groups /// /// This example is like the one above, but leverages the ability to name /// capturing groups in order to make the code a bit clearer: /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span}; /// /// let re = PikeVM::new(r"^(?P[0-9]{4})-(?P[0-9]{2})-(?P[0-9]{2})$")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "2010-03-14", &mut caps); /// assert!(caps.is_match()); /// assert_eq!(Some(Span::from(0..4)), caps.get_group_by_name("y")); /// assert_eq!(Some(Span::from(5..7)), caps.get_group_by_name("m")); /// assert_eq!(Some(Span::from(8..10)), caps.get_group_by_name("d")); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone)] pub struct Captures { /// The group info that these capture groups are coupled to. This is what /// gives the "convenience" of the `Captures` API. Namely, it provides the /// slot mapping and the name|-->index mapping for capture lookups by name. group_info: GroupInfo, /// The ID of the pattern that matched. Regex engines must set this to /// None when no match occurs. pid: Option, /// The slot values, i.e., submatch offsets. 
/// /// In theory, the smallest sequence of slots would be something like /// `max(groups(pattern) for pattern in regex) * 2`, but instead, we use /// `sum(groups(pattern) for pattern in regex) * 2`. Why? /// /// Well, the former could be used in theory, because we don't generally /// have any overlapping APIs that involve capturing groups. Therefore, /// there's technically never any need to have slots set for multiple /// patterns. However, this might change some day, in which case, we would /// need to have slots available. /// /// The other reason is that during the execution of some regex engines, /// there exists a point in time where multiple slots for different /// patterns may be written to before knowing which pattern has matched. /// Therefore, the regex engines themselves, in order to support multiple /// patterns correctly, must have all slots available. If `Captures` /// doesn't have all slots available, then regex engines can't write /// directly into the caller provided `Captures` and must instead write /// into some other storage and then copy the slots involved in the match /// at the end of the search. /// /// So overall, at least as of the time of writing, it seems like the path /// of least resistance is to just require allocating all possible slots /// instead of the conceptual minimum. Another way to justify this is that /// the most common case is a single pattern, in which case, there is no /// inefficiency here since the 'max' and 'sum' calculations above are /// equivalent in that case. /// /// N.B. The mapping from group index to slot is maintained by `GroupInfo` /// and is considered an API guarantee. See `GroupInfo` for more details on /// that mapping. /// /// N.B. `Option` has the same size as a `usize`. slots: Vec>, } impl Captures { /// Create new storage for the offsets of all matching capturing groups. 
///
    /// This routine provides the most information for matches---namely, the
    /// spans of matching capturing groups---but also requires the regex search
    /// routines to do the most work.
    ///
    /// It is unspecified behavior to use the returned `Captures` value in a
    /// search with a `GroupInfo` other than the one that is provided to this
    /// constructor.
    ///
    /// # Example
    ///
    /// This example shows that all capturing groups---but only ones that
    /// participated in a match---are available to query after a match has
    /// been found:
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::pikevm::PikeVM,
    ///     util::captures::Captures,
    ///     Span, Match,
    /// };
    ///
    /// let re = PikeVM::new(
    ///     r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$",
    /// )?;
    /// let mut cache = re.create_cache();
    /// let mut caps = Captures::all(re.get_nfa().group_info().clone());
    ///
    /// re.captures(&mut cache, "ABC123", &mut caps);
    /// assert!(caps.is_match());
    /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
    /// // The 'lower' group didn't match, so it won't have any offsets.
    /// assert_eq!(None, caps.get_group_by_name("lower"));
    /// assert_eq!(Some(Span::from(0..3)), caps.get_group_by_name("upper"));
    /// assert_eq!(Some(Span::from(3..6)), caps.get_group_by_name("digits"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn all(group_info: GroupInfo) -> Captures {
        // One slot for every slot the group info knows about.
        let slots = group_info.slot_len();
        Captures { group_info, pid: None, slots: vec![None; slots] }
    }

    /// Create new storage for only the full match spans of a pattern. This
    /// does not include any capturing group offsets.
    ///
    /// It is unspecified behavior to use the returned `Captures` value in a
    /// search with a `GroupInfo` other than the one that is provided to this
    /// constructor.
    ///
    /// # Example
    ///
    /// This example shows that only overall match offsets are reported when
    /// this constructor is used. Accessing any capturing groups other than
    /// the 0th will always return `None`.
///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::pikevm::PikeVM,
    ///     util::captures::Captures,
    ///     Match,
    /// };
    ///
    /// let re = PikeVM::new(
    ///     r"^(?:(?P<lower>[a-z]+)|(?P<upper>[A-Z]+))(?P<digits>[0-9]+)$",
    /// )?;
    /// let mut cache = re.create_cache();
    /// let mut caps = Captures::matches(re.get_nfa().group_info().clone());
    ///
    /// re.captures(&mut cache, "ABC123", &mut caps);
    /// assert!(caps.is_match());
    /// assert_eq!(Some(Match::must(0, 0..6)), caps.get_match());
    /// // We didn't ask for capturing group offsets, so they aren't available.
    /// assert_eq!(None, caps.get_group_by_name("lower"));
    /// assert_eq!(None, caps.get_group_by_name("upper"));
    /// assert_eq!(None, caps.get_group_by_name("digits"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn matches(group_info: GroupInfo) -> Captures {
        // This is OK because we know there are at least this many slots,
        // and GroupInfo construction guarantees that the number of slots fits
        // into a usize.
        let slots = group_info.pattern_len().checked_mul(2).unwrap();
        Captures { group_info, pid: None, slots: vec![None; slots] }
    }

    /// Create new storage for only tracking which pattern matched. No offsets
    /// are stored at all.
    ///
    /// It is unspecified behavior to use the returned `Captures` value in a
    /// search with a `GroupInfo` other than the one that is provided to this
    /// constructor.
    ///
    /// # Example
    ///
    /// This example shows that only the pattern that matched can be accessed
    /// from a `Captures` value created via this constructor.
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::pikevm::PikeVM,
    ///     util::captures::Captures,
    ///     PatternID,
    /// };
    ///
    /// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
    /// let mut cache = re.create_cache();
    /// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
    ///
    /// re.captures(&mut cache, "aABCz", &mut caps);
    /// assert!(caps.is_match());
    /// assert_eq!(Some(PatternID::must(0)), caps.pattern());
    /// // We didn't ask for any offsets, so they aren't available.
/// assert_eq!(None, caps.get_match());
///
/// re.captures(&mut cache, &"aABCz"[1..], &mut caps);
/// assert!(caps.is_match());
/// assert_eq!(Some(PatternID::must(1)), caps.pattern());
/// // We didn't ask for any offsets, so they aren't available.
/// assert_eq!(None, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn empty(group_info: GroupInfo) -> Captures {
    // No slots at all: only the matching pattern ID can be recorded.
    Captures { group_info, pid: None, slots: vec![] }
}

/// Returns true if and only if this capturing group represents a match.
///
/// This is a convenience routine for `caps.pattern().is_some()`.
///
/// # Example
///
/// When using the PikeVM (for example), the lightest weight way of
/// detecting whether a match exists is to create capturing groups that
/// only track the ID of the pattern that matched (if any):
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     util::captures::Captures,
/// };
///
/// let re = PikeVM::new(r"[a-z]+")?;
/// let mut cache = re.create_cache();
/// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
///
/// re.captures(&mut cache, "aABCz", &mut caps);
/// assert!(caps.is_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn is_match(&self) -> bool {
    self.pid.is_some()
}

/// Returns the identifier of the pattern that matched when this
/// capturing group represents a match. If no match was found, then this
/// always returns `None`.
///
/// This returns a pattern ID in precisely the cases in which `is_match`
/// returns `true`. Similarly, the pattern ID returned is always the
/// same pattern ID found in the `Match` returned by `get_match`.
///
/// # Example
///
/// When using the PikeVM (for example), the lightest weight way of
/// detecting which pattern matched is to create capturing groups that only
/// track the ID of the pattern that matched (if any):
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     util::captures::Captures,
///     PatternID,
/// };
///
/// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
/// let mut cache = re.create_cache();
/// let mut caps = Captures::empty(re.get_nfa().group_info().clone());
///
/// re.captures(&mut cache, "ABC", &mut caps);
/// assert_eq!(Some(PatternID::must(1)), caps.pattern());
/// // Recall that offsets are only available when using a non-empty
/// // Captures value. So even though a match occurred, this returns None!
/// assert_eq!(None, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn pattern(&self) -> Option<PatternID> {
    self.pid
}

/// Returns the pattern ID and the span of the match, if one occurred.
///
/// This always returns `None` when `Captures` was created with
/// [`Captures::empty`], even if a match was found.
///
/// If this routine returns a non-`None` value, then `is_match` is
/// guaranteed to return `true` and `pattern` is also guaranteed to return
/// a non-`None` value.
///
/// # Example
///
/// This example shows how to get the full match from a search:
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
///
/// let re = PikeVM::new_many(&[r"[a-z]+", r"[A-Z]+"])?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "ABC", &mut caps);
/// assert_eq!(Some(Match::must(1, 0..3)), caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn get_match(&self) -> Option<Match> {
    // Both the pattern ID and the span of group 0 must be available.
    Some(Match::new(self.pattern()?, self.get_group(0)?))
}

/// Returns the span of a capturing group match corresponding to the group
/// index given, only if both the overall pattern matched and the capturing
/// group participated in that match.
/// /// This returns `None` if `index` is invalid. `index` is valid if and only /// if it's less than [`Captures::group_len`] for the matching pattern. /// /// This always returns `None` when `Captures` was created with /// [`Captures::empty`], even if a match was found. This also always /// returns `None` for any `index > 0` when `Captures` was created with /// [`Captures::matches`]. /// /// If this routine returns a non-`None` value, then `is_match` is /// guaranteed to return `true`, `pattern` is guaranteed to return a /// non-`None` value and `get_match` is guaranteed to return a non-`None` /// value. /// /// By convention, the 0th capture group will always return the same /// span as the span returned by `get_match`. This is because the 0th /// capture group always corresponds to the entirety of the pattern's /// match. (It is similarly always unnamed because it is implicit.) This /// isn't necessarily true of all regex engines. For example, one can /// hand-compile a [`thompson::NFA`](crate::nfa::thompson::NFA) via a /// [`thompson::Builder`](crate::nfa::thompson::Builder), which isn't /// technically forced to make the 0th capturing group always correspond to /// the entire match. 
/// /// # Example /// /// This example shows how to get the capturing groups, by index, from a /// match: /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; /// /// let re = PikeVM::new(r"^(?P\pL+)\s+(?P\pL+)$")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); /// assert_eq!(Some(Span::from(0..5)), caps.get_group(1)); /// assert_eq!(Some(Span::from(6..17)), caps.get_group(2)); /// // Looking for a non-existent capturing group will return None: /// assert_eq!(None, caps.get_group(3)); /// # // literals are too big for 32-bit usize: #1039 /// # #[cfg(target_pointer_width = "64")] /// assert_eq!(None, caps.get_group(9944060567225171988)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn get_group(&self, index: usize) -> Option { let pid = self.pattern()?; // There's a little bit of work needed to map captures to slots in the // fully general case. But in the overwhelming common case of a single // pattern, we can just do some simple arithmetic. let (slot_start, slot_end) = if self.group_info().pattern_len() == 1 { (index.checked_mul(2)?, index.checked_mul(2)?.checked_add(1)?) } else { self.group_info().slots(pid, index)? }; let start = self.slots.get(slot_start).copied()??; let end = self.slots.get(slot_end).copied()??; Some(Span { start: start.get(), end: end.get() }) } /// Returns the span of a capturing group match corresponding to the group /// name given, only if both the overall pattern matched and the capturing /// group participated in that match. /// /// This returns `None` if `name` does not correspond to a valid capturing /// group for the pattern that matched. /// /// This always returns `None` when `Captures` was created with /// [`Captures::empty`], even if a match was found. 
This also always /// returns `None` for any `index > 0` when `Captures` was created with /// [`Captures::matches`]. /// /// If this routine returns a non-`None` value, then `is_match` is /// guaranteed to return `true`, `pattern` is guaranteed to return a /// non-`None` value and `get_match` is guaranteed to return a non-`None` /// value. /// /// # Example /// /// This example shows how to get the capturing groups, by name, from a /// match: /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span, Match}; /// /// let re = PikeVM::new(r"^(?P\pL+)\s+(?P\pL+)$")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "Bruce Springsteen", &mut caps); /// assert_eq!(Some(Match::must(0, 0..17)), caps.get_match()); /// assert_eq!(Some(Span::from(0..5)), caps.get_group_by_name("first")); /// assert_eq!(Some(Span::from(6..17)), caps.get_group_by_name("last")); /// // Looking for a non-existent capturing group will return None: /// assert_eq!(None, caps.get_group_by_name("middle")); /// /// # Ok::<(), Box>(()) /// ``` pub fn get_group_by_name(&self, name: &str) -> Option { let index = self.group_info().to_index(self.pattern()?, name)?; self.get_group(index) } /// Returns an iterator of possible spans for every capturing group in the /// matching pattern. /// /// If this `Captures` value does not correspond to a match, then the /// iterator returned yields no elements. /// /// Note that the iterator returned yields elements of type `Option`. /// A span is present if and only if it corresponds to a capturing group /// that participated in a match. 
///
/// # Example
///
/// This example shows how to collect all capturing groups:
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
///
/// let re = PikeVM::new(
///     // Matches first/last names, with an optional middle name.
///     r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
/// )?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "Harry James Potter", &mut caps);
/// assert!(caps.is_match());
/// let groups: Vec<Option<Span>> = caps.iter().collect();
/// assert_eq!(groups, vec![
///     Some(Span::from(0..18)),
///     Some(Span::from(0..5)),
///     Some(Span::from(6..11)),
///     Some(Span::from(12..18)),
/// ]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// This example uses the same regex as the previous example, but with a
/// haystack that omits the middle name. This results in a capturing group
/// that is present in the elements yielded by the iterator but without a
/// match:
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Span};
///
/// let re = PikeVM::new(
///     // Matches first/last names, with an optional middle name.
///     r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
/// )?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "Harry Potter", &mut caps);
/// assert!(caps.is_match());
/// let groups: Vec<Option<Span>> = caps.iter().collect();
/// assert_eq!(groups, vec![
///     Some(Span::from(0..12)),
///     Some(Span::from(0..5)),
///     None,
///     Some(Span::from(6..12)),
/// ]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn iter(&self) -> CapturesPatternIter<'_> {
    // Without a match there is no pattern whose groups could be walked, so
    // fall back to an iterator over no names at all.
    let names = match self.pattern() {
        None => GroupInfoPatternNames::empty(),
        Some(pid) => self.group_info().pattern_names(pid),
    };
    CapturesPatternIter { caps: self, names: names.enumerate() }
}

/// Return the total number of capturing groups for the matching pattern.
///
/// If this `Captures` value does not correspond to a match, then this
/// always returns `0`.
///
/// This always returns the same number of elements yielded by
/// [`Captures::iter`]. That is, the number includes capturing groups even
/// if they don't participate in the match.
///
/// # Example
///
/// This example shows how to count the total number of capturing groups
/// associated with a pattern. Notice that it includes groups that did not
/// participate in a match (just like `Captures::iter` does).
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
///
/// let re = PikeVM::new(
///     // Matches first/last names, with an optional middle name.
///     r"^(?P<first>\pL+)\s+(?:(?P<middle>\pL+)\s+)?(?P<last>\pL+)$",
/// )?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "Harry Potter", &mut caps);
/// assert_eq!(4, caps.group_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn group_len(&self) -> usize {
    // No matching pattern means there are no groups to count.
    self.pattern().map_or(0, |pid| self.group_info().group_len(pid))
}

/// Returns a reference to the underlying group info on which these
/// captures are based.
///
/// The difference between `GroupInfo` and `Captures` is that the former
/// defines the structure of capturing groups whereas the latter is what
/// stores the actual match information. So whereas `Captures` only gives
/// you access to the current match, `GroupInfo` lets you query any
/// information about all capturing groups, even ones for patterns that
/// weren't involved in a match.
///
/// Note that a `GroupInfo` uses reference counting internally, so it may
/// be cloned cheaply.
///
/// # Example
///
/// This example shows how to get all capturing group names from the
/// underlying `GroupInfo`. Notice that we don't even need to run a
/// search.
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
///
/// let re = PikeVM::new_many(&[
///     r"(?P<foo>a)",
///     r"(a)(b)",
///     r"ab",
///     r"(?P<bar>a)(?P<quux>a)",
///     r"(?P<foo>z)",
/// ])?;
/// let caps = re.create_captures();
///
/// let expected = vec![
///     (PatternID::must(0), 0, None),
///     (PatternID::must(0), 1, Some("foo")),
///     (PatternID::must(1), 0, None),
///     (PatternID::must(1), 1, None),
///     (PatternID::must(1), 2, None),
///     (PatternID::must(2), 0, None),
///     (PatternID::must(3), 0, None),
///     (PatternID::must(3), 1, Some("bar")),
///     (PatternID::must(3), 2, Some("quux")),
///     (PatternID::must(4), 0, None),
///     (PatternID::must(4), 1, Some("foo")),
/// ];
/// // We could also just use 're.get_nfa().group_info()'.
/// let got: Vec<(PatternID, usize, Option<&str>)> =
///     caps.group_info().all_names().collect();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn group_info(&self) -> &GroupInfo {
    &self.group_info
}

/// Interpolates the capture references in `replacement` with the
/// corresponding substrings in `haystack` matched by each reference. The
/// interpolated string is returned.
///
/// See the [`interpolate` module](interpolate) for documentation on the
/// format of the replacement string.
///
/// # Example
///
/// This example shows how to use interpolation, and also shows how it
/// can work with multi-pattern regexes.
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
///
/// let re = PikeVM::new_many(&[
///     r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
///     r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
/// ])?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
///
/// let replacement = "year=$year, month=$month, day=$day";
///
/// // This matches the first pattern.
/// let hay = "On 14-03-2010, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// let result = caps.interpolate_string(hay, replacement);
/// assert_eq!("year=2010, month=03, day=14", result);
///
/// // And this matches the second pattern.
/// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// let result = caps.interpolate_string(hay, replacement);
/// assert_eq!("year=2010, month=03, day=14", result);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn interpolate_string(
    &self,
    haystack: &str,
    replacement: &str,
) -> String {
    // Thin wrapper: interpolate into a fresh buffer and hand it back.
    let mut dst = String::new();
    self.interpolate_string_into(haystack, replacement, &mut dst);
    dst
}

/// Interpolates the capture references in `replacement` with the
/// corresponding substrings in `haystack` matched by each reference. The
/// interpolated string is written to `dst`.
///
/// See the [`interpolate` module](interpolate) for documentation on the
/// format of the replacement string.
///
/// # Example
///
/// This example shows how to use interpolation, and also shows how it
/// can work with multi-pattern regexes.
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
///
/// let re = PikeVM::new_many(&[
///     r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
///     r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
/// ])?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
///
/// let replacement = "year=$year, month=$month, day=$day";
///
/// // This matches the first pattern.
/// let hay = "On 14-03-2010, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// let mut dst = String::new();
/// caps.interpolate_string_into(hay, replacement, &mut dst);
/// assert_eq!("year=2010, month=03, day=14", dst);
///
/// // And this matches the second pattern.
/// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// let mut dst = String::new();
/// caps.interpolate_string_into(hay, replacement, &mut dst);
/// assert_eq!("year=2010, month=03, day=14", dst);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn interpolate_string_into(
    &self,
    haystack: &str,
    replacement: &str,
    dst: &mut String,
) {
    interpolate::string(
        replacement,
        // Append the text of the referenced group; a group with no span
        // contributes nothing.
        |index, dst| {
            if let Some(span) = self.get_group(index) {
                dst.push_str(&haystack[span]);
            }
        },
        // Resolve a group name to its index for the matching pattern.
        |name| self.group_info().to_index(self.pattern()?, name),
        dst,
    );
}

/// Interpolates the capture references in `replacement` with the
/// corresponding substrings in `haystack` matched by each reference. The
/// interpolated byte string is returned.
///
/// See the [`interpolate` module](interpolate) for documentation on the
/// format of the replacement string.
///
/// # Example
///
/// This example shows how to use interpolation, and also shows how it
/// can work with multi-pattern regexes.
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
///
/// let re = PikeVM::new_many(&[
///     r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
///     r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
/// ])?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
///
/// let replacement = b"year=$year, month=$month, day=$day";
///
/// // This matches the first pattern.
/// let hay = b"On 14-03-2010, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// let result = caps.interpolate_bytes(hay, replacement);
/// assert_eq!(&b"year=2010, month=03, day=14"[..], result);
///
/// // And this matches the second pattern.
/// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// let result = caps.interpolate_bytes(hay, replacement);
/// assert_eq!(&b"year=2010, month=03, day=14"[..], result);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn interpolate_bytes(
    &self,
    haystack: &[u8],
    replacement: &[u8],
) -> Vec<u8> {
    // Thin wrapper: interpolate into a fresh buffer and hand it back.
    let mut dst = vec![];
    self.interpolate_bytes_into(haystack, replacement, &mut dst);
    dst
}

/// Interpolates the capture references in `replacement` with the
/// corresponding substrings in `haystack` matched by each reference. The
/// interpolated byte string is written to `dst`.
///
/// See the [`interpolate` module](interpolate) for documentation on the
/// format of the replacement string.
///
/// # Example
///
/// This example shows how to use interpolation, and also shows how it
/// can work with multi-pattern regexes.
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, PatternID};
///
/// let re = PikeVM::new_many(&[
///     r"(?<day>[0-9]{2})-(?<month>[0-9]{2})-(?<year>[0-9]{4})",
///     r"(?<year>[0-9]{4})-(?<month>[0-9]{2})-(?<day>[0-9]{2})",
/// ])?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
///
/// let replacement = b"year=$year, month=$month, day=$day";
///
/// // This matches the first pattern.
/// let hay = b"On 14-03-2010, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// let mut dst = vec![];
/// caps.interpolate_bytes_into(hay, replacement, &mut dst);
/// assert_eq!(&b"year=2010, month=03, day=14"[..], dst);
///
/// // And this matches the second pattern.
/// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// let mut dst = vec![];
/// caps.interpolate_bytes_into(hay, replacement, &mut dst);
/// assert_eq!(&b"year=2010, month=03, day=14"[..], dst);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn interpolate_bytes_into(
    &self,
    haystack: &[u8],
    replacement: &[u8],
    dst: &mut Vec<u8>,
) {
    interpolate::bytes(
        replacement,
        // Append the bytes of the referenced group; a group with no span
        // contributes nothing.
        |index, dst| {
            if let Some(span) = self.get_group(index) {
                dst.extend_from_slice(&haystack[span]);
            }
        },
        // Resolve a group name to its index for the matching pattern.
        |name| self.group_info().to_index(self.pattern()?, name),
        dst,
    );
}

/// This is a convenience routine for extracting the substrings
/// corresponding to matching capture groups in the given `haystack`. The
/// `haystack` should be the same substring used to find the match spans in
/// this `Captures` value.
///
/// This is identical to [`Captures::extract_bytes`], except it works with
/// `&str` instead of `&[u8]`.
///
/// # Panics
///
/// This panics if the number of explicit matching groups in this
/// `Captures` value is less than `N`. This also panics if this `Captures`
/// value does not correspond to a match.
///
/// Note that this does *not* panic if the number of explicit matching
/// groups is bigger than `N`. In that case, only the first `N` matching
/// groups are extracted.
///
/// # Example
///
/// ```
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
///
/// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
///
/// let hay = "On 2010-03-14, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// assert!(caps.is_match());
/// let (full, [year, month, day]) = caps.extract(hay);
/// assert_eq!("2010-03-14", full);
/// assert_eq!("2010", year);
/// assert_eq!("03", month);
/// assert_eq!("14", day);
///
/// // We can also ask for fewer than all capture groups.
/// let (full, [year]) = caps.extract(hay);
/// assert_eq!("2010-03-14", full);
/// assert_eq!("2010", year);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn extract<'h, const N: usize>(
    &self,
    haystack: &'h str,
) -> (&'h str, [&'h str; N]) {
    // `flatten` drops groups that did not participate in the match, so only
    // spans that are actually present get consumed below.
    let mut matched = self.iter().flatten();
    let whole_match = &haystack[matched.next().expect("a match")];
    // Pull exactly `N` further spans, panicking if there are too few.
    let group_matches = [0; N].map(|_| {
        let sp = matched.next().expect("too few matching groups");
        &haystack[sp]
    });
    (whole_match, group_matches)
}

/// This is a convenience routine for extracting the substrings
/// corresponding to matching capture groups in the given `haystack`. The
/// `haystack` should be the same substring used to find the match spans in
/// this `Captures` value.
///
/// This is identical to [`Captures::extract`], except it works with
/// `&[u8]` instead of `&str`.
///
/// # Panics
///
/// This panics if the number of explicit matching groups in this
/// `Captures` value is less than `N`. This also panics if this `Captures`
/// value does not correspond to a match.
///
/// Note that this does *not* panic if the number of explicit matching
/// groups is bigger than `N`. In that case, only the first `N` matching
/// groups are extracted.
///
/// # Example
///
/// ```
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
///
/// let re = PikeVM::new(r"([0-9]{4})-([0-9]{2})-([0-9]{2})")?;
/// let mut cache = re.create_cache();
/// let mut caps = re.create_captures();
///
/// let hay = b"On 2010-03-14, I became a Tenneessee lamb.";
/// re.captures(&mut cache, hay, &mut caps);
/// assert!(caps.is_match());
/// let (full, [year, month, day]) = caps.extract_bytes(hay);
/// assert_eq!(b"2010-03-14", full);
/// assert_eq!(b"2010", year);
/// assert_eq!(b"03", month);
/// assert_eq!(b"14", day);
///
/// // We can also ask for fewer than all capture groups.
/// let (full, [year]) = caps.extract_bytes(hay); /// assert_eq!(b"2010-03-14", full); /// assert_eq!(b"2010", year); /// /// # Ok::<(), Box>(()) /// ``` pub fn extract_bytes<'h, const N: usize>( &self, haystack: &'h [u8], ) -> (&'h [u8], [&'h [u8]; N]) { let mut matched = self.iter().flatten(); let whole_match = &haystack[matched.next().expect("a match")]; let group_matches = [0; N].map(|_| { let sp = matched.next().expect("too few matching groups"); &haystack[sp] }); (whole_match, group_matches) } } /// Lower level "slot" oriented APIs. One does not typically need to use these /// when executing a search. They are instead mostly intended for folks that /// are writing their own regex engine while reusing this `Captures` type. impl Captures { /// Clear this `Captures` value. /// /// After clearing, all slots inside this `Captures` value will be set to /// `None`. Similarly, any pattern ID that it was previously associated /// with (for a match) is erased. /// /// It is not usually necessary to call this routine. Namely, a `Captures` /// value only provides high level access to the capturing groups of the /// pattern that matched, and only low level access to individual slots. /// Thus, even if slots corresponding to groups that aren't associated /// with the matching pattern are set, then it won't impact the higher /// level APIs. Namely, higher level APIs like [`Captures::get_group`] will /// return `None` if no pattern ID is present, even if there are spans set /// in the underlying slots. /// /// Thus, to "clear" a `Captures` value of a match, it is usually only /// necessary to call [`Captures::set_pattern`] with `None`. /// /// # Example /// /// This example shows what happens when a `Captures` value is cleared. 
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
///
/// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
/// assert!(caps.is_match());
/// let slots: Vec<Option<usize>> =
///     caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
/// // Note that the following ordering is considered an API guarantee.
/// assert_eq!(slots, vec![
///     Some(0),
///     Some(17),
///     Some(0),
///     Some(5),
///     Some(6),
///     Some(17),
/// ]);
///
/// // Now clear the slots. Everything is gone and it is no longer a match.
/// caps.clear();
/// assert!(!caps.is_match());
/// let slots: Vec<Option<usize>> =
///     caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
/// assert_eq!(slots, vec![
///     None,
///     None,
///     None,
///     None,
///     None,
///     None,
/// ]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn clear(&mut self) {
    // Forget the matching pattern and reset every slot; the slot storage
    // itself keeps its length.
    self.pid = None;
    self.slots.fill(None);
}

/// Set the pattern on this `Captures` value.
///
/// When the pattern ID is `None`, then this `Captures` value does not
/// correspond to a match (`is_match` will return `false`). Otherwise, it
/// corresponds to a match.
///
/// This is useful in search implementations where you might want to
/// initially call `set_pattern(None)` in order to avoid the cost of
/// calling `clear()` if it turns out to not be necessary.
///
/// # Example
///
/// This example shows that `set_pattern` merely overwrites the pattern ID.
/// It does not actually change the underlying slot values.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
///
/// let re = PikeVM::new(r"^(?P<first>\pL+)\s+(?P<last>\pL+)$")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
///
/// re.captures(&mut cache, "Bruce Springsteen", &mut caps);
/// assert!(caps.is_match());
/// assert!(caps.pattern().is_some());
/// let slots: Vec<Option<usize>> =
///     caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
/// // Note that the following ordering is considered an API guarantee.
/// assert_eq!(slots, vec![
///     Some(0),
///     Some(17),
///     Some(0),
///     Some(5),
///     Some(6),
///     Some(17),
/// ]);
///
/// // Now set the pattern to None. Note that the slot values remain.
/// caps.set_pattern(None);
/// assert!(!caps.is_match());
/// assert!(!caps.pattern().is_some());
/// let slots: Vec<Option<usize>> =
///     caps.slots().iter().map(|s| s.map(|x| x.get())).collect();
/// // Note that the following ordering is considered an API guarantee.
/// assert_eq!(slots, vec![
///     Some(0),
///     Some(17),
///     Some(0),
///     Some(5),
///     Some(6),
///     Some(17),
/// ]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn set_pattern(&mut self, pid: Option<PatternID>) {
    // Only the pattern ID changes; slot contents are deliberately left
    // untouched.
    self.pid = pid;
}

/// Returns the underlying slots, where each slot stores a single offset.
///
/// Every matching capturing group generally corresponds to two slots: one
/// slot for the starting position and another for the ending position.
/// Typically, either both are present or neither are. (The weasel word
/// "typically" is used here because it really depends on the regex engine
/// implementation. Every sensible regex engine likely adheres to this
/// invariant, and every regex engine in this crate is sensible.)
///
/// Generally speaking, callers should prefer to use higher level routines
/// like [`Captures::get_match`] or [`Captures::get_group`].
/// /// An important note here is that a regex engine may not reset all of the /// slots to `None` values when no match occurs, or even when a match of /// a different pattern occurs. But this depends on how the regex engine /// implementation deals with slots. /// /// # Example /// /// This example shows how to get the underlying slots from a regex match. /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::primitives::{PatternID, NonMaxUsize}, /// }; /// /// let re = PikeVM::new_many(&[ /// r"[a-z]+", /// r"[0-9]+", /// ])?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// re.captures(&mut cache, "123", &mut caps); /// assert_eq!(Some(PatternID::must(1)), caps.pattern()); /// // Note that the only guarantee we have here is that slots 2 and 3 /// // are set to correct values. The contents of the first two slots are /// // unspecified since the 0th pattern did not match. /// let expected = &[ /// None, /// None, /// NonMaxUsize::new(0), /// NonMaxUsize::new(3), /// ]; /// assert_eq!(expected, caps.slots()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn slots(&self) -> &[Option] { &self.slots } /// Returns the underlying slots as a mutable slice, where each slot stores /// a single offset. /// /// This tends to be most useful for regex engine implementations for /// writing offsets for matching capturing groups to slots. /// /// See [`Captures::slots`] for more information about slots. #[inline] pub fn slots_mut(&mut self) -> &mut [Option] { &mut self.slots } } impl core::fmt::Debug for Captures { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let mut dstruct = f.debug_struct("Captures"); dstruct.field("pid", &self.pid); if let Some(pid) = self.pid { dstruct.field("spans", &CapturesDebugMap { pid, caps: self }); } dstruct.finish() } } /// A little helper type to provide a nice map-like debug representation for /// our capturing group spans. 
struct CapturesDebugMap<'a> {
    pid: PatternID,
    caps: &'a Captures,
}

impl<'a> core::fmt::Debug for CapturesDebugMap<'a> {
    /// Writes one map entry per capturing group of `self.pid`, keyed by
    /// group index (plus `/"name"` for named groups).
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        /// Map key rendered as `index` or `index/"name"`.
        struct Key<'a>(usize, Option<&'a str>);

        impl<'a> core::fmt::Debug for Key<'a> {
            fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
                write!(f, "{}", self.0)?;
                if let Some(name) = self.1 {
                    write!(f, "/{:?}", name)?;
                }
                Ok(())
            }
        }

        let mut map = f.debug_map();
        let names = self.caps.group_info().pattern_names(self.pid);
        for (group_index, maybe_name) in names.enumerate() {
            let key = Key(group_index, maybe_name);
            // Groups without a span are printed as `None` rather than
            // being omitted.
            match self.caps.get_group(group_index) {
                None => map.entry(&key, &None::<()>),
                Some(span) => map.entry(&key, &span),
            };
        }
        map.finish()
    }
}

/// An iterator over all capturing groups in a `Captures` value.
///
/// This iterator includes capturing groups that did not participate in a
/// match. See the [`Captures::iter`] method documentation for more details
/// and examples.
///
/// The lifetime parameter `'a` refers to the lifetime of the underlying
/// `Captures` value.
#[derive(Clone, Debug)]
pub struct CapturesPatternIter<'a> {
    caps: &'a Captures,
    names: core::iter::Enumerate<GroupInfoPatternNames<'a>>,
}

impl<'a> Iterator for CapturesPatternIter<'a> {
    type Item = Option<Span>;

    fn next(&mut self) -> Option<Option<Span>> {
        // Only the group index is needed; the name is ignored here.
        let (group_index, _) = self.names.next()?;
        Some(self.caps.get_group(group_index))
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        self.names.size_hint()
    }

    fn count(self) -> usize {
        self.names.count()
    }
}

impl<'a> ExactSizeIterator for CapturesPatternIter<'a> {}
impl<'a> core::iter::FusedIterator for CapturesPatternIter<'a> {}

/// Represents information about capturing groups in a compiled regex.
///
/// The information encapsulated by this type consists of the following. For
/// each pattern:
///
/// * A map from every capture group name to its corresponding capture group
/// index.
/// * A map from every capture group index to its corresponding capture group
/// name.
/// * A map from capture group index to its corresponding slot index. A slot /// refers to one half of a capturing group. That is, a capture slot is either /// the start or end of a capturing group. A slot is usually the mechanism /// by which a regex engine records offsets for each capturing group during a /// search. /// /// A `GroupInfo` uses reference counting internally and is thus cheap to /// clone. /// /// # Mapping from capture groups to slots /// /// One of the main responsibilities of a `GroupInfo` is to build a mapping /// from `(PatternID, u32)` (where the `u32` is a capture index) to something /// called a "slot." As mentioned above, a slot refers to one half of a /// capturing group. Both combined provide the start and end offsets of /// a capturing group that participated in a match. /// /// **The mapping between group indices and slots is an API guarantee.** That /// is, the mapping won't change within a semver compatible release. /// /// Slots exist primarily because this is a convenient mechanism by which /// regex engines report group offsets at search time. For example, the /// [`nfa::thompson::State::Capture`](crate::nfa::thompson::State::Capture) /// NFA state includes the slot index. When a regex engine transitions through /// this state, it will likely use the slot index to write the current haystack /// offset to some region of memory. When a match is found, those slots are /// then reported to the caller, typically via a convenient abstraction like a /// [`Captures`] value. /// /// Because this crate provides first class support for multi-pattern regexes, /// and because of some performance related reasons, the mapping between /// capturing groups and slots is a little complex. However, in the case of a /// single pattern, the mapping can be described very simply: for all capture /// group indices `i`, its corresponding slots are at `i * 2` and `i * 2 + 1`. 
/// Notice that the pattern ID isn't involved at all here: because this
/// only applies to a single-pattern regex, the pattern ID is always `0`.
///
/// In the multi-pattern case, the mapping is a bit more complicated. To talk
/// about it, we must define what we mean by "implicit" vs "explicit"
/// capturing groups:
///
/// * An **implicit** capturing group refers to the capturing group that is
/// present for every pattern automatically, and corresponds to the overall
/// match of a pattern. Every pattern has precisely one implicit capturing
/// group. It is always unnamed and it always corresponds to the capture group
/// index `0`.
/// * An **explicit** capturing group refers to any capturing group that
/// appears in the concrete syntax of the pattern. (Or, if an NFA was hand
/// built without any concrete syntax, it refers to any capturing group with an
/// index greater than `0`.)
///
/// Some examples:
///
/// * `\w+` has one implicit capturing group and zero explicit capturing
/// groups.
/// * `(\w+)` has one implicit group and one explicit group.
/// * `foo(\d+)(?:\pL+)(\d+)` has one implicit group and two explicit groups.
///
/// Turning back to the slot mapping, we can now state it as follows:
///
/// * Given a pattern ID `pid`, the slots for its implicit group are always
/// at `pid * 2` and `pid * 2 + 1`.
/// * Given a pattern ID `0`, the slots for its explicit groups start
/// at `group_info.pattern_len() * 2`.
/// * Given a pattern ID `pid > 0`, the slots for its explicit groups start
/// immediately following where the slots for the explicit groups of `pid - 1`
/// end.
///
/// In particular, while there is a concrete formula one can use to determine
/// where the slots for the implicit group of any pattern are, there is no
/// general formula for determining where the slots for explicit capturing
/// groups are. This is because each pattern can contain a different number
/// of groups.
/// /// The intended way of getting the slots for a particular capturing group /// (whether implicit or explicit) is via the [`GroupInfo::slot`] or /// [`GroupInfo::slots`] method. /// /// See below for a concrete example of how capturing groups get mapped to /// slots. /// /// # Example /// /// This example shows how to build a new `GroupInfo` and query it for /// information. /// /// ``` /// use regex_automata::util::{captures::GroupInfo, primitives::PatternID}; /// /// let info = GroupInfo::new(vec![ /// vec![None, Some("foo")], /// vec![None], /// vec![None, None, None, Some("bar"), None], /// vec![None, None, Some("foo")], /// ])?; /// // The number of patterns being tracked. /// assert_eq!(4, info.pattern_len()); /// // We can query the number of groups for any pattern. /// assert_eq!(2, info.group_len(PatternID::must(0))); /// assert_eq!(1, info.group_len(PatternID::must(1))); /// assert_eq!(5, info.group_len(PatternID::must(2))); /// assert_eq!(3, info.group_len(PatternID::must(3))); /// // An invalid pattern always has zero groups. /// assert_eq!(0, info.group_len(PatternID::must(999))); /// // 2 slots per group /// assert_eq!(22, info.slot_len()); /// /// // We can map a group index for a particular pattern to its name, if /// // one exists. /// assert_eq!(Some("foo"), info.to_name(PatternID::must(3), 2)); /// assert_eq!(None, info.to_name(PatternID::must(2), 4)); /// // Or map a name to its group index. /// assert_eq!(Some(1), info.to_index(PatternID::must(0), "foo")); /// assert_eq!(Some(2), info.to_index(PatternID::must(3), "foo")); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: mapping from capture groups to slots /// /// This example shows the specific mapping from capture group indices for /// each pattern to their corresponding slots. The slot values shown in this /// example are considered an API guarantee. 
/// /// ``` /// use regex_automata::util::{captures::GroupInfo, primitives::PatternID}; /// /// let info = GroupInfo::new(vec![ /// vec![None, Some("foo")], /// vec![None], /// vec![None, None, None, Some("bar"), None], /// vec![None, None, Some("foo")], /// ])?; /// /// // We first show the slots for each pattern's implicit group. /// assert_eq!(Some((0, 1)), info.slots(PatternID::must(0), 0)); /// assert_eq!(Some((2, 3)), info.slots(PatternID::must(1), 0)); /// assert_eq!(Some((4, 5)), info.slots(PatternID::must(2), 0)); /// assert_eq!(Some((6, 7)), info.slots(PatternID::must(3), 0)); /// /// // And now we show the slots for each pattern's explicit group. /// assert_eq!(Some((8, 9)), info.slots(PatternID::must(0), 1)); /// assert_eq!(Some((10, 11)), info.slots(PatternID::must(2), 1)); /// assert_eq!(Some((12, 13)), info.slots(PatternID::must(2), 2)); /// assert_eq!(Some((14, 15)), info.slots(PatternID::must(2), 3)); /// assert_eq!(Some((16, 17)), info.slots(PatternID::must(2), 4)); /// assert_eq!(Some((18, 19)), info.slots(PatternID::must(3), 1)); /// assert_eq!(Some((20, 21)), info.slots(PatternID::must(3), 2)); /// /// // Asking for the slots for an invalid pattern ID or even for an invalid /// // group index for a specific pattern will return None. So for example, /// // you're guaranteed to not get the slots for a different pattern than the /// // one requested. /// assert_eq!(None, info.slots(PatternID::must(5), 0)); /// assert_eq!(None, info.slots(PatternID::must(1), 1)); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug, Default)] pub struct GroupInfo(Arc); impl GroupInfo { /// Creates a new group info from a sequence of patterns, where each /// sequence of patterns yields a sequence of possible group names. The /// index of each pattern in the sequence corresponds to its `PatternID`, /// and the index of each group in each pattern's sequence corresponds to /// its corresponding group index. 
///
/// While this constructor is very generic and therefore perhaps hard to
/// chew on, an example of a valid concrete type that can be passed to
/// this constructor is `Vec<Vec<Option<String>>>`. The outer `Vec`
/// corresponds to the patterns, i.e., one `Vec<Option<String>>` per
/// pattern. The inner `Vec` corresponds to the capturing groups for
/// each pattern. The `Option<String>` corresponds to the name of the
/// capturing group, if present.
///
/// It is legal to pass an empty iterator to this constructor. It will
/// return an empty group info with zero slots. An empty group info is
/// useful for cases where you have no patterns or for cases where slots
/// aren't being used at all (e.g., for most DFAs in this crate).
///
/// # Errors
///
/// This constructor returns an error if the given capturing groups are
/// invalid in some way. Those reasons include, but are not necessarily
/// limited to:
///
/// * Too many patterns (i.e., `PatternID` would overflow).
/// * Too many capturing groups (e.g., `u32` would overflow).
/// * A pattern is given that has no capturing groups. (All patterns must
/// have at least an implicit capturing group at index `0`.)
/// * The capturing group at index `0` has a name. It must be unnamed.
/// * There are duplicate capturing group names within the same pattern.
/// (Multiple capturing groups with the same name may exist, but they
/// must be in different patterns.)
///
/// An example below shows how to trigger some of the above error
/// conditions.
///
/// # Example
///
/// This example shows how to build a new `GroupInfo` and query it for
/// information.
///
/// ```
/// use regex_automata::util::captures::GroupInfo;
///
/// let info = GroupInfo::new(vec![
///     vec![None, Some("foo")],
///     vec![None],
///     vec![None, None, None, Some("bar"), None],
///     vec![None, None, Some("foo")],
/// ])?;
/// // The number of patterns being tracked.
/// assert_eq!(4, info.pattern_len());
/// // 2 slots per group
/// assert_eq!(22, info.slot_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: empty `GroupInfo`
///
/// This example shows how to build a new empty `GroupInfo` and query it
/// for information.
///
/// ```
/// use regex_automata::util::captures::GroupInfo;
///
/// let info = GroupInfo::empty();
/// // Everything is zero.
/// assert_eq!(0, info.pattern_len());
/// assert_eq!(0, info.slot_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: error conditions
///
/// This example shows how to provoke some of the ways in which building
/// a `GroupInfo` can fail.
///
/// ```
/// use regex_automata::util::captures::GroupInfo;
///
/// // Either the group info is empty, or all patterns must have at least
/// // one capturing group.
/// assert!(GroupInfo::new(vec![
///     vec![None, Some("a")], // ok
///     vec![None], // ok
///     vec![], // not ok
/// ]).is_err());
/// // Note that building an empty group info is OK.
/// assert!(GroupInfo::new(Vec::<Vec<Option<String>>>::new()).is_ok());
///
/// // The first group in each pattern must correspond to an implicit
/// // anonymous group. i.e., One that is not named. By convention, this
/// // group corresponds to the overall match of a regex. Every other group
/// // in a pattern is explicit and optional.
/// assert!(GroupInfo::new(vec![vec![Some("foo")]]).is_err());
///
/// // There must not be duplicate group names within the same pattern.
/// assert!(GroupInfo::new(vec![
///     vec![None, Some("foo"), Some("foo")],
/// ]).is_err());
/// // But duplicate names across distinct patterns is OK.
/// assert!(GroupInfo::new(vec![
///     vec![None, Some("foo")],
///     vec![None, Some("foo")],
/// ]).is_ok());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// There are other ways for building a `GroupInfo` to fail, but they are
/// difficult to show. For example, if the number of patterns given would
/// overflow `PatternID`.
pub fn new(pattern_groups: P) -> Result where P: IntoIterator, G: IntoIterator>, N: AsRef, { let mut group_info = GroupInfoInner { slot_ranges: vec![], name_to_index: vec![], index_to_name: vec![], memory_extra: 0, }; for (pattern_index, groups) in pattern_groups.into_iter().enumerate() { // If we can't convert the pattern index to an ID, then the caller // tried to build capture info for too many patterns. let pid = PatternID::new(pattern_index) .map_err(GroupInfoError::too_many_patterns)?; let mut groups_iter = groups.into_iter().enumerate(); match groups_iter.next() { None => return Err(GroupInfoError::missing_groups(pid)), Some((_, Some(_))) => { return Err(GroupInfoError::first_must_be_unnamed(pid)) } Some((_, None)) => {} } group_info.add_first_group(pid); // Now iterate over the rest, which correspond to all of the // (conventionally) explicit capture groups in a regex pattern. for (group_index, maybe_name) in groups_iter { // Just like for patterns, if the group index can't be // converted to a "small" index, then the caller has given too // many groups for a particular pattern. let group = SmallIndex::new(group_index).map_err(|_| { GroupInfoError::too_many_groups(pid, group_index) })?; group_info.add_explicit_group(pid, group, maybe_name)?; } } group_info.fixup_slot_ranges()?; Ok(GroupInfo(Arc::new(group_info))) } /// This creates an empty `GroupInfo`. /// /// This is a convenience routine for calling `GroupInfo::new` with an /// iterator that yields no elements. /// /// # Example /// /// This example shows how to build a new empty `GroupInfo` and query it /// for information. /// /// ``` /// use regex_automata::util::captures::GroupInfo; /// /// let info = GroupInfo::empty(); /// // Everything is zero. 
/// assert_eq!(0, info.pattern_len());
/// assert_eq!(0, info.all_group_len());
/// assert_eq!(0, info.slot_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn empty() -> GroupInfo {
    // The array type only pins down the generic parameters of `new`; the
    // iterator itself yields nothing.
    GroupInfo::new(core::iter::empty::<[Option<&str>; 0]>())
        .expect("empty group info is always valid")
}

/// Return the capture group index corresponding to the given name in the
/// given pattern. If no such capture group name exists in the given
/// pattern, then this returns `None`.
///
/// If the given pattern ID is invalid, then this returns `None`.
///
/// This also returns `None` for all inputs if this `GroupInfo` is empty.
/// To check whether captures are present for a specific pattern, use
/// [`GroupInfo::group_len`].
///
/// # Example
///
/// This example shows how to find the capture index for the given pattern
/// and group name.
///
/// Remember that capture indices are relative to the pattern, such that
/// the same capture index value may refer to different capturing groups
/// for distinct patterns.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{nfa::thompson::NFA, PatternID};
///
/// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1));
///
/// let nfa = NFA::new_many(&[
///     r"a(?P<quux>\w+)z(?P<foo>\s+)",
///     r"a(?P<foo>\d+)z",
/// ])?;
/// let groups = nfa.group_info();
/// assert_eq!(Some(2), groups.to_index(pid0, "foo"));
/// // Recall that capture index 0 is always unnamed and refers to the
/// // entire pattern. So the first capturing group present in the pattern
/// // itself always starts at index 1.
/// assert_eq!(Some(1), groups.to_index(pid1, "foo"));
///
/// // And if a name does not exist for a particular pattern, None is
/// // returned.
/// assert!(groups.to_index(pid0, "quux").is_some());
/// assert!(groups.to_index(pid1, "quux").is_none());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn to_index(&self, pid: PatternID, name: &str) -> Option<usize> {
    // An out-of-range pattern ID has no name map, which yields `None`.
    let indices = self.0.name_to_index.get(pid.as_usize())?;
    indices.get(name).cloned().map(|i| i.as_usize())
}

/// Return the capture name for the given index and given pattern. If the
/// corresponding group does not have a name, then this returns `None`.
///
/// If the pattern ID is invalid, then this returns `None`.
///
/// If the group index is invalid for the given pattern, then this returns
/// `None`. A group `index` is valid for a pattern `pid` if and only if
/// `index < group_info.group_len(pid)`.
///
/// This also returns `None` for all inputs if this `GroupInfo` is empty.
/// To check whether captures are present for a specific pattern, use
/// [`GroupInfo::group_len`].
///
/// # Example
///
/// This example shows how to find the capture group name for the given
/// pattern and group index.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{nfa::thompson::NFA, PatternID};
///
/// let (pid0, pid1) = (PatternID::must(0), PatternID::must(1));
///
/// let nfa = NFA::new_many(&[
///     r"a(?P<foo>\w+)z(\s+)x(\d+)",
///     r"a(\d+)z(?P<foo>\s+)",
/// ])?;
/// let groups = nfa.group_info();
/// assert_eq!(None, groups.to_name(pid0, 0));
/// assert_eq!(Some("foo"), groups.to_name(pid0, 1));
/// assert_eq!(None, groups.to_name(pid0, 2));
/// assert_eq!(None, groups.to_name(pid0, 3));
///
/// assert_eq!(None, groups.to_name(pid1, 0));
/// assert_eq!(None, groups.to_name(pid1, 1));
/// assert_eq!(Some("foo"), groups.to_name(pid1, 2));
/// // '3' is not a valid capture index for the second pattern.
/// assert_eq!(None, groups.to_name(pid1, 3));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn to_name(&self, pid: PatternID, group_index: usize) -> Option<&str> {
    // Both lookups are checked, so an invalid pattern ID or group index
    // yields `None` rather than a panic.
    let pattern_names = self.0.index_to_name.get(pid.as_usize())?;
    pattern_names.get(group_index)?.as_deref()
}

/// Return an iterator of all capture groups and their names (if present)
/// for a particular pattern.
///
/// If the given pattern ID is invalid or if this `GroupInfo` is empty,
/// then the iterator yields no elements.
///
/// The number of elements yielded by this iterator is always equal to
/// the result of calling [`GroupInfo::group_len`] with the same
/// `PatternID`.
///
/// # Example
///
/// This example shows how to get a list of all capture group names for
/// a particular pattern.
///
/// ```
/// use regex_automata::{nfa::thompson::NFA, PatternID};
///
/// let nfa = NFA::new(r"(a)(?P<foo>b)(c)(d)(?P<bar>e)")?;
/// // The first is the implicit group that is always unnamed. The next
/// // 5 groups are the explicit groups found in the concrete syntax above.
/// let expected = vec![None, None, Some("foo"), None, None, Some("bar")];
/// let got: Vec<Option<&str>> =
///     nfa.group_info().pattern_names(PatternID::ZERO).collect();
/// assert_eq!(expected, got);
///
/// // Using an invalid pattern ID will result in nothing yielded.
/// let got = nfa.group_info().pattern_names(PatternID::must(999)).count();
/// assert_eq!(0, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn pattern_names(&self, pid: PatternID) -> GroupInfoPatternNames<'_> {
    GroupInfoPatternNames {
        // Fall back to an empty slice iterator for an invalid pattern ID.
        it: self
            .0
            .index_to_name
            .get(pid.as_usize())
            .map(|indices| indices.iter())
            .unwrap_or([].iter()),
    }
}

/// Return an iterator of all capture groups for all patterns supported by
/// this `GroupInfo`. Each item yielded is a triple of the group's pattern
/// ID, index in the pattern and the group's name, if present.
///
/// # Example
///
/// This example shows how to get a list of all capture groups found in
/// one NFA, potentially spanning multiple patterns.
///
/// ```
/// use regex_automata::{nfa::thompson::NFA, PatternID};
///
/// let nfa = NFA::new_many(&[
///     r"(?P<foo>a)",
///     r"a",
///     r"(a)",
/// ])?;
/// let expected = vec![
///     (PatternID::must(0), 0, None),
///     (PatternID::must(0), 1, Some("foo")),
///     (PatternID::must(1), 0, None),
///     (PatternID::must(2), 0, None),
///     (PatternID::must(2), 1, None),
/// ];
/// let got: Vec<(PatternID, usize, Option<&str>)> =
///     nfa.group_info().all_names().collect();
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Unlike other capturing group related routines, this routine doesn't
/// panic even if captures aren't enabled on this NFA:
///
/// ```
/// use regex_automata::nfa::thompson::{NFA, WhichCaptures};
///
/// let nfa = NFA::compiler()
///     .configure(NFA::config().which_captures(WhichCaptures::None))
///     .build_many(&[
///         r"(?P<foo>a)",
///         r"a",
///         r"(a)",
///     ])?;
/// // When captures aren't enabled, there's nothing to return.
/// assert_eq!(0, nfa.group_info().all_names().count());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn all_names(&self) -> GroupInfoAllNames<'_> {
    GroupInfoAllNames {
        group_info: self,
        pids: PatternID::iter(self.pattern_len()),
        current_pid: None,
        names: None,
    }
}

/// Returns the starting and ending slot corresponding to the given
/// capturing group for the given pattern. The ending slot is always one
/// more than the starting slot returned.
///
/// Note that this is like [`GroupInfo::slot`], except that it also returns
/// the ending slot value for convenience.
///
/// If either the pattern ID or the capture index is invalid, then this
/// returns None.
///
/// # Example
///
/// This example shows that the starting slots for the first capturing
/// group of each pattern are distinct.
/// /// ``` /// use regex_automata::{nfa::thompson::NFA, PatternID}; /// /// let nfa = NFA::new_many(&["a", "b"])?; /// assert_ne!( /// nfa.group_info().slots(PatternID::must(0), 0), /// nfa.group_info().slots(PatternID::must(1), 0), /// ); /// /// // Also, the start and end slot values are never equivalent. /// let (start, end) = nfa.group_info().slots(PatternID::ZERO, 0).unwrap(); /// assert_ne!(start, end); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn slots( &self, pid: PatternID, group_index: usize, ) -> Option<(usize, usize)> { // Since 'slot' only even returns valid starting slots, we know that // there must also be an end slot and that end slot is always one more // than the start slot. self.slot(pid, group_index).map(|start| (start, start + 1)) } /// Returns the starting slot corresponding to the given capturing group /// for the given pattern. The ending slot is always one more than the /// value returned. /// /// If either the pattern ID or the capture index is invalid, then this /// returns None. /// /// # Example /// /// This example shows that the starting slots for the first capturing /// group of each pattern are distinct. /// /// ``` /// use regex_automata::{nfa::thompson::NFA, PatternID}; /// /// let nfa = NFA::new_many(&["a", "b"])?; /// assert_ne!( /// nfa.group_info().slot(PatternID::must(0), 0), /// nfa.group_info().slot(PatternID::must(1), 0), /// ); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn slot(&self, pid: PatternID, group_index: usize) -> Option { if group_index >= self.group_len(pid) { return None; } // At this point, we know that 'pid' refers to a real pattern and that // 'group_index' refers to a real group. We therefore also know that // the pattern and group can be combined to return a correct slot. // That's why we don't need to use checked arithmetic below. 
if group_index == 0 { Some(pid.as_usize() * 2) } else { // As above, we don't need to check that our slot is less than the // end of our range since we already know the group index is a // valid index for the given pattern. let (start, _) = self.0.slot_ranges[pid]; Some(start.as_usize() + ((group_index - 1) * 2)) } } /// Returns the total number of patterns in this `GroupInfo`. /// /// This may return zero if the `GroupInfo` was constructed with no /// patterns. /// /// This is guaranteed to be no bigger than [`PatternID::LIMIT`] because /// `GroupInfo` construction will fail if too many patterns are added. /// /// # Example /// /// ``` /// use regex_automata::nfa::thompson::NFA; /// /// let nfa = NFA::new_many(&["[0-9]+", "[a-z]+", "[A-Z]+"])?; /// assert_eq!(3, nfa.group_info().pattern_len()); /// /// let nfa = NFA::never_match(); /// assert_eq!(0, nfa.group_info().pattern_len()); /// /// let nfa = NFA::always_match(); /// assert_eq!(1, nfa.group_info().pattern_len()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn pattern_len(&self) -> usize { self.0.pattern_len() } /// Return the number of capture groups in a pattern. /// /// If the pattern ID is invalid, then this returns `0`. /// /// # Example /// /// This example shows how the values returned by this routine may vary /// for different patterns and NFA configurations. /// /// ``` /// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID}; /// /// let nfa = NFA::new(r"(a)(b)(c)")?; /// // There are 3 explicit groups in the pattern's concrete syntax and /// // 1 unnamed and implicit group spanning the entire pattern. /// assert_eq!(4, nfa.group_info().group_len(PatternID::ZERO)); /// /// let nfa = NFA::new(r"abc")?; /// // There is just the unnamed implicit group. 
/// assert_eq!(1, nfa.group_info().group_len(PatternID::ZERO));
///
/// let nfa = NFA::compiler()
///     .configure(NFA::config().which_captures(WhichCaptures::None))
///     .build(r"abc")?;
/// // We disabled capturing groups, so there are none.
/// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
///
/// let nfa = NFA::compiler()
///     .configure(NFA::config().which_captures(WhichCaptures::None))
///     .build(r"(a)(b)(c)")?;
/// // We disabled capturing groups, so there are none, even if there are
/// // explicit groups in the concrete syntax.
/// assert_eq!(0, nfa.group_info().group_len(PatternID::ZERO));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn group_len(&self, pid: PatternID) -> usize {
    self.0.group_len(pid)
}

/// Return the total number of capture groups across all patterns.
///
/// This includes implicit groups that represent the entire match of a
/// pattern.
///
/// # Example
///
/// This example shows how the values returned by this routine may vary
/// for different patterns and NFA configurations.
///
/// ```
/// use regex_automata::{nfa::thompson::{NFA, WhichCaptures}, PatternID};
///
/// let nfa = NFA::new(r"(a)(b)(c)")?;
/// // There are 3 explicit groups in the pattern's concrete syntax and
/// // 1 unnamed and implicit group spanning the entire pattern.
/// assert_eq!(4, nfa.group_info().all_group_len());
///
/// let nfa = NFA::new(r"abc")?;
/// // There is just the unnamed implicit group.
/// assert_eq!(1, nfa.group_info().all_group_len());
///
/// let nfa = NFA::new_many(&["(a)", "b", "(c)"])?;
/// // Each pattern has one implicit group, and two
/// // patterns have one explicit group each.
/// assert_eq!(5, nfa.group_info().all_group_len());
///
/// let nfa = NFA::compiler()
///     .configure(NFA::config().which_captures(WhichCaptures::None))
///     .build(r"abc")?;
/// // We disabled capturing groups, so there are none.
/// assert_eq!(0, nfa.group_info().all_group_len());
///
/// let nfa = NFA::compiler()
///     .configure(NFA::config().which_captures(WhichCaptures::None))
///     .build(r"(a)(b)(c)")?;
/// // We disabled capturing groups, so there are none, even if there are
/// // explicit groups in the concrete syntax.
/// assert_eq!(0, nfa.group_info().all_group_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn all_group_len(&self) -> usize {
    // Every group, implicit or explicit, owns exactly two slots.
    self.slot_len() / 2
}

/// Returns the total number of slots in this `GroupInfo` across all
/// patterns.
///
/// The total number of slots is always twice the total number of capturing
/// groups, including both implicit and explicit groups.
///
/// # Example
///
/// This example shows the relationship between the number of capturing
/// groups and slots.
///
/// ```
/// use regex_automata::util::captures::GroupInfo;
///
/// // There are 11 total groups here.
/// let info = GroupInfo::new(vec![
///     vec![None, Some("foo")],
///     vec![None],
///     vec![None, None, None, Some("bar"), None],
///     vec![None, None, Some("foo")],
/// ])?;
/// // 2 slots per group gives us 11*2=22 slots.
/// assert_eq!(22, info.slot_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn slot_len(&self) -> usize {
    self.0.small_slot_len().as_usize()
}

/// Returns the total number of slots for implicit capturing groups.
///
/// This is like [`GroupInfo::slot_len`], except it doesn't include the
/// explicit slots for each pattern. Since there are always exactly 2
/// implicit slots for each pattern, the number of implicit slots is always
/// equal to twice the number of patterns.
///
/// # Example
///
/// This example shows the relationship between the number of capturing
/// groups, implicit slots and explicit slots.
///
/// ```
/// use regex_automata::util::captures::GroupInfo;
///
/// // There are 3 total groups here.
/// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
/// // 2 slots per group gives us 3*2=6 slots.
/// assert_eq!(6, info.slot_len());
/// // 2 implicit slots per pattern gives us 2 implicit slots since there
/// // is 1 pattern.
/// assert_eq!(2, info.implicit_slot_len());
/// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
/// assert_eq!(4, info.explicit_slot_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn implicit_slot_len(&self) -> usize {
    self.pattern_len() * 2
}

/// Returns the total number of slots for explicit capturing groups.
///
/// This is like [`GroupInfo::slot_len`], except it doesn't include the
/// implicit slots for each pattern. (There are always 2 implicit slots for
/// each pattern.)
///
/// For a non-empty `GroupInfo`, it is always the case that `slot_len` is
/// strictly greater than `explicit_slot_len`. For an empty `GroupInfo`,
/// both the total number of slots and the number of explicit slots is
/// `0`.
///
/// # Example
///
/// This example shows the relationship between the number of capturing
/// groups, implicit slots and explicit slots.
///
/// ```
/// use regex_automata::util::captures::GroupInfo;
///
/// // There are 3 total groups here.
/// let info = GroupInfo::new(vec![vec![None, Some("foo"), Some("bar")]])?;
/// // 2 slots per group gives us 3*2=6 slots.
/// assert_eq!(6, info.slot_len());
/// // 2 implicit slots per pattern gives us 2 implicit slots since there
/// // is 1 pattern.
/// assert_eq!(2, info.implicit_slot_len());
/// // 2 explicit capturing groups gives us 2*2=4 explicit slots.
/// assert_eq!(4, info.explicit_slot_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn explicit_slot_len(&self) -> usize {
    self.slot_len().saturating_sub(self.implicit_slot_len())
}

/// Returns the memory usage, in bytes, of this `GroupInfo`.
///
/// This does **not** include the stack size used up by this `GroupInfo`.
/// To compute that, use `std::mem::size_of::<GroupInfo>()`.
#[inline]
pub fn memory_usage(&self) -> usize {
    use core::mem::size_of as s;

    // The inner struct itself, plus the backing storage of each of its
    // vectors, plus the per-name heap usage tracked during construction.
    s::<GroupInfoInner>()
        + self.0.slot_ranges.len() * s::<(SmallIndex, SmallIndex)>()
        + self.0.name_to_index.len() * s::<CaptureNameMap>()
        + self.0.index_to_name.len() * s::<Vec<Option<Arc<str>>>>()
        + self.0.memory_extra
}
}

/// A map from capture group name to its corresponding capture group index.
///
/// This type is actually wrapped inside a Vec indexed by pattern ID on a
/// `GroupInfo`, since multiple patterns may have the same capture group name.
/// That is, each pattern gets its own namespace of capture group names.
///
/// Perhaps a more memory efficient representation would be
/// HashMap<(PatternID, Arc<str>), usize>, but this makes it difficult to look
/// up a capture index by name without producing an `Arc<str>`, which requires
/// an allocation. To fix this, I think we'd need to define our own unsized
/// type or something? Anyway, I didn't give this much thought since it
/// probably doesn't matter much in the grand scheme of things. But it did
/// stand out to me as mildly wasteful.
#[cfg(feature = "std")]
type CaptureNameMap = std::collections::HashMap<Arc<str>, SmallIndex>;
#[cfg(not(feature = "std"))]
type CaptureNameMap = alloc::collections::BTreeMap<Arc<str>, SmallIndex>;

/// The inner guts of `GroupInfo`. This type only exists so that it can
/// be wrapped in an `Arc` to make `GroupInfo` reference counted.
#[derive(Debug, Default)]
struct GroupInfoInner {
    slot_ranges: Vec<(SmallIndex, SmallIndex)>,
    name_to_index: Vec<CaptureNameMap>,
    index_to_name: Vec<Vec<Option<Arc<str>>>>,
    memory_extra: usize,
}

impl GroupInfoInner {
/// This adds the first unnamed group for the given pattern ID. The given
/// pattern ID must be zero if this is the first time this method is
/// called, or must be exactly one more than the pattern ID supplied to the
/// previous call to this method. (This method panics if this rule is
/// violated.)
///
/// This can be thought of as initializing the GroupInfo state for the
/// given pattern and closing off the state for any previous pattern.
fn add_first_group(&mut self, pid: PatternID) { assert_eq!(pid.as_usize(), self.slot_ranges.len()); assert_eq!(pid.as_usize(), self.name_to_index.len()); assert_eq!(pid.as_usize(), self.index_to_name.len()); // This is the start of our slots for the explicit capturing groups. // Note that since the slots for the 0th group for every pattern appear // before any slots for the nth group (where n > 0) in any pattern, we // will have to fix up the slot ranges once we know how many patterns // we've added capture groups for. let slot_start = self.small_slot_len(); self.slot_ranges.push((slot_start, slot_start)); self.name_to_index.push(CaptureNameMap::new()); self.index_to_name.push(vec![None]); self.memory_extra += core::mem::size_of::>>(); } /// Add an explicit capturing group for the given pattern with the given /// index. If the group has a name, then that must be given as well. /// /// Note that every capturing group except for the first or zeroth group is /// explicit. /// /// This returns an error if adding this group would result in overflowing /// slot indices or if a capturing group with the same name for this /// pattern has already been added. fn add_explicit_group>( &mut self, pid: PatternID, group: SmallIndex, maybe_name: Option, ) -> Result<(), GroupInfoError> { // We also need to check that the slot index generated for // this group is also valid. Although, this is a little weird // because we offset these indices below, at which point, we'll // have to recheck them. Gosh this is annoying. Note that // the '+2' below is OK because 'end' is guaranteed to be less // than isize::MAX. 
let end = &mut self.slot_ranges[pid].1; *end = SmallIndex::new(end.as_usize() + 2).map_err(|_| { GroupInfoError::too_many_groups(pid, group.as_usize()) })?; if let Some(name) = maybe_name { let name = Arc::::from(name.as_ref()); if self.name_to_index[pid].contains_key(&*name) { return Err(GroupInfoError::duplicate(pid, &name)); } let len = name.len(); self.name_to_index[pid].insert(Arc::clone(&name), group); self.index_to_name[pid].push(Some(name)); // Adds the memory used by the Arc in both maps. self.memory_extra += 2 * (len + core::mem::size_of::>>()); // And also the value entry for the 'name_to_index' map. // This is probably an underestimate for 'name_to_index' since // hashmaps/btrees likely have some non-zero overhead, but we // assume here that they have zero overhead. self.memory_extra += core::mem::size_of::(); } else { self.index_to_name[pid].push(None); self.memory_extra += core::mem::size_of::>>(); } // This is a sanity assert that checks that our group index // is in line with the number of groups added so far for this // pattern. assert_eq!(group.one_more(), self.group_len(pid)); // And is also in line with the 'index_to_name' map. assert_eq!(group.one_more(), self.index_to_name[pid].len()); Ok(()) } /// This corrects the slot ranges to account for the slots corresponding /// to the zeroth group of each pattern. That is, every slot range is /// offset by 'pattern_len() * 2', since each pattern uses two slots to /// represent the zeroth group. fn fixup_slot_ranges(&mut self) -> Result<(), GroupInfoError> { use crate::util::primitives::IteratorIndexExt; // Since we know number of patterns fits in PatternID and // PatternID::MAX < isize::MAX, it follows that multiplying by 2 will // never overflow usize. 
let offset = self.pattern_len().checked_mul(2).unwrap(); for (pid, &mut (ref mut start, ref mut end)) in self.slot_ranges.iter_mut().with_pattern_ids() { let group_len = 1 + ((end.as_usize() - start.as_usize()) / 2); let new_end = match end.as_usize().checked_add(offset) { Some(new_end) => new_end, None => { return Err(GroupInfoError::too_many_groups( pid, group_len, )) } }; *end = SmallIndex::new(new_end).map_err(|_| { GroupInfoError::too_many_groups(pid, group_len) })?; // Since start <= end, if end is valid then start must be too. *start = SmallIndex::new(start.as_usize() + offset).unwrap(); } Ok(()) } /// Return the total number of patterns represented by this capture slot /// info. fn pattern_len(&self) -> usize { self.slot_ranges.len() } /// Return the total number of capturing groups for the given pattern. If /// the given pattern isn't valid for this capture slot info, then 0 is /// returned. fn group_len(&self, pid: PatternID) -> usize { let (start, end) = match self.slot_ranges.get(pid.as_usize()) { None => return 0, Some(range) => range, }; // The difference between any two SmallIndex values always fits in a // usize since we know that SmallIndex::MAX <= isize::MAX-1. We also // know that start<=end by construction and that the number of groups // never exceeds SmallIndex and thus never overflows usize. 1 + ((end.as_usize() - start.as_usize()) / 2) } /// Return the total number of slots in this capture slot info as a /// "small index." fn small_slot_len(&self) -> SmallIndex { // Since slots are allocated in order of pattern (starting at 0) and // then in order of capture group, it follows that the number of slots // is the end of the range of slots for the last pattern. This is // true even when the last pattern has no capturing groups, since // 'slot_ranges' will still represent it explicitly with an empty // range. self.slot_ranges.last().map_or(SmallIndex::ZERO, |&(_, end)| end) } } /// An error that may occur when building a `GroupInfo`. 
/// /// Building a `GroupInfo` does a variety of checks to make sure the /// capturing groups satisfy a number of invariants. This includes, but is not /// limited to, ensuring that the first capturing group is unnamed and that /// there are no duplicate capture groups for a specific pattern. #[derive(Clone, Debug)] pub struct GroupInfoError { kind: GroupInfoErrorKind, } /// The kind of error that occurs when building a `GroupInfo` fails. /// /// We keep this un-exported because it's not clear how useful it is to /// export it. #[derive(Clone, Debug)] enum GroupInfoErrorKind { /// This occurs when too many patterns have been added. i.e., It would /// otherwise overflow a `PatternID`. TooManyPatterns { err: PatternIDError }, /// This occurs when too many capturing groups have been added for a /// particular pattern. TooManyGroups { /// The ID of the pattern that had too many groups. pattern: PatternID, /// The minimum number of groups that the caller has tried to add for /// a pattern. minimum: usize, }, /// An error that occurs when a pattern has no capture groups. Either the /// group info must be empty, or all patterns must have at least one group /// (corresponding to the unnamed group for the entire pattern). MissingGroups { /// The ID of the pattern that had no capturing groups. pattern: PatternID, }, /// An error that occurs when one tries to provide a name for the capture /// group at index 0. This capturing group must currently always be /// unnamed. FirstMustBeUnnamed { /// The ID of the pattern that was found to have a named first /// capturing group. pattern: PatternID, }, /// An error that occurs when duplicate capture group names for the same /// pattern are added. /// /// NOTE: At time of writing, this error can never occur if you're using /// regex-syntax, since the parser itself will reject patterns with /// duplicate capture group names. This error can only occur when the /// builder is used to hand construct NFAs. 
Duplicate { /// The pattern in which the duplicate capture group name was found. pattern: PatternID, /// The duplicate name. name: String, }, } impl GroupInfoError { fn too_many_patterns(err: PatternIDError) -> GroupInfoError { GroupInfoError { kind: GroupInfoErrorKind::TooManyPatterns { err } } } fn too_many_groups(pattern: PatternID, minimum: usize) -> GroupInfoError { GroupInfoError { kind: GroupInfoErrorKind::TooManyGroups { pattern, minimum }, } } fn missing_groups(pattern: PatternID) -> GroupInfoError { GroupInfoError { kind: GroupInfoErrorKind::MissingGroups { pattern } } } fn first_must_be_unnamed(pattern: PatternID) -> GroupInfoError { GroupInfoError { kind: GroupInfoErrorKind::FirstMustBeUnnamed { pattern }, } } fn duplicate(pattern: PatternID, name: &str) -> GroupInfoError { GroupInfoError { kind: GroupInfoErrorKind::Duplicate { pattern, name: String::from(name), }, } } } #[cfg(feature = "std")] impl std::error::Error for GroupInfoError { fn source(&self) -> Option<&(dyn std::error::Error + 'static)> { match self.kind { GroupInfoErrorKind::TooManyPatterns { .. } | GroupInfoErrorKind::TooManyGroups { .. } | GroupInfoErrorKind::MissingGroups { .. } | GroupInfoErrorKind::FirstMustBeUnnamed { .. } | GroupInfoErrorKind::Duplicate { .. 
} => None, } } } impl core::fmt::Display for GroupInfoError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { use self::GroupInfoErrorKind::*; match self.kind { TooManyPatterns { ref err } => { write!(f, "too many patterns to build capture info: {}", err) } TooManyGroups { pattern, minimum } => { write!( f, "too many capture groups (at least {}) were \ found for pattern {}", minimum, pattern.as_usize() ) } MissingGroups { pattern } => write!( f, "no capturing groups found for pattern {} \ (either all patterns have zero groups or all patterns have \ at least one group)", pattern.as_usize(), ), FirstMustBeUnnamed { pattern } => write!( f, "first capture group (at index 0) for pattern {} has a name \ (it must be unnamed)", pattern.as_usize(), ), Duplicate { pattern, ref name } => write!( f, "duplicate capture group name '{}' found for pattern {}", name, pattern.as_usize(), ), } } } /// An iterator over capturing groups and their names for a specific pattern. /// /// This iterator is created by [`GroupInfo::pattern_names`]. /// /// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo` /// from which this iterator was created. #[derive(Clone, Debug)] pub struct GroupInfoPatternNames<'a> { it: core::slice::Iter<'a, Option>>, } impl GroupInfoPatternNames<'static> { fn empty() -> GroupInfoPatternNames<'static> { GroupInfoPatternNames { it: [].iter() } } } impl<'a> Iterator for GroupInfoPatternNames<'a> { type Item = Option<&'a str>; fn next(&mut self) -> Option> { self.it.next().map(|x| x.as_deref()) } fn size_hint(&self) -> (usize, Option) { self.it.size_hint() } fn count(self) -> usize { self.it.count() } } impl<'a> ExactSizeIterator for GroupInfoPatternNames<'a> {} impl<'a> core::iter::FusedIterator for GroupInfoPatternNames<'a> {} /// An iterator over capturing groups and their names for a `GroupInfo`. /// /// This iterator is created by [`GroupInfo::all_names`]. 
/// /// The lifetime parameter `'a` refers to the lifetime of the `GroupInfo` /// from which this iterator was created. #[derive(Debug)] pub struct GroupInfoAllNames<'a> { group_info: &'a GroupInfo, pids: PatternIDIter, current_pid: Option, names: Option>>, } impl<'a> Iterator for GroupInfoAllNames<'a> { type Item = (PatternID, usize, Option<&'a str>); fn next(&mut self) -> Option<(PatternID, usize, Option<&'a str>)> { // If the group info has no captures, then we never have anything // to yield. We need to consider this case explicitly (at time of // writing) because 'pattern_capture_names' will panic if captures // aren't enabled. if self.group_info.0.index_to_name.is_empty() { return None; } if self.current_pid.is_none() { self.current_pid = Some(self.pids.next()?); } let pid = self.current_pid.unwrap(); if self.names.is_none() { self.names = Some(self.group_info.pattern_names(pid).enumerate()); } let (group_index, name) = match self.names.as_mut().unwrap().next() { Some((group_index, name)) => (group_index, name), None => { self.current_pid = None; self.names = None; return self.next(); } }; Some((pid, group_index, name)) } } regex-automata-0.4.9/src/util/determinize/mod.rs000064400000000000000000000753251046102023000200050ustar 00000000000000/*! This module contains types and routines for implementing determinization. In this crate, there are at least two places where we implement determinization: fully ahead-of-time compiled DFAs in the `dfa` module and lazily compiled DFAs in the `hybrid` module. The stuff in this module corresponds to the things that are in common between these implementations. There are three broad things that our implementations of determinization have in common, as defined by this module: * The classification of start states. That is, whether we're dealing with word boundaries, line boundaries, etc., is all the same. This also includes the look-behind assertions that are satisfied by each starting state classification. 
* The representation of DFA states as sets of NFA states, including convenience types for building these DFA states that are amenable to reusing allocations. * Routines for the "classical" parts of determinization: computing the epsilon closure, tracking match states (with corresponding pattern IDs, since we support multi-pattern finite automata) and, of course, computing the transition function between states for units of input. I did consider a couple of alternatives to this particular form of code reuse: 1. Don't do any code reuse. The problem here is that we *really* want both forms of determinization to do exactly identical things when it comes to their handling of NFA states. While our tests generally ensure this, the code is tricky and large enough where not reusing code is a pretty big bummer. 2. Implement all of determinization once and make it generic over fully compiled DFAs and lazily compiled DFAs. While I didn't actually try this approach, my instinct is that it would be more complex than is needed here. And the interface required would be pretty hairy. Instead, I think splitting it into logical sub-components works better. */ use alloc::vec::Vec; pub(crate) use self::state::{ State, StateBuilderEmpty, StateBuilderMatches, StateBuilderNFA, }; use crate::{ nfa::thompson, util::{ alphabet, look::{Look, LookSet}, primitives::StateID, search::MatchKind, sparse_set::{SparseSet, SparseSets}, start::Start, utf8, }, }; mod state; /// Compute the set of all reachable NFA states, including the full epsilon /// closure, from a DFA state for a single unit of input. The set of reachable /// states is returned as a `StateBuilderNFA`. The `StateBuilderNFA` returned /// also includes any look-behind assertions satisfied by `unit`, in addition /// to whether it is a match state. For multi-pattern DFAs, the builder will /// also include the pattern IDs that match (in the order seen). 
/// /// `nfa` must be able to resolve any NFA state in `state` and any NFA state /// reachable via the epsilon closure of any NFA state in `state`. `sparses` /// must have capacity equivalent to `nfa.len()`. /// /// `match_kind` should correspond to the match semantics implemented by the /// DFA being built. Generally speaking, for leftmost-first match semantics, /// states that appear after the first NFA match state will not be included in /// the `StateBuilderNFA` returned since they are impossible to visit. /// /// `sparses` is used as scratch space for NFA traversal. Other than their /// capacity requirements (detailed above), there are no requirements on what's /// contained within them (if anything). Similarly, what's inside of them once /// this routine returns is unspecified. /// /// `stack` must have length 0. It is used as scratch space for depth first /// traversal. After returning, it is guaranteed that `stack` will have length /// 0. /// /// `state` corresponds to the current DFA state on which one wants to compute /// the transition for the input `unit`. /// /// `empty_builder` corresponds to the builder allocation to use to produce a /// complete `StateBuilderNFA` state. If the state is not needed (or is already /// cached), then it can be cleared and reused without needing to create a new /// `State`. The `StateBuilderNFA` state returned is final and ready to be /// turned into a `State` if necessary. pub(crate) fn next( nfa: &thompson::NFA, match_kind: MatchKind, sparses: &mut SparseSets, stack: &mut Vec, state: &State, unit: alphabet::Unit, empty_builder: StateBuilderEmpty, ) -> StateBuilderNFA { sparses.clear(); // Whether the NFA is matched in reverse or not. We use this in some // conditional logic for dealing with the exceptionally annoying CRLF-aware // line anchors. let rev = nfa.is_reverse(); // The look-around matcher that our NFA is configured with. 
We don't // actually use it to match look-around assertions, but we do need its // configuration for constructing states consistent with how it matches. let lookm = nfa.look_matcher(); // Put the NFA state IDs into a sparse set in case we need to // re-compute their epsilon closure. // // Doing this state shuffling is technically not necessary unless some // kind of look-around is used in the DFA. Some ad hoc experiments // suggested that avoiding this didn't lead to much of an improvement, // but perhaps more rigorous experimentation should be done. And in // particular, avoiding this check requires some light refactoring of // the code below. state.iter_nfa_state_ids(|nfa_id| { sparses.set1.insert(nfa_id); }); // Compute look-ahead assertions originating from the current state. Based // on the input unit we're transitioning over, some additional set of // assertions may be true. Thus, we re-compute this state's epsilon closure // (but only if necessary). Notably, when we build a DFA state initially, // we don't enable any look-ahead assertions because we don't know whether // they're true or not at that point. if !state.look_need().is_empty() { // Add look-ahead assertions that are now true based on the current // input unit. 
let mut look_have = state.look_have().clone(); match unit.as_u8() { Some(b'\r') => { if !rev || !state.is_half_crlf() { look_have = look_have.insert(Look::EndCRLF); } } Some(b'\n') => { if rev || !state.is_half_crlf() { look_have = look_have.insert(Look::EndCRLF); } } Some(_) => {} None => { look_have = look_have .insert(Look::End) .insert(Look::EndLF) .insert(Look::EndCRLF); } } if unit.is_byte(lookm.get_line_terminator()) { look_have = look_have.insert(Look::EndLF); } if state.is_half_crlf() && ((rev && !unit.is_byte(b'\r')) || (!rev && !unit.is_byte(b'\n'))) { look_have = look_have.insert(Look::StartCRLF); } if state.is_from_word() == unit.is_word_byte() { look_have = look_have .insert(Look::WordAsciiNegate) .insert(Look::WordUnicodeNegate); } else { look_have = look_have.insert(Look::WordAscii).insert(Look::WordUnicode); } if !unit.is_word_byte() { look_have = look_have .insert(Look::WordEndHalfAscii) .insert(Look::WordEndHalfUnicode); } if state.is_from_word() && !unit.is_word_byte() { look_have = look_have .insert(Look::WordEndAscii) .insert(Look::WordEndUnicode); } else if !state.is_from_word() && unit.is_word_byte() { look_have = look_have .insert(Look::WordStartAscii) .insert(Look::WordStartUnicode); } // If we have new assertions satisfied that are among the set of // assertions that exist in this state (that is, just because we added // an EndLF assertion above doesn't mean there is an EndLF conditional // epsilon transition in this state), then we re-compute this state's // epsilon closure using the updated set of assertions. // // Note that since our DFA states omit unconditional epsilon // transitions, this check is necessary for correctness. If we re-did // the epsilon closure below needlessly, it could change based on the // fact that we omitted epsilon states originally. 
if !look_have .subtract(state.look_have()) .intersect(state.look_need()) .is_empty() { for nfa_id in sparses.set1.iter() { epsilon_closure( nfa, nfa_id, look_have, stack, &mut sparses.set2, ); } sparses.swap(); sparses.set2.clear(); } } // Convert our empty builder into one that can record assertions and match // pattern IDs. let mut builder = empty_builder.into_matches(); // Set whether the StartLF look-behind assertion is true for this // transition or not. The look-behind assertion for ASCII word boundaries // is handled below. if nfa.look_set_any().contains_anchor_line() && unit.is_byte(lookm.get_line_terminator()) { // Why only handle StartLF here and not Start? That's because Start // can only impact the starting state, which is special cased in // start state handling. builder.set_look_have(|have| have.insert(Look::StartLF)); } // We also need to add StartCRLF to our assertions too, if we can. This // is unfortunately a bit more complicated, because it depends on the // direction of the search. In the forward direction, ^ matches after a // \n, but in the reverse direction, ^ only matches after a \r. (This is // further complicated by the fact that reverse a regex means changing a ^ // to a $ and vice versa.) if nfa.look_set_any().contains_anchor_crlf() && ((rev && unit.is_byte(b'\r')) || (!rev && unit.is_byte(b'\n'))) { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } // And also for the start-half word boundary assertions. As long as the // look-behind byte is not a word char, then the assertions are satisfied. if nfa.look_set_any().contains_word() && !unit.is_word_byte() { builder.set_look_have(|have| { have.insert(Look::WordStartHalfAscii) .insert(Look::WordStartHalfUnicode) }); } for nfa_id in sparses.set1.iter() { match *nfa.state(nfa_id) { thompson::State::Union { .. } | thompson::State::BinaryUnion { .. } | thompson::State::Fail | thompson::State::Look { .. } | thompson::State::Capture { .. 
} => {} thompson::State::Match { pattern_id } => { // Notice here that we are calling the NEW state a match // state if the OLD state we are transitioning from // contains an NFA match state. This is precisely how we // delay all matches by one byte and also what therefore // guarantees that starting states cannot be match states. // // If we didn't delay matches by one byte, then whether // a DFA is a matching state or not would be determined // by whether one of its own constituent NFA states // was a match state. (And that would be done in // 'add_nfa_states'.) // // Also, 'add_match_pattern_id' requires that callers never // pass duplicative pattern IDs. We do in fact uphold that // guarantee here, but it's subtle. In particular, a Thompson // NFA guarantees that each pattern has exactly one match // state. Moreover, since we're iterating over the NFA state // IDs in a set, we are guarateed not to have any duplicative // match states. Thus, it is impossible to add the same pattern // ID more than once. // // N.B. We delay matches by 1 byte as a way to hack 1-byte // look-around into DFA searches. This lets us support ^, $ // and ASCII-only \b. The delay is also why we need a special // "end-of-input" (EOI) sentinel and why we need to follow the // EOI sentinel at the end of every search. This final EOI // transition is necessary to report matches found at the end // of a haystack. 
builder.add_match_pattern_id(pattern_id); if !match_kind.continue_past_first_match() { break; } } thompson::State::ByteRange { ref trans } => { if trans.matches_unit(unit) { epsilon_closure( nfa, trans.next, builder.look_have(), stack, &mut sparses.set2, ); } } thompson::State::Sparse(ref sparse) => { if let Some(next) = sparse.matches_unit(unit) { epsilon_closure( nfa, next, builder.look_have(), stack, &mut sparses.set2, ); } } thompson::State::Dense(ref dense) => { if let Some(next) = dense.matches_unit(unit) { epsilon_closure( nfa, next, builder.look_have(), stack, &mut sparses.set2, ); } } } } // We only set the word byte if there's a word boundary look-around // anywhere in this regex. Otherwise, there's no point in bloating the // number of states if we don't have one. // // We also only set it when the state has a non-zero number of NFA states. // Otherwise, we could wind up with states that *should* be DEAD states // but are otherwise distinct from DEAD states because of this look-behind // assertion being set. While this can't technically impact correctness *in // theory*, it can create pathological DFAs that consume input until EOI or // a quit byte is seen. Consuming until EOI isn't a correctness problem, // but a (serious) perf problem. Hitting a quit byte, however, could be a // correctness problem since it could cause search routines to report an // error instead of a detected match once the quit state is entered. (The // search routine could be made to be a bit smarter by reporting a match // if one was detected once it enters a quit state (and indeed, the search // routines in this crate do just that), but it seems better to prevent // these things by construction if possible.) 
if !sparses.set2.is_empty() { if nfa.look_set_any().contains_word() && unit.is_word_byte() { builder.set_is_from_word(); } if nfa.look_set_any().contains_anchor_crlf() && ((rev && unit.is_byte(b'\n')) || (!rev && unit.is_byte(b'\r'))) { builder.set_is_half_crlf(); } } let mut builder_nfa = builder.into_nfa(); add_nfa_states(nfa, &sparses.set2, &mut builder_nfa); builder_nfa } /// Compute the epsilon closure for the given NFA state. The epsilon closure /// consists of all NFA state IDs, including `start_nfa_id`, that can be /// reached from `start_nfa_id` without consuming any input. These state IDs /// are written to `set` in the order they are visited, but only if they are /// not already in `set`. `start_nfa_id` must be a valid state ID for the NFA /// given. /// /// `look_have` consists of the satisfied assertions at the current /// position. For conditional look-around epsilon transitions, these are /// only followed if they are satisfied by `look_have`. /// /// `stack` must have length 0. It is used as scratch space for depth first /// traversal. After returning, it is guaranteed that `stack` will have length /// 0. pub(crate) fn epsilon_closure( nfa: &thompson::NFA, start_nfa_id: StateID, look_have: LookSet, stack: &mut Vec, set: &mut SparseSet, ) { assert!(stack.is_empty()); // If this isn't an epsilon state, then the epsilon closure is always just // itself, so there's no need to spin up the machinery below to handle it. if !nfa.state(start_nfa_id).is_epsilon() { set.insert(start_nfa_id); return; } stack.push(start_nfa_id); while let Some(mut id) = stack.pop() { // In many cases, we can avoid stack operations when an NFA state only // adds one new state to visit. In that case, we just set our ID to // that state and mush on. We only use the stack when an NFA state // introduces multiple new states to visit. loop { // Insert this NFA state, and if it's already in the set and thus // already visited, then we can move on to the next one. 
if !set.insert(id) { break; } match *nfa.state(id) { thompson::State::ByteRange { .. } | thompson::State::Sparse { .. } | thompson::State::Dense { .. } | thompson::State::Fail | thompson::State::Match { .. } => break, thompson::State::Look { look, next } => { if !look_have.contains(look) { break; } id = next; } thompson::State::Union { ref alternates } => { id = match alternates.get(0) { None => break, Some(&id) => id, }; // We need to process our alternates in order to preserve // match preferences, so put the earliest alternates closer // to the top of the stack. stack.extend(alternates[1..].iter().rev()); } thompson::State::BinaryUnion { alt1, alt2 } => { id = alt1; stack.push(alt2); } thompson::State::Capture { next, .. } => { id = next; } } } } } /// Add the NFA state IDs in the given `set` to the given DFA builder state. /// The order in which states are added corresponds to the order in which they /// were added to `set`. /// /// The DFA builder state given should already have its complete set of match /// pattern IDs added (if any) and any look-behind assertions (StartLF, Start /// and whether this state is being generated for a transition over a word byte /// when applicable) that are true immediately prior to transitioning into this /// state (via `builder.look_have()`). The match pattern IDs should correspond /// to matches that occurred on the previous transition, since all matches are /// delayed by one byte. The things that should _not_ be set are look-ahead /// assertions (EndLF, End and whether the next byte is a word byte or not). /// The builder state should also not have anything in `look_need` set, as this /// routine will compute that for you. /// /// The given NFA should be able to resolve all identifiers in `set` to a /// particular NFA state. Additionally, `set` must have capacity equivalent /// to `nfa.len()`. 
pub(crate) fn add_nfa_states( nfa: &thompson::NFA, set: &SparseSet, builder: &mut StateBuilderNFA, ) { for nfa_id in set.iter() { match *nfa.state(nfa_id) { thompson::State::ByteRange { .. } => { builder.add_nfa_state_id(nfa_id); } thompson::State::Sparse { .. } => { builder.add_nfa_state_id(nfa_id); } thompson::State::Dense { .. } => { builder.add_nfa_state_id(nfa_id); } thompson::State::Look { look, .. } => { builder.add_nfa_state_id(nfa_id); builder.set_look_need(|need| need.insert(look)); } thompson::State::Union { .. } | thompson::State::BinaryUnion { .. } => { // Pure epsilon transitions don't need to be tracked as part // of the DFA state. Tracking them is actually superfluous; // they won't cause any harm other than making determinization // slower. // // Why aren't these needed? Well, in an NFA, epsilon // transitions are really just jumping points to other states. // So once you hit an epsilon transition, the same set of // resulting states always appears. Therefore, putting them in // a DFA's set of ordered NFA states is strictly redundant. // // Look-around states are also epsilon transitions, but // they are *conditional*. So their presence could be // discriminatory, and thus, they are tracked above. // // But wait... why are epsilon states in our `set` in the first // place? Why not just leave them out? They're in our `set` // because it was generated by computing an epsilon closure, // and we want to keep track of all states we visited to avoid // re-visiting them. In exchange, we have to do this second // iteration over our collected states to finalize our DFA // state. In theory, we could avoid this second iteration if // we maintained two sets during epsilon closure: the set of // visited states (to avoid cycles) and the set of states that // will actually be used to construct the next DFA state. // // Note that this optimization requires that we re-compute the // epsilon closure to account for look-ahead in 'next' *only // when necessary*. 
Namely, only when the set of look-around // assertions changes and only when those changes are within // the set of assertions that are needed in order to step // through the closure correctly. Otherwise, if we re-do the // epsilon closure needlessly, it could change based on the // fact that we are omitting epsilon states here. // // ----- // // Welp, scratch the above. It turns out that recording these // is in fact necessary to seemingly handle one particularly // annoying case: when a conditional epsilon transition is // put inside of a repetition operator. One specific case I // ran into was the regex `(?:\b|%)+` on the haystack `z%`. // The correct leftmost first matches are: [0, 0] and [1, 1]. // But the DFA was reporting [0, 0] and [1, 2]. To understand // why this happens, consider the NFA for the aforementioned // regex: // // >000000: binary-union(4, 1) // 000001: \x00-\xFF => 0 // 000002: WordAscii => 5 // 000003: % => 5 // ^000004: binary-union(2, 3) // 000005: binary-union(4, 6) // 000006: MATCH(0) // // The problem here is that one of the DFA start states is // going to consist of the NFA states [2, 3] by computing the // epsilon closure of state 4. State 4 isn't included because // we previously were not keeping track of union states. But // only a subset of transitions out of this state will be able // to follow WordAscii, and in those cases, the epsilon closure // is redone. The only problem is that computing the epsilon // closure from [2, 3] is different than computing the epsilon // closure from [4]. In the former case, assuming the WordAscii // assertion is satisfied, you get: [2, 3, 6]. In the latter // case, you get: [2, 6, 3]. Notice that '6' is the match state // and appears AFTER '3' in the former case. This leads to a // preferential but incorrect match of '%' before returning // a match. In the latter case, the match is preferred over // continuing to accept the '%'. 
// // It almost feels like we might be able to fix the NFA states // to avoid this, or to at least only keep track of union // states where this actually matters, since in the vast // majority of cases, this doesn't matter. // // Another alternative would be to define a new HIR property // called "assertion is repeated anywhere" and compute it // inductively over the entire pattern. If it happens anywhere, // which is probably pretty rare, then we record union states. // Otherwise we don't. builder.add_nfa_state_id(nfa_id); } // Capture states we definitely do not need to record, since they // are unconditional epsilon transitions with no branching. thompson::State::Capture { .. } => {} // It's not totally clear whether we need to record fail states or // not, but we do so out of an abundance of caution. Since they are // quite rare in practice, there isn't much cost to recording them. thompson::State::Fail => { builder.add_nfa_state_id(nfa_id); } thompson::State::Match { .. } => { // Normally, the NFA match state doesn't actually need to // be inside the DFA state. But since we delay matches by // one byte, the matching DFA state corresponds to states // that transition from the one we're building here. And // the way we detect those cases is by looking for an NFA // match state. See 'next' for how this is handled. builder.add_nfa_state_id(nfa_id); } } } // If we know this state contains no look-around assertions, then // there's no reason to track which look-around assertions were // satisfied when this state was created. if builder.look_need().is_empty() { builder.set_look_have(|_| LookSet::empty()); } } /// Sets the appropriate look-behind assertions on the given state based on /// this starting configuration. 
pub(crate) fn set_lookbehind_from_start( nfa: &thompson::NFA, start: &Start, builder: &mut StateBuilderMatches, ) { let rev = nfa.is_reverse(); let lineterm = nfa.look_matcher().get_line_terminator(); let lookset = nfa.look_set_any(); match *start { Start::NonWordByte => { if lookset.contains_word() { builder.set_look_have(|have| { have.insert(Look::WordStartHalfAscii) .insert(Look::WordStartHalfUnicode) }); } } Start::WordByte => { if lookset.contains_word() { builder.set_is_from_word(); } } Start::Text => { if lookset.contains_anchor_haystack() { builder.set_look_have(|have| have.insert(Look::Start)); } if lookset.contains_anchor_line() { builder.set_look_have(|have| { have.insert(Look::StartLF).insert(Look::StartCRLF) }); } if lookset.contains_word() { builder.set_look_have(|have| { have.insert(Look::WordStartHalfAscii) .insert(Look::WordStartHalfUnicode) }); } } Start::LineLF => { if rev { if lookset.contains_anchor_crlf() { builder.set_is_half_crlf(); } if lookset.contains_anchor_line() { builder.set_look_have(|have| have.insert(Look::StartLF)); } } else { if lookset.contains_anchor_line() { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } } if lookset.contains_anchor_line() && lineterm == b'\n' { builder.set_look_have(|have| have.insert(Look::StartLF)); } if lookset.contains_word() { builder.set_look_have(|have| { have.insert(Look::WordStartHalfAscii) .insert(Look::WordStartHalfUnicode) }); } } Start::LineCR => { if lookset.contains_anchor_crlf() { if rev { builder.set_look_have(|have| have.insert(Look::StartCRLF)); } else { builder.set_is_half_crlf(); } } if lookset.contains_anchor_line() && lineterm == b'\r' { builder.set_look_have(|have| have.insert(Look::StartLF)); } if lookset.contains_word() { builder.set_look_have(|have| { have.insert(Look::WordStartHalfAscii) .insert(Look::WordStartHalfUnicode) }); } } Start::CustomLineTerminator => { if lookset.contains_anchor_line() { builder.set_look_have(|have| have.insert(Look::StartLF)); } // This 
is a bit of a tricky case, but if the line terminator was // set to a word byte, then we also need to behave as if the start // configuration is Start::WordByte. That is, we need to mark our // state as having come from a word byte. if lookset.contains_word() { if utf8::is_word_byte(lineterm) { builder.set_is_from_word(); } else { builder.set_look_have(|have| { have.insert(Look::WordStartHalfAscii) .insert(Look::WordStartHalfUnicode) }); } } } } } regex-automata-0.4.9/src/util/determinize/state.rs000064400000000000000000001042041046102023000203330ustar 00000000000000/*! This module defines a DFA state representation and builders for constructing DFA states. This representation is specifically for use in implementations of NFA-to-DFA conversion via powerset construction. (Also called "determinization" in this crate.) The term "DFA state" is somewhat overloaded in this crate. In some cases, it refers to the set of transitions over an alphabet for a particular state. In other cases, it refers to a set of NFA states. The former is really about the final representation of a state in a DFA's transition table, where as the latter---what this module is focused on---is closer to an intermediate form that is used to help eventually build the transition table. This module exports four types. All four types represent the same idea: an ordered set of NFA states. This ordered set represents the epsilon closure of a particular NFA state, where the "epsilon closure" is the set of NFA states that can be transitioned to without consuming any input. i.e., Follow all of the NFA state's epsilon transitions. In addition, this implementation of DFA states cares about two other things: the ordered set of pattern IDs corresponding to the patterns that match if the state is a match state, and the set of look-behind assertions that were true when the state was created. The first, `State`, is a frozen representation of a state that cannot be modified. 
It may be cheaply cloned without copying the state itself and can be accessed safely from multiple threads simultaneously. This type is useful for when one knows that the DFA state being constructed is distinct from any other previously constructed states. Namely, powerset construction, in practice, requires one to keep a cache of previously created DFA states. Otherwise, the number of DFA states created in memory balloons to an impractically large number. For this reason, equivalent states should endeavor to have an equivalent byte-level representation. (In general, "equivalency" here means, "equivalent assertions, pattern IDs and NFA state IDs." We do not require that full DFA minimization be implemented here. This form of equivalency is only surface deep and is more-or-less a practical necessity.) The other three types represent different phases in the construction of a DFA state. Internally, these three types (and `State`) all use the same byte-oriented representation. That means one can use any of the builder types to check whether the state it represents already exists or not. If it does, then there is no need to freeze it into a `State` (which requires an alloc and a copy). Here are the three types described succinctly: * `StateBuilderEmpty` represents a state with no pattern IDs, no assertions and no NFA states. Creating a `StateBuilderEmpty` performs no allocs. A `StateBuilderEmpty` can only be used to query its underlying memory capacity, or to convert into a builder for recording pattern IDs and/or assertions. * `StateBuilderMatches` represents a state with zero or more pattern IDs, zero or more satisfied assertions and zero NFA state IDs. A `StateBuilderMatches` can only be used for adding pattern IDs and recording assertions. * `StateBuilderNFA` represents a state with zero or more pattern IDs, zero or more satisfied assertions and zero or more NFA state IDs. A `StateBuilderNFA` can only be used for adding NFA state IDs and recording some assertions. 
The expected flow here is to use the above builders to construct a candidate
DFA state to check if it already exists. If it does, then there's no need to
freeze it into a `State`. If it doesn't exist, then `StateBuilderNFA::to_state`
can be called to freeze the builder into an immutable `State`. In either
case, `clear` should be called on the builder to turn it back into a
`StateBuilderEmpty` that reuses the underlying memory.

The main purpose for splitting the builder into these distinct types is to
make it impossible to do things like adding a pattern ID after adding an NFA
state ID. Namely, this makes it simpler to use a space-and-time efficient
binary representation for the state. (The format is documented on the `Repr`
type below.) If we just used one type for everything, it would be possible for
callers to use an incorrect interleaving of calls and thus result in a corrupt
representation. I chose to use more type machinery to make this impossible to
do because 1) determinization is itself pretty complex and it wouldn't be too
hard to foul this up and 2) there isn't too much machinery involved and it's
well contained.

As an optimization, sometimes states won't have certain things set. For
example, if the underlying NFA has no word boundary assertions, then there is
no reason to set a state's look-behind assertion as to whether it was generated
from a word byte or not. Similarly, if a state has no NFA states corresponding
to look-around assertions, then there is no reason to set `look_have` to a
non-empty set. In addition, callers may omit unconditional epsilon transitions
that do not branch (capture states, for example) when adding NFA state IDs,
since they aren't discriminatory.

Finally, the binary representation used by these states is, thankfully, not
serialized anywhere. So any kind of change can be made with reckless abandon,
as long as everything in this module agrees.
*/

use core::mem;

use alloc::{sync::Arc, vec::Vec};

use crate::util::{
    int::{I32, U32},
    look::LookSet,
    primitives::{PatternID, StateID},
    wire::{self, Endian},
};

/// An immutable DFA state whose identity is an ordered set of NFA states.
///
/// This type is intended to be used only in NFA-to-DFA conversion via powerset
/// construction.
///
/// The bytes live behind an `Arc`, so a `State` may be cheaply cloned and
/// accessed safely from multiple threads simultaneously.
#[derive(Clone, Eq, Hash, PartialEq, PartialOrd, Ord)]
pub(crate) struct State(Arc<[u8]>);

/// This Borrow impl permits us to lookup any state in a map by its byte
/// representation. This is particularly convenient when one has a StateBuilder
/// and we want to see if a correspondingly equivalent state already exists. If
/// one does exist, then we can reuse the allocation required by StateBuilder
/// without having to convert it into a State first.
impl core::borrow::Borrow<[u8]> for State {
    fn borrow(&self) -> &[u8] {
        &self.0[..]
    }
}

impl core::fmt::Debug for State {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        // Print the decoded view instead of the raw bytes.
        let decoded = self.repr();
        let mut tuple = f.debug_tuple("State");
        tuple.field(&decoded);
        tuple.finish()
    }
}

/// For docs on these routines, see the internal Repr and ReprVec types below.
impl State { pub(crate) fn dead() -> State { StateBuilderEmpty::new().into_matches().into_nfa().to_state() } pub(crate) fn is_match(&self) -> bool { self.repr().is_match() } pub(crate) fn is_from_word(&self) -> bool { self.repr().is_from_word() } pub(crate) fn is_half_crlf(&self) -> bool { self.repr().is_half_crlf() } pub(crate) fn look_have(&self) -> LookSet { self.repr().look_have() } pub(crate) fn look_need(&self) -> LookSet { self.repr().look_need() } pub(crate) fn match_len(&self) -> usize { self.repr().match_len() } pub(crate) fn match_pattern(&self, index: usize) -> PatternID { self.repr().match_pattern(index) } pub(crate) fn match_pattern_ids(&self) -> Option> { self.repr().match_pattern_ids() } #[cfg(all(test, not(miri)))] pub(crate) fn iter_match_pattern_ids(&self, f: F) { self.repr().iter_match_pattern_ids(f) } pub(crate) fn iter_nfa_state_ids(&self, f: F) { self.repr().iter_nfa_state_ids(f) } pub(crate) fn memory_usage(&self) -> usize { self.0.len() } fn repr(&self) -> Repr<'_> { Repr(&*self.0) } } /// A state builder that represents an empty state. /// /// This is a useful "initial condition" for state construction. It has no /// NFA state IDs, no assertions set and no pattern IDs. No allocations are /// made when new() is called. Its main use is for being converted into a /// builder that can capture assertions and pattern IDs. #[derive(Clone, Debug)] pub(crate) struct StateBuilderEmpty(Vec); /// For docs on these routines, see the internal Repr and ReprVec types below. impl StateBuilderEmpty { pub(crate) fn new() -> StateBuilderEmpty { StateBuilderEmpty(alloc::vec![]) } pub(crate) fn into_matches(mut self) -> StateBuilderMatches { self.0.extend_from_slice(&[0, 0, 0, 0, 0, 0, 0, 0, 0]); StateBuilderMatches(self.0) } fn clear(&mut self) { self.0.clear(); } pub(crate) fn capacity(&self) -> usize { self.0.capacity() } } /// A state builder that collects assertions and pattern IDs. 
///
/// When collecting pattern IDs is finished, this can be converted into a
/// builder that collects NFA state IDs.
#[derive(Clone)]
pub(crate) struct StateBuilderMatches(Vec<u8>);

impl core::fmt::Debug for StateBuilderMatches {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_tuple("StateBuilderMatches").field(&self.repr()).finish()
    }
}

/// For docs on these routines, see the internal Repr and ReprVec types below.
impl StateBuilderMatches {
    /// Finalizes the pattern ID section and moves on to the phase in which
    /// NFA state IDs are collected.
    pub(crate) fn into_nfa(mut self) -> StateBuilderNFA {
        self.repr_vec().close_match_pattern_ids();
        StateBuilderNFA { repr: self.0, prev_nfa_state_id: StateID::ZERO }
    }

    pub(crate) fn set_is_from_word(&mut self) {
        self.repr_vec().set_is_from_word()
    }

    pub(crate) fn set_is_half_crlf(&mut self) {
        self.repr_vec().set_is_half_crlf()
    }

    pub(crate) fn look_have(&self) -> LookSet {
        // Decode through the shared read-only view, like every other
        // accessor in this module.
        self.repr().look_have()
    }

    pub(crate) fn set_look_have(
        &mut self,
        set: impl FnMut(LookSet) -> LookSet,
    ) {
        self.repr_vec().set_look_have(set)
    }

    pub(crate) fn add_match_pattern_id(&mut self, pid: PatternID) {
        self.repr_vec().add_match_pattern_id(pid)
    }

    /// Returns a read-only decoding view over the bytes written so far.
    fn repr(&self) -> Repr<'_> {
        Repr(&self.0)
    }

    /// Returns a writable view over the underlying buffer.
    fn repr_vec(&mut self) -> ReprVec<'_> {
        ReprVec(&mut self.0)
    }
}

/// A state builder that collects some assertions and NFA state IDs.
///
/// When collecting NFA state IDs is finished, this can be used to build a
/// `State` if necessary.
///
/// When done with building a state (regardless of whether it got kept or not),
/// it's usually a good idea to call `clear` to get an empty builder back so
/// that it can be reused to build the next state.
#[derive(Clone)]
pub(crate) struct StateBuilderNFA {
    // The bytes of the state being built.
    repr: Vec<u8>,
    // The most recently added NFA state ID. Each new ID is written as a
    // delta against this one.
    prev_nfa_state_id: StateID,
}

impl core::fmt::Debug for StateBuilderNFA {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        f.debug_tuple("StateBuilderNFA").field(&self.repr()).finish()
    }
}

/// For docs on these routines, see the internal Repr and ReprVec types below.
impl StateBuilderNFA {
    /// Freezes the bytes written so far into an immutable `State`. The
    /// builder itself is left untouched.
    pub(crate) fn to_state(&self) -> State {
        State(Arc::from(self.repr.as_slice()))
    }

    /// Consumes this builder and returns an empty one that reuses the same
    /// buffer.
    pub(crate) fn clear(self) -> StateBuilderEmpty {
        let mut bytes = self.repr;
        bytes.clear();
        StateBuilderEmpty(bytes)
    }

    pub(crate) fn look_need(&self) -> LookSet {
        self.repr().look_need()
    }

    pub(crate) fn set_look_have(
        &mut self,
        set: impl FnMut(LookSet) -> LookSet,
    ) {
        self.repr_vec().set_look_have(set)
    }

    pub(crate) fn set_look_need(
        &mut self,
        set: impl FnMut(LookSet) -> LookSet,
    ) {
        self.repr_vec().set_look_need(set)
    }

    pub(crate) fn add_nfa_state_id(&mut self, sid: StateID) {
        // Borrow both fields separately: the writer needs the buffer and the
        // previously written ID at the same time.
        let StateBuilderNFA { repr, prev_nfa_state_id } = self;
        ReprVec(repr).add_nfa_state_id(prev_nfa_state_id, sid)
    }

    /// Returns the raw bytes written so far.
    pub(crate) fn as_bytes(&self) -> &[u8] {
        self.repr.as_slice()
    }

    /// Returns a read-only decoding view over the bytes written so far.
    fn repr(&self) -> Repr<'_> {
        Repr(&self.repr)
    }

    /// Returns a writable view over the underlying buffer.
    fn repr_vec(&mut self) -> ReprVec<'_> {
        ReprVec(&mut self.repr)
    }
}

/// Repr is a read-only view into the representation of a DFA state.
///
/// Primarily, a Repr is how we achieve DRY: we implement decoding the format
/// in one place, and then use a Repr to implement the various methods on the
/// public state types.
///
/// The format is as follows:
///
/// The first nine bytes correspond to three bitsets.
///
/// Byte 0 is a bitset corresponding to miscellaneous flags associated with the
/// state. Bit 0 is set to 1 if the state is a match state. Bit 1 is set to 1
/// if the state has pattern IDs explicitly written to it. (This is a flag that
/// is not meant to be set by determinization, but rather, is used as part of
/// an internal space-saving optimization.) Bit 2 is set to 1 if the state was
/// generated by a transition over a "word" byte. (Callers may not always set
/// this. For example, if the NFA has no word boundary assertion, then needing
/// to track whether a state came from a word byte or not is superfluous and
/// wasteful.) Bit 3 is set to 1 if the state was generated by a transition
/// from a `\r` (forward search) or a `\n` (reverse search) when CRLF mode is
/// enabled.
///
/// Bytes 1..5 correspond to the look-behind assertions that were satisfied
/// by the transition that created this state. (Look-ahead assertions are not
/// tracked as part of states. Instead, these are applied by re-computing the
/// epsilon closure of a state when computing the transition function. See
/// `next` in the parent module.)
///
/// Bytes 5..9 correspond to the set of look-around assertions (including both
/// look-behind and look-ahead) that appear somewhere in this state's set of
/// NFA state IDs. This is used to determine whether this state's epsilon
/// closure should be re-computed when computing the transition function.
/// Namely, look-around assertions are "just" conditional epsilon transitions,
/// so if there are new assertions available when computing the transition
/// function, we should only re-compute the epsilon closure if those new
/// assertions are relevant to this particular state.
///
/// Bytes 9..13 correspond to a 32-bit native-endian encoded integer
/// corresponding to the number of patterns encoded in this state. If the state
/// is not a match state (byte 0 bit 0 is 0) or if its only pattern ID is
/// PatternID::ZERO, then no integer is encoded at this position. Instead, byte
/// offset 9 is the position at which the first NFA state ID is encoded.
///
/// For a match state with at least one non-ZERO pattern ID, the next bytes
/// correspond to a sequence of 32-bit native endian encoded integers that
/// represent each pattern ID, in order, that this match state represents.
///
/// After the pattern IDs (if any), NFA state IDs are delta encoded as
/// varints.[1] The first NFA state ID is encoded as itself, and each
/// subsequent NFA state ID is encoded as the difference between itself and the
/// previous NFA state ID.
///
/// [1] - https://developers.google.com/protocol-buffers/docs/encoding#varints
struct Repr<'a>(&'a [u8]);

impl<'a> Repr<'a> {
    /// Returns true if and only if this is a match state.
///
/// If callers have added pattern IDs to this state, then callers MUST set
/// this state as a match state explicitly. However, as a special case,
/// states that are marked as match states but with no pattern IDs, then
/// the state is treated as if it had a single pattern ID equivalent to
/// PatternID::ZERO.
fn is_match(&self) -> bool {
    // Bit 0 of the flag byte.
    self.0[0] & 0b0000_0001 != 0
}

/// Returns true if and only if this state has had at least one pattern
/// ID added to it.
///
/// This is an internal-only flag that permits the representation to save
/// space in the common case of an NFA with one pattern in it. In that
/// case, a match state can only ever have exactly one pattern ID:
/// PatternID::ZERO. So there's no need to represent it.
fn has_pattern_ids(&self) -> bool {
    // Bit 1 of the flag byte.
    self.0[0] & 0b0000_0010 != 0
}

/// Returns true if and only if this state is marked as having been created
/// from a transition over a word byte. This is useful for checking whether
/// a word boundary assertion is true or not, which requires look-behind
/// (whether the current state came from a word byte or not) and look-ahead
/// (whether the transition byte is a word byte or not).
///
/// Since states with this set are distinct from states that don't have
/// this set (even if they are otherwise equivalent), callers should not
/// set this assertion unless the underlying NFA has at least one word
/// boundary assertion somewhere. Otherwise, a superfluous number of states
/// may be created.
fn is_from_word(&self) -> bool {
    // Bit 2 of the flag byte.
    self.0[0] & 0b0000_0100 != 0
}

/// Returns true if and only if this state is marked as being inside of a
/// CRLF terminator. In the forward direction, this means the state was
/// created after seeing a `\r`. In the reverse direction, this means the
/// state was created after seeing a `\n`.
fn is_half_crlf(&self) -> bool {
    // Bit 3 of the flag byte.
    self.0[0] & 0b0000_1000 != 0
}

/// The set of look-behind assertions that were true in the transition that
/// created this state.
///
/// Generally, this should be empty if 'look_need' is empty, since there is
/// no reason to track which look-behind assertions are true if the state
/// has no conditional epsilon transitions.
///
/// Satisfied look-ahead assertions are not tracked in states. Instead,
/// these are re-computed on demand via epsilon closure when computing the
/// transition function.
fn look_have(&self) -> LookSet {
    let encoded = &self.0[1..];
    LookSet::read_repr(encoded)
}

/// The set of look-around (both behind and ahead) assertions that appear
/// at least once in this state's set of NFA states.
///
/// This is used to determine whether the epsilon closure needs to be
/// re-computed when computing the transition function. Namely, if the
/// state has no conditional epsilon transitions, then there is no need
/// to re-compute the epsilon closure.
fn look_need(&self) -> LookSet {
    let encoded = &self.0[5..];
    LookSet::read_repr(encoded)
}

/// Returns the total number of match pattern IDs in this state.
///
/// If this state is not a match state, then this always returns 0.
fn match_len(&self) -> usize {
    match (self.is_match(), self.has_pattern_ids()) {
        (false, _) => 0,
        // A match state without explicit pattern IDs implicitly matches
        // exactly one pattern.
        (true, false) => 1,
        (true, true) => self.encoded_pattern_len(),
    }
}

/// Returns the pattern ID for this match state at the given index.
///
/// If the given index is greater than or equal to `match_len()` for this
/// state, then this could panic or return incorrect results.
fn match_pattern(&self, index: usize) -> PatternID {
    if !self.has_pattern_ids() {
        return PatternID::ZERO;
    }
    // Explicit pattern IDs start at byte 13 and are PatternID::SIZE bytes
    // each.
    let start = 13 + index * PatternID::SIZE;
    // This is OK since we only ever serialize valid PatternIDs to
    // states.
    wire::read_pattern_id_unchecked(&self.0[start..]).0
}

/// Returns a copy of all match pattern IDs in this state. If this state
/// is not a match state, then this returns None.
fn match_pattern_ids(&self) -> Option<Vec<PatternID>> {
    if !self.is_match() {
        return None;
    }
    let mut pids = alloc::vec![];
    self.iter_match_pattern_ids(|pid| pids.push(pid));
    Some(pids)
}

/// Calls the given function on every pattern ID in this state.
fn iter_match_pattern_ids<F: FnMut(PatternID)>(&self, mut f: F) {
    if !self.is_match() {
        return;
    }
    // As an optimization for a very common case, when this is a match
    // state for an NFA with only one pattern, we don't actually write the
    // pattern ID to the state representation. Instead, we know it must
    // be there since it is the only possible choice.
    if !self.has_pattern_ids() {
        f(PatternID::ZERO);
        return;
    }
    let mut pids = &self.0[13..self.pattern_offset_end()];
    while !pids.is_empty() {
        let pid = wire::read_u32(pids);
        pids = &pids[PatternID::SIZE..];
        // This is OK since we only ever serialize valid PatternIDs to
        // states. And since pattern IDs can never exceed a usize, the
        // unwrap is OK.
        f(PatternID::new_unchecked(usize::try_from(pid).unwrap()));
    }
}

/// Calls the given function on every NFA state ID in this state.
fn iter_nfa_state_ids<F: FnMut(StateID)>(&self, mut f: F) {
    let mut sids = &self.0[self.pattern_offset_end()..];
    // IDs are delta encoded: each varint is the difference from the
    // previously decoded ID, with the first one relative to zero.
    let mut prev = 0i32;
    while !sids.is_empty() {
        let (delta, nr) = read_vari32(sids);
        sids = &sids[nr..];
        let sid = prev + delta;
        prev = sid;
        // This is OK since we only ever serialize valid StateIDs to
        // states. And since state IDs can never exceed an isize, they must
        // always be able to fit into a usize, and thus cast is OK.
        f(StateID::new_unchecked(sid.as_usize()))
    }
}

/// Returns the offset into this state's representation where the pattern
/// IDs end and the NFA state IDs begin.
fn pattern_offset_end(&self) -> usize {
    let encoded = self.encoded_pattern_len();
    if encoded == 0 {
        // No pattern count and no pattern IDs were written, so the NFA
        // state IDs directly follow the nine byte header.
        return 9;
    }
    // This arithmetic is OK since we were able to address this many bytes
    // when writing to the state, thus, it must fit into a usize.
    encoded.checked_mul(4).unwrap().checked_add(13).unwrap()
}

/// Returns the total number of *encoded* pattern IDs in this state.
///
/// This may return 0 even when this is a match state, since the pattern
/// ID `PatternID::ZERO` is not encoded when it's the only pattern ID in
/// the match state (the overwhelming common case).
fn encoded_pattern_len(&self) -> usize {
    if !self.has_pattern_ids() {
        return 0;
    }
    // This unwrap is OK since the total number of patterns is always
    // guaranteed to fit into a usize.
    usize::try_from(wire::read_u32(&self.0[9..13])).unwrap()
}
}

impl<'a> core::fmt::Debug for Repr<'a> {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let mut nfa_ids = alloc::vec![];
        self.iter_nfa_state_ids(|sid| nfa_ids.push(sid));
        f.debug_struct("Repr")
            .field("is_match", &self.is_match())
            .field("is_from_word", &self.is_from_word())
            .field("is_half_crlf", &self.is_half_crlf())
            .field("look_have", &self.look_have())
            .field("look_need", &self.look_need())
            .field("match_pattern_ids", &self.match_pattern_ids())
            .field("nfa_state_ids", &nfa_ids)
            .finish()
    }
}

/// ReprVec is a write-only view into the representation of a DFA state.
///
/// See Repr for more details on the purpose of this type and also the format.
///
/// Note that not all possible combinations of methods may be called. This is
/// precisely what the various StateBuilder types encapsulate: they only
/// permit valid combinations via Rust's linear typing.
struct ReprVec<'a>(&'a mut Vec<u8>);

impl<'a> ReprVec<'a> {
    /// Set this state as a match state.
    ///
    /// This should not be exposed explicitly outside of this module. It is
    /// set automatically when a pattern ID is added.
    fn set_is_match(&mut self) {
        self.0[0] |= 1 << 0;
    }

    /// Set that this state has pattern IDs explicitly written to it.
    ///
    /// This should not be exposed explicitly outside of this module. This is
    /// used internally as a space saving optimization.
Namely, if the state /// is a match state but does not have any pattern IDs written to it, /// then it is automatically inferred to have a pattern ID of ZERO. fn set_has_pattern_ids(&mut self) { self.0[0] |= 1 << 1; } /// Set this state as being built from a transition over a word byte. /// /// Setting this is only necessary when one needs to deal with word /// boundary assertions. Therefore, if the underlying NFA has no word /// boundary assertions, callers should not set this. fn set_is_from_word(&mut self) { self.0[0] |= 1 << 2; } /// Set this state as having seen half of a CRLF terminator. /// /// In the forward direction, this should be set when a `\r` has been seen. /// In the reverse direction, this should be set when a `\n` has been seen. fn set_is_half_crlf(&mut self) { self.0[0] |= 1 << 3; } /// The set of look-behind assertions that were true in the transition that /// created this state. fn look_have(&self) -> LookSet { self.repr().look_have() } /// The set of look-around (both behind and ahead) assertions that appear /// at least once in this state's set of NFA states. fn look_need(&self) -> LookSet { self.repr().look_need() } /// Mutate the set of look-behind assertions that were true in the /// transition that created this state. fn set_look_have(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { set(self.look_have()).write_repr(&mut self.0[1..]); } /// Mutate the set of look-around (both behind and ahead) assertions that /// appear at least once in this state's set of NFA states. fn set_look_need(&mut self, mut set: impl FnMut(LookSet) -> LookSet) { set(self.look_need()).write_repr(&mut self.0[5..]); } /// Add a pattern ID to this state. All match states must have at least /// one pattern ID associated with it. /// /// Callers must never add duplicative pattern IDs. /// /// The order in which patterns are added must correspond to the order /// in which patterns are reported as matches. 
fn add_match_pattern_id(&mut self, pid: PatternID) { // As a (somewhat small) space saving optimization, in the case where // a matching state has exactly one pattern ID, PatternID::ZERO, we do // not write either the pattern ID or the number of patterns encoded. // Instead, all we do is set the 'is_match' bit on this state. Overall, // this saves 8 bytes per match state for the overwhelming majority of // match states. // // In order to know whether pattern IDs need to be explicitly read or // not, we use another internal-only bit, 'has_pattern_ids', to // indicate whether they have been explicitly written or not. if !self.repr().has_pattern_ids() { if pid == PatternID::ZERO { self.set_is_match(); return; } // Make room for 'close_match_pattern_ids' to write the total // number of pattern IDs written. self.0.extend(core::iter::repeat(0).take(PatternID::SIZE)); self.set_has_pattern_ids(); // If this was already a match state, then the only way that's // possible when the state doesn't have pattern IDs is if // PatternID::ZERO was added by the caller previously. In this // case, we are now adding a non-ZERO pattern ID after it, in // which case, we want to make sure to represent ZERO explicitly // now. if self.repr().is_match() { write_u32(self.0, 0) } else { // Otherwise, just make sure the 'is_match' bit is set. self.set_is_match(); } } write_u32(self.0, pid.as_u32()); } /// Indicate that no more pattern IDs will be added to this state. /// /// Once this is called, callers must not call it or 'add_match_pattern_id' /// again. /// /// This should not be exposed explicitly outside of this module. It /// should be called only when converting a StateBuilderMatches into a /// StateBuilderNFA. fn close_match_pattern_ids(&mut self) { // If we never wrote any pattern IDs, then there's nothing to do here. 
if !self.repr().has_pattern_ids() { return; } let patsize = PatternID::SIZE; let pattern_bytes = self.0.len() - 13; // Every pattern ID uses 4 bytes, so number of bytes should be // divisible by 4. assert_eq!(pattern_bytes % patsize, 0); // This unwrap is OK since we are guaranteed that the maximum number // of possible patterns fits into a u32. let count32 = u32::try_from(pattern_bytes / patsize).unwrap(); wire::NE::write_u32(count32, &mut self.0[9..13]); } /// Add an NFA state ID to this state. The order in which NFA states are /// added matters. It is the caller's responsibility to ensure that /// duplicate NFA state IDs are not added. fn add_nfa_state_id(&mut self, prev: &mut StateID, sid: StateID) { let delta = sid.as_i32() - prev.as_i32(); write_vari32(self.0, delta); *prev = sid; } /// Return a read-only view of this state's representation. fn repr(&self) -> Repr<'_> { Repr(self.0.as_slice()) } } /// Write a signed 32-bit integer using zig-zag encoding. /// /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn write_vari32(data: &mut Vec, n: i32) { let mut un = n.to_bits() << 1; if n < 0 { un = !un; } write_varu32(data, un) } /// Read a signed 32-bit integer using zig-zag encoding. Also, return the /// number of bytes read. /// /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn read_vari32(data: &[u8]) -> (i32, usize) { let (un, i) = read_varu32(data); let mut n = i32::from_bits(un >> 1); if un & 1 != 0 { n = !n; } (n, i) } /// Write an unsigned 32-bit integer as a varint. In essence, `n` is written /// as a sequence of bytes where all bytes except for the last one have the /// most significant bit set. The least significant 7 bits correspond to the /// actual bits of `n`. So in the worst case, a varint uses 5 bytes, but in /// very common cases, it uses fewer than 4. 
/// /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn write_varu32(data: &mut Vec, mut n: u32) { while n >= 0b1000_0000 { data.push(n.low_u8() | 0b1000_0000); n >>= 7; } data.push(n.low_u8()); } /// Read an unsigned 32-bit varint. Also, return the number of bytes read. /// /// https://developers.google.com/protocol-buffers/docs/encoding#varints fn read_varu32(data: &[u8]) -> (u32, usize) { // N.B. We can assume correctness here since we know that all varuints are // written with write_varu32. Hence, the 'as' uses and unchecked arithmetic // is all okay. let mut n: u32 = 0; let mut shift: u32 = 0; for (i, &b) in data.iter().enumerate() { if b < 0b1000_0000 { return (n | (u32::from(b) << shift), i + 1); } n |= (u32::from(b) & 0b0111_1111) << shift; shift += 7; } (0, 0) } /// Push a native-endian encoded `n` on to `dst`. fn write_u32(dst: &mut Vec, n: u32) { use crate::util::wire::NE; let start = dst.len(); dst.extend(core::iter::repeat(0).take(mem::size_of::())); NE::write_u32(n, &mut dst[start..]); } #[cfg(test)] mod tests { use alloc::vec; use quickcheck::quickcheck; use super::*; #[cfg(not(miri))] quickcheck! { fn prop_state_read_write_nfa_state_ids(sids: Vec) -> bool { // Builders states do not permit duplicate IDs. let sids = dedup_state_ids(sids); let mut b = StateBuilderEmpty::new().into_matches().into_nfa(); for &sid in &sids { b.add_nfa_state_id(sid); } let s = b.to_state(); let mut got = vec![]; s.iter_nfa_state_ids(|sid| got.push(sid)); got == sids } fn prop_state_read_write_pattern_ids(pids: Vec) -> bool { // Builders states do not permit duplicate IDs. 
let pids = dedup_pattern_ids(pids); let mut b = StateBuilderEmpty::new().into_matches(); for &pid in &pids { b.add_match_pattern_id(pid); } let s = b.into_nfa().to_state(); let mut got = vec![]; s.iter_match_pattern_ids(|pid| got.push(pid)); got == pids } fn prop_state_read_write_nfa_state_and_pattern_ids( sids: Vec, pids: Vec ) -> bool { // Builders states do not permit duplicate IDs. let sids = dedup_state_ids(sids); let pids = dedup_pattern_ids(pids); let mut b = StateBuilderEmpty::new().into_matches(); for &pid in &pids { b.add_match_pattern_id(pid); } let mut b = b.into_nfa(); for &sid in &sids { b.add_nfa_state_id(sid); } let s = b.to_state(); let mut got_pids = vec![]; s.iter_match_pattern_ids(|pid| got_pids.push(pid)); let mut got_sids = vec![]; s.iter_nfa_state_ids(|sid| got_sids.push(sid)); got_pids == pids && got_sids == sids } } quickcheck! { fn prop_read_write_varu32(n: u32) -> bool { let mut buf = vec![]; write_varu32(&mut buf, n); let (got, nread) = read_varu32(&buf); nread == buf.len() && got == n } fn prop_read_write_vari32(n: i32) -> bool { let mut buf = vec![]; write_vari32(&mut buf, n); let (got, nread) = read_vari32(&buf); nread == buf.len() && got == n } } #[cfg(not(miri))] fn dedup_state_ids(sids: Vec) -> Vec { let mut set = alloc::collections::BTreeSet::new(); let mut deduped = vec![]; for sid in sids { if set.contains(&sid) { continue; } set.insert(sid); deduped.push(sid); } deduped } #[cfg(not(miri))] fn dedup_pattern_ids(pids: Vec) -> Vec { let mut set = alloc::collections::BTreeSet::new(); let mut deduped = vec![]; for pid in pids { if set.contains(&pid) { continue; } set.insert(pid); deduped.push(pid); } deduped } } regex-automata-0.4.9/src/util/empty.rs000064400000000000000000000323141046102023000160340ustar 00000000000000/*! This module provides helper routines for dealing with zero-width matches. The main problem being solved here is this: 1. The caller wants to search something that they know is valid UTF-8, such as a Rust `&str`. 
2. The regex used by the caller can match the empty string. For example, `a*`. 3. The caller should never get match offsets returned that occur within the encoding of a UTF-8 codepoint. It is logically incorrect, and also means that, e.g., slicing the `&str` at those offsets will lead to a panic. So the question here is, how do we prevent the caller from getting match offsets that split a codepoint? For example, strictly speaking, the regex `a*` matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that underlies all of the matching engines in this crate doesn't have anything in its state graph that prevents matching between UTF-8 code units. Indeed, any engine derived from the `NFA` will match at those positions by virtue of the fact that the `NFA` is byte oriented. That is, its transitions are defined over bytes and the matching engines work by proceeding one byte at a time. (An alternative architecture would be to define the transitions in an `NFA` over codepoints, or `char`. And then make the matching engines proceed by decoding one codepoint at a time. This is a viable strategy, but it doesn't work for DFA matching engines because designing a fast and memory efficient transition table for an alphabet as large as Unicode is quite difficult. More to the point, the top-level `regex` crate supports matching on arbitrary bytes when Unicode mode is disabled and one is searching a `&[u8]`. So in that case, you can't just limit yourself to decoding codepoints and matching those. You really do need to be able to follow byte oriented transitions on the `NFA`.) In an older version of the regex crate, we handled this case not in the regex engine, but in the iterators over matches. 
Namely, since this case only arises when the match is empty, we "just" incremented the next starting position of the search by `N`, where `N` is the length of the codepoint encoded at the current position. The alternative or more "natural" solution of just incrementing by `1` would result in executing a search of `a*` on `☃` like this: * Start search at `0`. * Found match at `[0, 0]`. * Next start position is `0`. * To avoid an infinite loop, since it's an empty match, increment by `1`. * Start search at `1`. * Found match at `[1, 1]`. Oops. But if we instead incremented by `3` (the length in bytes of `☃`), then we get the following: * Start search at `0`. * Found match at `[0, 0]`. * Next start position is `0`. * To avoid an infinite loop, since it's an empty match, increment by `3`. * Start search at `3`. * Found match at `[3, 3]`. And we get the correct result. But does this technique work in all cases? Crucially, it requires that a zero-width match that splits a codepoint never occurs beyond the starting position of the search. Because if it did, merely incrementing the start position by the number of bytes in the codepoint at the current position wouldn't be enough. A zero-width match could just occur anywhere. It turns out that it is _almost_ true. We can convince ourselves by looking at all possible patterns that can match the empty string: * Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match the empty string. That is, assuming there isn't an `a` at the current position, they will all match the empty string at the start of a search. There is no way to move past it because any other match would not be "leftmost." * `^` only matches at the beginning of the haystack, where the start position is `0`. 
Since we know we're searching valid UTF-8 (if it isn't valid UTF-8, then this entire problem goes away because it implies your string type supports invalid UTF-8 and thus must deal with offsets that not only split a codepoint but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches between the code units of a codepoint because the start of a valid UTF-8 string is never within the encoding of a codepoint. * `$` follows basically the same logic as `^`, but for the end of a string. A valid UTF-8 string can't have an incomplete codepoint at the end of it. * `(?m:^)` follows similarly to `^`, but it can match immediately following a `\n`. However, since a `\n` is always a codepoint itself and can never appear within a codepoint, it follows that the position immediately following a `\n` in a string that is valid UTF-8 is guaranteed to not be between the code units of another codepoint. (One caveat here is that the line terminator for multi-line anchors can now be changed to any arbitrary byte, including things like `\x98` which might occur within a codepoint. However, this wasn't supported by the old regex crate. If it were, it would pose the same problems as `(?-u:\B)`, as we'll discuss below.) * `(?m:$)` follows a similar argument as for `(?m:^)`. The only difference is that a `(?m:$)` matches just before a `\n`. But the same argument applies. * `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`. Namely, they only ever match at a boundary where one side is either a `\r` or a `\n`, neither of which can occur within a codepoint. * `\b` only matches at positions where both sides are valid codepoints, so this cannot split a codepoint. * `\B`, like `\b`, also only matches at positions where both sides are valid codepoints. So this cannot split a codepoint either. * `(?-u:\b)` matches only at positions where at least one side of it is an ASCII word byte.
Since ASCII bytes cannot appear as code units in non-ASCII codepoints (one of the many amazing qualities of UTF-8), it follows that this too cannot split a codepoint. * `(?-u:\B)` finally represents a problem. It can match between *any* two bytes that are either both word bytes or non-word bytes. Since code units like `\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes, `(?-u:\B)` will match at the position between them. Thus, our approach of incrementing one codepoint at a time after seeing an empty match is flawed because `(?-u:\B)` can result in an empty match that splits a codepoint at a position past the starting point of a search. For example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2, 2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because they correspond to word boundaries since `a` is an ASCII word byte. So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from regexes that could match `&str`. That might sound extreme, but a lot of other things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and `(?-u:\W)` can match invalid UTF-8 too, including individual code units within a codepoint. The key difference is that those expressions could never produce an empty match. That ban happens when translating an `Ast` to an `Hir`, because that process can reason about whether an `Hir` can produce *non-empty* matches at invalid UTF-8 boundaries. Bottom line though is that we side-stepped the `(?-u:\B)` issue by banning it. If banning `(?-u:\B)` were the only issue with the old regex crate's approach, then I probably would have kept it. `\B` is rarely used, so it's not such a big deal to have to work around it. However, the problem with the above approach is that it doesn't compose.
The logic for avoiding splitting a codepoint only lived in the iterator, which means if anyone wants to implement their own iterator over regex matches, they have to deal with this extremely subtle edge case to get full correctness. Instead, in this crate, we take the approach of pushing this complexity down to the lowest layers of each regex engine. The approach is pretty simple: * If this corner case doesn't apply, don't do anything. (For example, if UTF-8 mode isn't enabled or if the regex cannot match the empty string.) * If an empty match is reported, explicitly check if it splits a codepoint. * If it doesn't, we're done, return the match. * If it does, then ignore the match and re-run the search. * Repeat the above process until the end of the haystack is reached or a match is found that doesn't split a codepoint or isn't zero width. And that's pretty much what this module provides. Every regex engine uses these methods in their lowest level public APIs, but just above the layer where their internal engine is used. That way, all regex engines can be arbitrarily composed without worrying about handling this case, and iterators don't need to handle it explicitly. (It turns out that a new feature I added, support for changing the line terminator in a regex to any arbitrary byte, also provokes the above problem. Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that support would need to be limited or banned when UTF-8 mode is enabled, just like we did for `(?-u:\B)`. But thankfully our more robust approach in this crate handles that case just fine too.) 
*/ use crate::util::search::{Input, MatchError}; #[cold] #[inline(never)] pub(crate) fn skip_splits_fwd( input: &Input<'_>, init_value: T, match_offset: usize, find: F, ) -> Result, MatchError> where F: FnMut(&Input<'_>) -> Result, MatchError>, { skip_splits(true, input, init_value, match_offset, find) } #[cold] #[inline(never)] pub(crate) fn skip_splits_rev( input: &Input<'_>, init_value: T, match_offset: usize, find: F, ) -> Result, MatchError> where F: FnMut(&Input<'_>) -> Result, MatchError>, { skip_splits(false, input, init_value, match_offset, find) } fn skip_splits( forward: bool, input: &Input<'_>, init_value: T, mut match_offset: usize, mut find: F, ) -> Result, MatchError> where F: FnMut(&Input<'_>) -> Result, MatchError>, { // If our config says to do an anchored search, then we're definitely // done. We just need to determine whether we have a valid match or // not. If we don't, then we're not allowed to continue, so we report // no match. // // This is actually quite a subtle correctness thing. The key here is // that if we got an empty match that splits a codepoint after doing an // anchored search in UTF-8 mode, then that implies that we must have // *started* the search at a location that splits a codepoint. This // follows from the fact that if a match is reported from an anchored // search, then the start offset of the match *must* match the start // offset of the search. // // It also follows that no other non-empty match is possible. For // example, you might write a regex like '(?:)|SOMETHING' and start its // search in the middle of a codepoint. The first branch is an empty // regex that will bubble up a match at the first position, and then // get rejected here and report no match. But what if 'SOMETHING' could // have matched? We reason that such a thing is impossible, because // if it does, it must report a match that starts in the middle of a // codepoint. 
This in turn implies that a match is reported whose span // does not correspond to valid UTF-8, and this breaks the promise // made when UTF-8 mode is enabled. (That promise *can* be broken, for // example, by enabling UTF-8 mode but building an by hand NFA that // produces non-empty matches that span invalid UTF-8. This is an unchecked // but documented precondition violation of UTF-8 mode, and is documented // to have unspecified behavior.) // // I believe this actually means that if an anchored search is run, and // UTF-8 mode is enabled and the start position splits a codepoint, // then it is correct to immediately report no match without even // executing the regex engine. But it doesn't really seem worth writing // out that case in every regex engine to save a tiny bit of work in an // extremely pathological case, so we just handle it here. if input.get_anchored().is_anchored() { return Ok(if input.is_char_boundary(match_offset) { Some(init_value) } else { None }); } // Otherwise, we have an unanchored search, so just keep looking for // matches until we have one that does not split a codepoint or we hit // EOI. let mut value = init_value; let mut input = input.clone(); while !input.is_char_boundary(match_offset) { if forward { // The unwrap is OK here because overflowing usize while // iterating over a slice is impossible, at it would require // a slice of length greater than isize::MAX, which is itself // impossible. input.set_start(input.start().checked_add(1).unwrap()); } else { input.set_end(match input.end().checked_sub(1) { None => return Ok(None), Some(end) => end, }); } match find(&input)? { None => return Ok(None), Some((new_value, new_match_end)) => { value = new_value; match_offset = new_match_end; } } } Ok(Some(value)) } regex-automata-0.4.9/src/util/escape.rs000064400000000000000000000056641046102023000161460ustar 00000000000000/*! Provides convenience routines for escaping raw bytes. 
Since this crate tends to deal with `&[u8]` everywhere and the default `Debug` implementation just shows decimal integers, it makes debugging those representations quite difficult. This module provides types that show `&[u8]` as if it were a string, with invalid UTF-8 escaped into its byte-by-byte hex representation. */ use crate::util::utf8; /// Provides a convenient `Debug` implementation for a `u8`. /// /// The `Debug` impl treats the byte as an ASCII, and emits a human readable /// representation of it. If the byte isn't ASCII, then it's emitted as a hex /// escape sequence. #[derive(Clone, Copy)] pub struct DebugByte(pub u8); impl core::fmt::Debug for DebugByte { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { // Special case ASCII space. It's too hard to read otherwise, so // put quotes around it. I sometimes wonder whether just '\x20' would // be better... if self.0 == b' ' { return write!(f, "' '"); } // 10 bytes is enough to cover any output from ascii::escape_default. let mut bytes = [0u8; 10]; let mut len = 0; for (i, mut b) in core::ascii::escape_default(self.0).enumerate() { // capitalize \xab to \xAB if i >= 2 && b'a' <= b && b <= b'f' { b -= 32; } bytes[len] = b; len += 1; } write!(f, "{}", core::str::from_utf8(&bytes[..len]).unwrap()) } } /// Provides a convenient `Debug` implementation for `&[u8]`. /// /// This generally works best when the bytes are presumed to be mostly UTF-8, /// but will work for anything. For any bytes that aren't UTF-8, they are /// emitted as hex escape sequences. pub struct DebugHaystack<'a>(pub &'a [u8]); impl<'a> core::fmt::Debug for DebugHaystack<'a> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "\"")?; // This is a sad re-implementation of a similar impl found in bstr. 
let mut bytes = self.0; while let Some(result) = utf8::decode(bytes) { let ch = match result { Ok(ch) => ch, Err(byte) => { write!(f, r"\x{:02x}", byte)?; bytes = &bytes[1..]; continue; } }; bytes = &bytes[ch.len_utf8()..]; match ch { '\0' => write!(f, "\\0")?, // ASCII control characters except \0, \n, \r, \t '\x01'..='\x08' | '\x0b' | '\x0c' | '\x0e'..='\x19' | '\x7f' => { write!(f, "\\x{:02x}", u32::from(ch))?; } '\n' | '\r' | '\t' | _ => { write!(f, "{}", ch.escape_debug())?; } } } write!(f, "\"")?; Ok(()) } } regex-automata-0.4.9/src/util/int.rs000064400000000000000000000145621046102023000154750ustar 00000000000000/*! This module provides several integer oriented traits for converting between both fixed size integers and integers whose size varies based on the target (like `usize`). The driving design principle of this module is to attempt to centralize as many `as` casts as possible here. And in particular, we separate casts into two buckets: * Casts that we use for their truncating behavior. In this case, we use more descriptive names, like `low_u32` and `high_u32`. * Casts that we use for converting back-and-forth between `usize`. These conversions are generally necessary because we often store indices in different formats to save on memory, which requires converting to and from `usize`. In this case, we very specifically do not want to overflow, and so the methods defined here will panic if the `as` cast would be lossy in debug mode. (A normal `as` cast will never panic!) For `as` casts between raw pointers, we use `cast`, so `as` isn't needed there. For regex engines, floating point is just never used, so we don't have to worry about `as` casts for those. Otherwise, this module pretty much covers all of our `as` needs except for one thing: const contexts. There are a select few places in this crate where we still need to use `as` because const functions on traits aren't stable yet. 
If we wind up significantly expanding our const footprint in this crate, it might be worth defining free functions to handle those cases. But at the time of writing, that just seemed like too much ceremony. Instead, I comment each such use of `as` in a const context with a "fixme" notice. NOTE: for simplicity, we don't take target pointer width into account here for `usize` conversions. Since we currently only panic in debug mode, skipping the check when it can be proven it isn't needed at compile time doesn't really matter. Now, if we wind up wanting to do as many checks as possible in release mode, then we would want to skip those when we know the conversions are always non-lossy. NOTE: this module isn't an exhaustive API. For example, we still use things like `u64::from` where possible, or even `usize::try_from()` for when we do explicitly want to panic or when we want to return an error for overflow. */ // We define a little more than what we need, but I'd rather just have // everything via a consistent and uniform API than have holes.
#![allow(dead_code)] pub(crate) trait U8 { fn as_usize(self) -> usize; } impl U8 for u8 { fn as_usize(self) -> usize { usize::from(self) } } pub(crate) trait U16 { fn as_usize(self) -> usize; fn low_u8(self) -> u8; fn high_u8(self) -> u8; } impl U16 for u16 { fn as_usize(self) -> usize { usize::from(self) } fn low_u8(self) -> u8 { self as u8 } fn high_u8(self) -> u8 { (self >> 8) as u8 } } pub(crate) trait U32 { fn as_usize(self) -> usize; fn low_u8(self) -> u8; fn low_u16(self) -> u16; fn high_u16(self) -> u16; } impl U32 for u32 { fn as_usize(self) -> usize { #[cfg(debug_assertions)] { usize::try_from(self).expect("u32 overflowed usize") } #[cfg(not(debug_assertions))] { self as usize } } fn low_u8(self) -> u8 { self as u8 } fn low_u16(self) -> u16 { self as u16 } fn high_u16(self) -> u16 { (self >> 16) as u16 } } pub(crate) trait U64 { fn as_usize(self) -> usize; fn low_u8(self) -> u8; fn low_u16(self) -> u16; fn low_u32(self) -> u32; fn high_u32(self) -> u32; } impl U64 for u64 { fn as_usize(self) -> usize { #[cfg(debug_assertions)] { usize::try_from(self).expect("u64 overflowed usize") } #[cfg(not(debug_assertions))] { self as usize } } fn low_u8(self) -> u8 { self as u8 } fn low_u16(self) -> u16 { self as u16 } fn low_u32(self) -> u32 { self as u32 } fn high_u32(self) -> u32 { (self >> 32) as u32 } } pub(crate) trait I32 { fn as_usize(self) -> usize; fn to_bits(self) -> u32; fn from_bits(n: u32) -> i32; } impl I32 for i32 { fn as_usize(self) -> usize { #[cfg(debug_assertions)] { usize::try_from(self).expect("i32 overflowed usize") } #[cfg(not(debug_assertions))] { self as usize } } fn to_bits(self) -> u32 { self as u32 } fn from_bits(n: u32) -> i32 { n as i32 } } pub(crate) trait Usize { fn as_u8(self) -> u8; fn as_u16(self) -> u16; fn as_u32(self) -> u32; fn as_u64(self) -> u64; } impl Usize for usize { fn as_u8(self) -> u8 { #[cfg(debug_assertions)] { u8::try_from(self).expect("usize overflowed u8") } #[cfg(not(debug_assertions))] { self as u8 } } fn 
as_u16(self) -> u16 { #[cfg(debug_assertions)] { u16::try_from(self).expect("usize overflowed u16") } #[cfg(not(debug_assertions))] { self as u16 } } fn as_u32(self) -> u32 { #[cfg(debug_assertions)] { u32::try_from(self).expect("usize overflowed u32") } #[cfg(not(debug_assertions))] { self as u32 } } fn as_u64(self) -> u64 { #[cfg(debug_assertions)] { u64::try_from(self).expect("usize overflowed u64") } #[cfg(not(debug_assertions))] { self as u64 } } } // Pointers aren't integers, but we convert pointers to integers to perform // offset arithmetic in some places. (And no, we don't convert the integers // back to pointers.) So add 'as_usize' conversions here too for completeness. // // These 'as' casts are actually okay because they're always non-lossy. But the // idea here is to just try and remove as much 'as' as possible, particularly // in this crate where we are being really paranoid about offsets and making // sure we don't panic on inputs that might be untrusted. This way, the 'as' // casts become easier to audit if they're all in one place, even when some of // them are actually okay 100% of the time. pub(crate) trait Pointer { fn as_usize(self) -> usize; } impl Pointer for *const T { fn as_usize(self) -> usize { self as usize } } regex-automata-0.4.9/src/util/interpolate.rs000064400000000000000000000416351046102023000172320ustar 00000000000000/*! Provides routines for interpolating capture group references. That is, if a replacement string contains references like `$foo` or `${foo1}`, then they are replaced with the corresponding capture values for the groups named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` is supported as well, with `1` corresponding to a capture group index and not a name. This module provides the free functions [`string`] and [`bytes`], which interpolate Rust Unicode strings and byte strings, respectively. # Format These routines support two different kinds of capture references: unbraced and braced. 
For the unbraced format, the format supported is `$ref` where `ref` can be any sequence of characters in the class `[0-9A-Za-z_]`. `ref` is always the longest possible parse. So for example, `$1a` corresponds to the capture group named `1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then it is treated as a capture group index itself and not a name. For the braced format, the format supported is `${ref}` where `ref` can be any sequence of bytes except for `}`. If no closing brace occurs, then it is not considered a capture reference. As with the unbraced format, if `ref` matches `^[0-9]+$`, then it is treated as a capture group index and not a name. The braced format is useful for exerting precise control over the name of the capture reference. For example, `${1}a` corresponds to the capture group reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above) corresponds to the capture group reference `1a`. The braced format is also useful for expressing capture group names that use characters not supported by the unbraced format. For example, `${foo[bar].baz}` refers to the capture group named `foo[bar].baz`. If a capture group reference is found and it does not refer to a valid capture group, then it will be replaced with the empty string. To write a literal `$`, use `$$`. To be clear, and as exhibited via the type signatures in the routines in this module, it is impossible for a replacement string to be invalid. A replacement string may not have the intended semantics, but the interpolation procedure itself can never fail. */ use alloc::{string::String, vec::Vec}; use crate::util::memchr::memchr; /// Accepts a replacement string and interpolates capture references with their /// corresponding values. /// /// `append` should be a function that appends the string value of a capture /// group at a particular index to the string given. If the capture group /// index is invalid, then nothing should be appended.
///
/// `name_to_index` should be a function that maps a capture group name to a
/// capture group index. If the given name doesn't exist, then `None` should
/// be returned.
///
/// Finally, `dst` is where the final interpolated contents should be written.
/// If `replacement` contains no capture group references, then `dst` will be
/// equivalent to `replacement`.
///
/// See the [module documentation](self) for details about the format
/// supported.
///
/// # Example
///
/// ```
/// use regex_automata::util::interpolate;
///
/// let mut dst = String::new();
/// interpolate::string(
///     "foo $bar baz",
///     |index, dst| {
///         if index == 0 {
///             dst.push_str("BAR");
///         }
///     },
///     |name| {
///         if name == "bar" {
///             Some(0)
///         } else {
///             None
///         }
///     },
///     &mut dst,
/// );
/// assert_eq!("foo BAR baz", dst);
/// ```
pub fn string(
    mut replacement: &str,
    mut append: impl FnMut(usize, &mut String),
    mut name_to_index: impl FnMut(&str) -> Option<usize>,
    dst: &mut String,
) {
    while !replacement.is_empty() {
        // Copy everything up to the next '$' verbatim. If there is no '$'
        // left, the remainder is flushed after the loop.
        match memchr(b'$', replacement.as_bytes()) {
            None => break,
            Some(i) => {
                dst.push_str(&replacement[..i]);
                replacement = &replacement[i..];
            }
        }
        // Handle escaping of '$'.
        if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
            dst.push('$');
            replacement = &replacement[2..];
            continue;
        }
        debug_assert!(!replacement.is_empty());
        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
            Some(cap_ref) => cap_ref,
            None => {
                // Not a capture reference, so the '$' is a literal.
                dst.push('$');
                replacement = &replacement[1..];
                continue;
            }
        };
        replacement = &replacement[cap_ref.end..];
        match cap_ref.cap {
            Ref::Number(i) => append(i, dst),
            Ref::Named(name) => {
                // Unknown names contribute nothing to the output.
                if let Some(i) = name_to_index(name) {
                    append(i, dst);
                }
            }
        }
    }
    dst.push_str(replacement);
}

/// Accepts a replacement byte string and interpolates capture references with
/// their corresponding values.
///
/// `append` should be a function that appends the byte string value of a
/// capture group at a particular index to the byte string given.
If the /// capture group index is invalid, then nothing should be appended. /// /// `name_to_index` should be a function that maps a capture group name to a /// capture group index. If the given name doesn't exist, then `None` should /// be returned. /// /// Finally, `dst` is where the final interpolated contents should be written. /// If `replacement` contains no capture group references, then `dst` will be /// equivalent to `replacement`. /// /// See the [module documentation](self) for details about the format /// supported. /// /// # Example /// /// ``` /// use regex_automata::util::interpolate; /// /// let mut dst = vec![]; /// interpolate::bytes( /// b"foo $bar baz", /// |index, dst| { /// if index == 0 { /// dst.extend_from_slice(b"BAR"); /// } /// }, /// |name| { /// if name == "bar" { /// Some(0) /// } else { /// None /// } /// }, /// &mut dst, /// ); /// assert_eq!(&b"foo BAR baz"[..], dst); /// ``` pub fn bytes( mut replacement: &[u8], mut append: impl FnMut(usize, &mut Vec), mut name_to_index: impl FnMut(&str) -> Option, dst: &mut Vec, ) { while !replacement.is_empty() { match memchr(b'$', replacement) { None => break, Some(i) => { dst.extend_from_slice(&replacement[..i]); replacement = &replacement[i..]; } } // Handle escaping of '$'. if replacement.get(1).map_or(false, |&b| b == b'$') { dst.push(b'$'); replacement = &replacement[2..]; continue; } debug_assert!(!replacement.is_empty()); let cap_ref = match find_cap_ref(replacement) { Some(cap_ref) => cap_ref, None => { dst.push(b'$'); replacement = &replacement[1..]; continue; } }; replacement = &replacement[cap_ref.end..]; match cap_ref.cap { Ref::Number(i) => append(i, dst), Ref::Named(name) => { if let Some(i) = name_to_index(name) { append(i, dst); } } } } dst.extend_from_slice(replacement); } /// `CaptureRef` represents a reference to a capture group inside some text. /// The reference is either a capture group name or a number. 
///
/// It is also tagged with the position in the text following the
/// capture reference.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
    cap: Ref<'a>,
    end: usize,
}

/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    Named(&'a str),
    Number(usize),
}

impl<'a> From<&'a str> for Ref<'a> {
    fn from(x: &'a str) -> Ref<'a> {
        Ref::Named(x)
    }
}

impl From<usize> for Ref<'static> {
    fn from(x: usize) -> Ref<'static> {
        Ref::Number(x)
    }
}

/// Parses a possible reference to a capture group name in the given text,
/// starting at the beginning of `replacement`.
///
/// If no such valid reference could be found, None is returned.
///
/// Note that this returns a "possible" reference because this routine doesn't
/// know whether the reference is to a valid group or not. If it winds up not
/// being a valid reference, then it should be replaced with the empty string.
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
    let rep: &[u8] = replacement;
    // A reference needs a leading '$' and at least one byte after it.
    if rep.len() <= 1 || rep[0] != b'$' {
        return None;
    }
    // Position of the first byte following the '$'.
    let i = 1;
    if rep[i] == b'{' {
        return find_cap_ref_braced(rep, i + 1);
    }
    // Unbraced form: take the longest run of valid name bytes.
    let mut cap_end = i;
    while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
        cap_end += 1;
    }
    if cap_end == i {
        return None;
    }
    // We just verified that the range 0..cap_end is valid ASCII, so it must
    // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
    // check via an unchecked conversion or by parsing the number straight from
    // &[u8].
    let cap = core::str::from_utf8(&rep[i..cap_end])
        .expect("valid UTF-8 capture name");
    Some(CaptureRef {
        // A reference that parses as an integer is an index; anything else
        // is a name.
        cap: match cap.parse::<usize>() {
            Ok(i) => Ref::Number(i),
            Err(_) => Ref::Named(cap),
        },
        end: cap_end,
    })
}

/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening
/// brace has been found at `i-1` in `rep`. This then looks for a closing
/// brace and returns the capture reference within the brace.
fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option> { assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]); let start = i; while rep.get(i).map_or(false, |&b| b != b'}') { i += 1; } if !rep.get(i).map_or(false, |&b| b == b'}') { return None; } // When looking at braced names, we don't put any restrictions on the name, // so it's possible it could be invalid UTF-8. But a capture group name // can never be invalid UTF-8, so if we have invalid UTF-8, then we can // safely return None. let cap = match core::str::from_utf8(&rep[start..i]) { Err(_) => return None, Ok(cap) => cap, }; Some(CaptureRef { cap: match cap.parse::() { Ok(i) => Ref::Number(i), Err(_) => Ref::Named(cap), }, end: i + 1, }) } /// Returns true if and only if the given byte is allowed in a capture name /// written in non-brace form. fn is_valid_cap_letter(b: u8) -> bool { match b { b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' => true, _ => false, } } #[cfg(test)] mod tests { use alloc::{string::String, vec, vec::Vec}; use super::{find_cap_ref, CaptureRef}; macro_rules! find { ($name:ident, $text:expr) => { #[test] fn $name() { assert_eq!(None, find_cap_ref($text.as_bytes())); } }; ($name:ident, $text:expr, $capref:expr) => { #[test] fn $name() { assert_eq!(Some($capref), find_cap_ref($text.as_bytes())); } }; } macro_rules! 
c { ($name_or_number:expr, $pos:expr) => { CaptureRef { cap: $name_or_number.into(), end: $pos } }; } find!(find_cap_ref1, "$foo", c!("foo", 4)); find!(find_cap_ref2, "${foo}", c!("foo", 6)); find!(find_cap_ref3, "$0", c!(0, 2)); find!(find_cap_ref4, "$5", c!(5, 2)); find!(find_cap_ref5, "$10", c!(10, 3)); // See https://github.com/rust-lang/regex/pull/585 // for more on characters following numbers find!(find_cap_ref6, "$42a", c!("42a", 4)); find!(find_cap_ref7, "${42}a", c!(42, 5)); find!(find_cap_ref8, "${42"); find!(find_cap_ref9, "${42 "); find!(find_cap_ref10, " $0 "); find!(find_cap_ref11, "$"); find!(find_cap_ref12, " "); find!(find_cap_ref13, ""); find!(find_cap_ref14, "$1-$2", c!(1, 2)); find!(find_cap_ref15, "$1_$2", c!("1_", 3)); find!(find_cap_ref16, "$x-$y", c!("x", 2)); find!(find_cap_ref17, "$x_$y", c!("x_", 3)); find!(find_cap_ref18, "${#}", c!("#", 4)); find!(find_cap_ref19, "${Z[}", c!("Z[", 5)); find!(find_cap_ref20, "${¾}", c!("¾", 5)); find!(find_cap_ref21, "${¾a}", c!("¾a", 6)); find!(find_cap_ref22, "${a¾}", c!("a¾", 6)); find!(find_cap_ref23, "${☃}", c!("☃", 6)); find!(find_cap_ref24, "${a☃}", c!("a☃", 7)); find!(find_cap_ref25, "${☃a}", c!("☃a", 7)); find!(find_cap_ref26, "${名字}", c!("名字", 9)); fn interpolate_string( mut name_to_index: Vec<(&'static str, usize)>, caps: Vec<&'static str>, replacement: &str, ) -> String { name_to_index.sort_by_key(|x| x.0); let mut dst = String::new(); super::string( replacement, |i, dst| { if let Some(&s) = caps.get(i) { dst.push_str(s); } }, |name| -> Option { name_to_index .binary_search_by_key(&name, |x| x.0) .ok() .map(|i| name_to_index[i].1) }, &mut dst, ); dst } fn interpolate_bytes( mut name_to_index: Vec<(&'static str, usize)>, caps: Vec<&'static str>, replacement: &str, ) -> String { name_to_index.sort_by_key(|x| x.0); let mut dst = vec![]; super::bytes( replacement.as_bytes(), |i, dst| { if let Some(&s) = caps.get(i) { dst.extend_from_slice(s.as_bytes()); } }, |name| -> Option { name_to_index 
.binary_search_by_key(&name, |x| x.0) .ok() .map(|i| name_to_index[i].1) }, &mut dst, ); String::from_utf8(dst).unwrap() } macro_rules! interp { ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => { #[test] fn $name() { assert_eq!( $expected, interpolate_string($map, $caps, $hay), "interpolate::string failed", ); assert_eq!( $expected, interpolate_bytes($map, $caps, $hay), "interpolate::bytes failed", ); } }; } interp!( interp1, vec![("foo", 2)], vec!["", "", "xxx"], "test $foo test", "test xxx test", ); interp!( interp2, vec![("foo", 2)], vec!["", "", "xxx"], "test$footest", "test", ); interp!( interp3, vec![("foo", 2)], vec!["", "", "xxx"], "test${foo}test", "testxxxtest", ); interp!( interp4, vec![("foo", 2)], vec!["", "", "xxx"], "test$2test", "test", ); interp!( interp5, vec![("foo", 2)], vec!["", "", "xxx"], "test${2}test", "testxxxtest", ); interp!( interp6, vec![("foo", 2)], vec!["", "", "xxx"], "test $$foo test", "test $foo test", ); interp!( interp7, vec![("foo", 2)], vec!["", "", "xxx"], "test $foo", "test xxx", ); interp!( interp8, vec![("foo", 2)], vec!["", "", "xxx"], "$foo test", "xxx test", ); interp!( interp9, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test $bar$foo", "test yyyxxx", ); interp!( interp10, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test $ test", "test $ test", ); interp!( interp11, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${} test", "test test", ); interp!( interp12, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${ } test", "test test", ); interp!( interp13, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${a b} test", "test test", ); interp!( interp14, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${a} test", "test test", ); // This is a funny case where a braced reference is never closed, but // within the unclosed braced reference, there is an unbraced reference. 
// In this case, the braced reference is just treated literally and the // unbraced reference is found. interp!( interp15, vec![("bar", 1), ("foo", 2)], vec!["", "yyy", "xxx"], "test ${wat $bar ok", "test ${wat yyy ok", ); } regex-automata-0.4.9/src/util/iter.rs000064400000000000000000001116151046102023000156430ustar 00000000000000/*! Generic helpers for iteration of matches from a regex engine in a haystack. The principal type in this module is a [`Searcher`]. A `Searcher` provides its own lower level iterator-like API in addition to methods for constructing types that implement `Iterator`. The documentation for `Searcher` explains a bit more about why these different APIs exist. Currently, this module supports iteration over any regex engine that works with the [`HalfMatch`], [`Match`] or [`Captures`] types. */ #[cfg(feature = "alloc")] use crate::util::captures::Captures; use crate::util::search::{HalfMatch, Input, Match, MatchError}; /// A searcher for creating iterators and performing lower level iteration. /// /// This searcher encapsulates the logic required for finding all successive /// non-overlapping matches in a haystack. In theory, iteration would look /// something like this: /// /// 1. Setting the start position to `0`. /// 2. Execute a regex search. If no match, end iteration. /// 3. Report the match and set the start position to the end of the match. /// 4. Go back to (2). /// /// And if this were indeed the case, it's likely that `Searcher` wouldn't /// exist. Unfortunately, because a regex may match the empty string, the above /// logic won't work for all possible regexes. Namely, if an empty match is /// found, then step (3) would set the start position of the search to the /// position it was at. Thus, iteration would never end. /// /// Instead, a `Searcher` knows how to detect these cases and forcefully /// advance iteration in the case of an empty match that overlaps with a /// previous match.
/// /// If you know that your regex cannot match any empty string, then the simple /// algorithm described above will work correctly. /// /// When possible, prefer the iterators defined on the regex engine you're /// using. This tries to abstract over the regex engine and is thus a bit more /// unwieldy to use. /// /// In particular, a `Searcher` is not itself an iterator. Instead, it provides /// `advance` routines that permit moving the search along explicitly. It also /// provides various routines, like [`Searcher::into_matches_iter`], that /// accept a closure (representing how a regex engine executes a search) and /// returns a conventional iterator. /// /// The lifetime parameters come from the [`Input`] type passed to /// [`Searcher::new`]: /// /// * `'h` is the lifetime of the underlying haystack. /// /// # Searcher vs Iterator /// /// Why does a search type with "advance" APIs exist at all when we also have /// iterators? Unfortunately, the reasoning behind this split is a complex /// combination of the following things: /// /// 1. While many of the regex engines expose their own iterators, it is also /// nice to expose this lower level iteration helper because it permits callers /// to provide their own `Input` configuration. Moreover, a `Searcher` can work /// with _any_ regex engine instead of only the ones defined in this crate. /// This way, everyone benefits from a shared iteration implementation. /// 2. There are many different regex engines that, while they have the same /// match semantics, they have slightly different APIs. Iteration is just /// complex enough to want to share code, and so we need a way of abstracting /// over those different regex engines. While we could define a new trait that /// describes any regex engine search API, it would wind up looking very close /// to a closure. While there may still be reasons for the more generic trait /// to exist, for now and for the purposes of iteration, we use a closure. 
/// Closures also provide a lot of easy flexibility at the call site, in that /// they permit the caller to borrow any kind of state they want for use during /// each search call. /// 3. As a result of using closures, and because closures are anonymous types /// that cannot be named, it is difficult to encapsulate them without both /// costs to speed and added complexity to the public API. For example, in /// defining an iterator type like /// [`dfa::regex::FindMatches`](crate::dfa::regex::FindMatches), /// if we use a closure internally, it's not possible to name this type in the /// return type of the iterator constructor. Thus, the only way around it is /// to erase the type by boxing it and turning it into a `Box`. /// This boxed closure is unlikely to be inlined _and_ it infects the public /// API in subtle ways. Namely, unless you declare the closure as implementing /// `Send` and `Sync`, then the resulting iterator type won't implement it /// either. But there are practical issues with requiring the closure to /// implement `Send` and `Sync` that result in other API complexities that /// are beyond the scope of this already long exposition. /// 4. Some regex engines expose more complex match information than just /// "which pattern matched" and "at what offsets." For example, the PikeVM /// exposes match spans for each capturing group that participated in the /// match. In such cases, it can be quite beneficial to reuse the capturing /// group allocation on subsequent searches. A proper iterator doesn't permit /// this API due to its interface, so it's useful to have something a bit lower /// level that permits callers to amortize allocations while also reusing a /// shared implementation of iteration. (See the documentation for /// [`Searcher::advance`] for an example of using the "advance" API with the /// PikeVM.) 
/// /// What this boils down to is that there are "advance" APIs which require /// handing a closure to it for every call, and there are also APIs to create /// iterators from a closure. The former are useful for _implementing_ /// iterators or when you need more flexibility, while the latter are useful /// for conveniently writing custom iterators on-the-fly. /// /// # Example: iterating with captures /// /// Several regex engines in this crate offer convenient iterator APIs over /// [`Captures`] values. To do so, this requires allocating a new `Captures` /// value for each iteration step. This can perhaps be more costly than you /// might want. Instead of implementing your own iterator to avoid that /// cost (which can be a little subtle if you want to handle empty matches /// correctly), you can use this `Searcher` to do it for you: /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::iter::Searcher, /// Input, Span, /// }; /// /// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?; /// let haystack = "foo1 foo12 foo123"; /// /// let mut caps = re.create_captures(); /// let mut cache = re.create_cache(); /// let mut matches = vec![]; /// let mut searcher = Searcher::new(Input::new(haystack)); /// while let Some(_) = searcher.advance(|input| { /// re.search(&mut cache, input, &mut caps); /// Ok(caps.get_match()) /// }) { /// // The unwrap is OK since 'numbers' matches if the pattern matches. /// matches.push(caps.get_group_by_name("numbers").unwrap()); /// } /// assert_eq!(matches, vec![ /// Span::from(3..4), /// Span::from(8..10), /// Span::from(14..17), /// ]); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[derive(Clone, Debug)] pub struct Searcher<'h> { /// The input parameters to give to each regex engine call. /// /// The start position of the search is mutated during iteration. input: Input<'h>, /// Records the end offset of the most recent match.
This is necessary to /// handle a corner case for preventing empty matches from overlapping with /// the ending bounds of a prior match. last_match_end: Option, } impl<'h> Searcher<'h> { /// Create a new fallible non-overlapping matches iterator. /// /// The given `input` provides the parameters (including the haystack), /// while the `finder` represents a closure that calls the underlying regex /// engine. The closure may borrow any additional state that is needed, /// such as a prefilter scanner. pub fn new(input: Input<'h>) -> Searcher<'h> { Searcher { input, last_match_end: None } } /// Returns the current `Input` used by this searcher. /// /// The `Input` returned is generally equivalent to the one given to /// [`Searcher::new`], but its start position may be different to reflect /// the start of the next search to be executed. pub fn input<'s>(&'s self) -> &'s Input<'h> { &self.input } /// Return the next half match for an infallible search if one exists, and /// advance to the next position. /// /// This is like `try_advance_half`, except errors are converted into /// panics. /// /// # Panics /// /// If the given closure returns an error, then this panics. This is useful /// when you know your underlying regex engine has been configured to not /// return an error. /// /// # Example /// /// This example shows how to use a `Searcher` to iterate over all matches /// when using a DFA, which only provides "half" matches. 
/// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// util::iter::Searcher, /// HalfMatch, Input, /// }; /// /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); /// let mut it = Searcher::new(input); /// /// let expected = Some(HalfMatch::must(0, 10)); /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = Some(HalfMatch::must(0, 21)); /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = Some(HalfMatch::must(0, 32)); /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = None; /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` /// /// This correctly moves iteration forward even when an empty match occurs: /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// util::iter::Searcher, /// HalfMatch, Input, /// }; /// /// let re = DFA::new(r"a|")?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("abba"); /// let mut it = Searcher::new(input); /// /// let expected = Some(HalfMatch::must(0, 1)); /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = Some(HalfMatch::must(0, 2)); /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = Some(HalfMatch::must(0, 4)); /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = None; /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input)); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn 
advance_half(&mut self, finder: F) -> Option where F: FnMut(&Input<'_>) -> Result, MatchError>, { match self.try_advance_half(finder) { Ok(m) => m, Err(err) => panic!( "unexpected regex half find error: {}\n\ to handle find errors, use 'try' or 'search' methods", err, ), } } /// Return the next match for an infallible search if one exists, and /// advance to the next position. /// /// The search is advanced even in the presence of empty matches by /// forbidding empty matches from overlapping with any other match. /// /// This is like `try_advance`, except errors are converted into panics. /// /// # Panics /// /// If the given closure returns an error, then this panics. This is useful /// when you know your underlying regex engine has been configured to not /// return an error. /// /// # Example /// /// This example shows how to use a `Searcher` to iterate over all matches /// when using a regex based on lazy DFAs: /// /// ``` /// use regex_automata::{ /// hybrid::regex::Regex, /// util::iter::Searcher, /// Match, Input, /// }; /// /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); /// let mut it = Searcher::new(input); /// /// let expected = Some(Match::must(0, 0..10)); /// let got = it.advance(|input| re.try_search(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = Some(Match::must(0, 11..21)); /// let got = it.advance(|input| re.try_search(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = Some(Match::must(0, 22..32)); /// let got = it.advance(|input| re.try_search(&mut cache, input)); /// assert_eq!(expected, got); /// /// let expected = None; /// let got = it.advance(|input| re.try_search(&mut cache, input)); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` /// /// This example shows the same as above, but with the PikeVM. 
This example /// is useful because it shows how to use this API even when the regex /// engine doesn't directly return a `Match`. /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::iter::Searcher, /// Match, Input, /// }; /// /// let re = PikeVM::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); /// let mut it = Searcher::new(input); /// /// let expected = Some(Match::must(0, 0..10)); /// let got = it.advance(|input| { /// re.search(&mut cache, input, &mut caps); /// Ok(caps.get_match()) /// }); /// // Note that if we wanted to extract capturing group spans, we could /// // do that here with 'caps'. /// assert_eq!(expected, got); /// /// let expected = Some(Match::must(0, 11..21)); /// let got = it.advance(|input| { /// re.search(&mut cache, input, &mut caps); /// Ok(caps.get_match()) /// }); /// assert_eq!(expected, got); /// /// let expected = Some(Match::must(0, 22..32)); /// let got = it.advance(|input| { /// re.search(&mut cache, input, &mut caps); /// Ok(caps.get_match()) /// }); /// assert_eq!(expected, got); /// /// let expected = None; /// let got = it.advance(|input| { /// re.search(&mut cache, input, &mut caps); /// Ok(caps.get_match()) /// }); /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn advance(&mut self, finder: F) -> Option where F: FnMut(&Input<'_>) -> Result, MatchError>, { match self.try_advance(finder) { Ok(m) => m, Err(err) => panic!( "unexpected regex find error: {}\n\ to handle find errors, use 'try' or 'search' methods", err, ), } } /// Return the next half match for a fallible search if one exists, and /// advance to the next position. /// /// This is like `advance_half`, except it permits callers to handle errors /// during iteration. 
#[inline] pub fn try_advance_half( &mut self, mut finder: F, ) -> Result, MatchError> where F: FnMut(&Input<'_>) -> Result, MatchError>, { let mut m = match finder(&self.input)? { None => return Ok(None), Some(m) => m, }; if Some(m.offset()) == self.last_match_end { m = match self.handle_overlapping_empty_half_match(m, finder)? { None => return Ok(None), Some(m) => m, }; } self.input.set_start(m.offset()); self.last_match_end = Some(m.offset()); Ok(Some(m)) } /// Return the next match for a fallible search if one exists, and advance /// to the next position. /// /// This is like `advance`, except it permits callers to handle errors /// during iteration. #[inline] pub fn try_advance( &mut self, mut finder: F, ) -> Result, MatchError> where F: FnMut(&Input<'_>) -> Result, MatchError>, { let mut m = match finder(&self.input)? { None => return Ok(None), Some(m) => m, }; if m.is_empty() && Some(m.end()) == self.last_match_end { m = match self.handle_overlapping_empty_match(m, finder)? { None => return Ok(None), Some(m) => m, }; } self.input.set_start(m.end()); self.last_match_end = Some(m.end()); Ok(Some(m)) } /// Given a closure that executes a single search, return an iterator over /// all successive non-overlapping half matches. /// /// The iterator returned yields result values. If the underlying regex /// engine is configured to never return an error, consider calling /// [`TryHalfMatchesIter::infallible`] to convert errors into panics. /// /// # Example /// /// This example shows how to use a `Searcher` to create a proper /// iterator over half matches. 
/// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// util::iter::Searcher, /// HalfMatch, Input, /// }; /// /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); /// let mut it = Searcher::new(input).into_half_matches_iter(|input| { /// re.try_search_fwd(&mut cache, input) /// }); /// /// let expected = Some(Ok(HalfMatch::must(0, 10))); /// assert_eq!(expected, it.next()); /// /// let expected = Some(Ok(HalfMatch::must(0, 21))); /// assert_eq!(expected, it.next()); /// /// let expected = Some(Ok(HalfMatch::must(0, 32))); /// assert_eq!(expected, it.next()); /// /// let expected = None; /// assert_eq!(expected, it.next()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn into_half_matches_iter( self, finder: F, ) -> TryHalfMatchesIter<'h, F> where F: FnMut(&Input<'_>) -> Result, MatchError>, { TryHalfMatchesIter { it: self, finder } } /// Given a closure that executes a single search, return an iterator over /// all successive non-overlapping matches. /// /// The iterator returned yields result values. If the underlying regex /// engine is configured to never return an error, consider calling /// [`TryMatchesIter::infallible`] to convert errors into panics. /// /// # Example /// /// This example shows how to use a `Searcher` to create a proper /// iterator over matches. 
/// /// ``` /// use regex_automata::{ /// hybrid::regex::Regex, /// util::iter::Searcher, /// Match, Input, /// }; /// /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?; /// let mut cache = re.create_cache(); /// /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22"); /// let mut it = Searcher::new(input).into_matches_iter(|input| { /// re.try_search(&mut cache, input) /// }); /// /// let expected = Some(Ok(Match::must(0, 0..10))); /// assert_eq!(expected, it.next()); /// /// let expected = Some(Ok(Match::must(0, 11..21))); /// assert_eq!(expected, it.next()); /// /// let expected = Some(Ok(Match::must(0, 22..32))); /// assert_eq!(expected, it.next()); /// /// let expected = None; /// assert_eq!(expected, it.next()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn into_matches_iter(self, finder: F) -> TryMatchesIter<'h, F> where F: FnMut(&Input<'_>) -> Result, MatchError>, { TryMatchesIter { it: self, finder } } /// Given a closure that executes a single search, return an iterator over /// all successive non-overlapping `Captures` values. /// /// The iterator returned yields result values. If the underlying regex /// engine is configured to never return an error, consider calling /// [`TryCapturesIter::infallible`] to convert errors into panics. /// /// Unlike the other iterator constructors, this accepts an initial /// `Captures` value. This `Captures` value is reused for each search, and /// the iterator implementation clones it before returning it. The caller /// must provide this value because the iterator is purposely ignorant /// of the underlying regex engine and thus doesn't know how to create /// one itself. More to the point, a `Captures` value itself has a few /// different constructors, which change which kind of information is /// available to query in exchange for search performance. 
/// Given a closure that executes a single search, return an iterator over
/// all successive non-overlapping `Captures` values.
///
/// The iterator returned yields result values. If the underlying regex
/// engine is configured to never return an error, consider calling
/// [`TryCapturesIter::infallible`] to convert errors into panics.
///
/// Unlike the other iterator constructors, this accepts an initial
/// `Captures` value. This `Captures` value is reused for each search, and
/// the iterator implementation clones it before returning it. The caller
/// must provide this value because the iterator is purposely ignorant
/// of the underlying regex engine and thus doesn't know how to create
/// one itself. More to the point, a `Captures` value itself has a few
/// different constructors, which change which kind of information is
/// available to query in exchange for search performance.
///
/// # Example
///
/// This example shows how to use a `Searcher` to create a proper iterator
/// over `Captures` values, which provides access to all capturing group
/// spans for each match.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     util::iter::Searcher,
///     Input,
/// };
///
/// let re = PikeVM::new(
///     r"(?P<y>[0-9]{4})-(?P<m>[0-9]{2})-(?P<d>[0-9]{2})",
/// )?;
/// let (mut cache, caps) = (re.create_cache(), re.create_captures());
///
/// let haystack = "2010-03-14 2016-10-08 2020-10-22";
/// let input = Input::new(haystack);
/// let mut it = Searcher::new(input)
///     .into_captures_iter(caps, |input, caps| {
///         re.search(&mut cache, input, caps);
///         Ok(())
///     });
///
/// let got = it.next().expect("first date")?;
/// let year = got.get_group_by_name("y").expect("must match");
/// assert_eq!("2010", &haystack[year]);
///
/// let got = it.next().expect("second date")?;
/// let month = got.get_group_by_name("m").expect("must match");
/// assert_eq!("10", &haystack[month]);
///
/// let got = it.next().expect("third date")?;
/// let day = got.get_group_by_name("d").expect("must match");
/// assert_eq!("22", &haystack[day]);
///
/// assert!(it.next().is_none());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[cfg(feature = "alloc")]
#[inline]
pub fn into_captures_iter<F>(
    self,
    caps: Captures,
    finder: F,
) -> TryCapturesIter<'h, F>
where
    F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>,
{
    TryCapturesIter { it: self, caps, finder }
}
#[cold] #[inline(never)] fn handle_overlapping_empty_half_match( &mut self, _: HalfMatch, mut finder: F, ) -> Result, MatchError> where F: FnMut(&Input<'_>) -> Result, MatchError>, { // Since we are only here when 'm.offset()' matches the offset of the // last match, it follows that this must have been an empty match. // Since we both need to make progress *and* prevent overlapping // matches, we discard this match and advance the search by 1. // // Note that this may start a search in the middle of a codepoint. The // regex engines themselves are expected to deal with that and not // report any matches within a codepoint if they are configured in // UTF-8 mode. self.input.set_start(self.input.start().checked_add(1).unwrap()); finder(&self.input) } /// Handles the special case of an empty match by ensuring that 1) the /// iterator always advances and 2) empty matches never overlap with other /// matches. /// /// (1) is necessary because we principally make progress by setting the /// starting location of the next search to the ending location of the last /// match. But if a match is empty, then this results in a search that does /// not advance and thus does not terminate. /// /// (2) is not strictly necessary, but makes intuitive sense and matches /// the presiding behavior of most general purpose regex engines. The /// "intuitive sense" here is that we want to report NON-overlapping /// matches. So for example, given the regex 'a|(?:)' against the haystack /// 'a', without the special handling, you'd get the matches [0, 1) and [1, /// 1), where the latter overlaps with the end bounds of the former. /// /// Note that we mark this cold and forcefully prevent inlining because /// handling empty matches like this is extremely rare and does require /// quite a bit of code, comparatively. Keeping this code out of the main /// iterator function keeps it smaller and more amenable to inlining /// itself. 
#[cold] #[inline(never)] fn handle_overlapping_empty_match( &mut self, m: Match, mut finder: F, ) -> Result, MatchError> where F: FnMut(&Input<'_>) -> Result, MatchError>, { assert!(m.is_empty()); self.input.set_start(self.input.start().checked_add(1).unwrap()); finder(&self.input) } } /// An iterator over all non-overlapping half matches for a fallible search. /// /// The iterator yields a `Result` value until no more /// matches could be found. /// /// The type parameters are as follows: /// /// * `F` represents the type of a closure that executes the search. /// /// The lifetime parameters come from the [`Input`] type: /// /// * `'h` is the lifetime of the underlying haystack. /// /// When possible, prefer the iterators defined on the regex engine you're /// using. This tries to abstract over the regex engine and is thus a bit more /// unwieldy to use. /// /// This iterator is created by [`Searcher::into_half_matches_iter`]. pub struct TryHalfMatchesIter<'h, F> { it: Searcher<'h>, finder: F, } impl<'h, F> TryHalfMatchesIter<'h, F> { /// Return an infallible version of this iterator. /// /// Any item yielded that corresponds to an error results in a panic. This /// is useful if your underlying regex engine is configured in a way that /// it is guaranteed to never return an error. pub fn infallible(self) -> HalfMatchesIter<'h, F> { HalfMatchesIter(self) } /// Returns the current `Input` used by this iterator. /// /// The `Input` returned is generally equivalent to the one used to /// construct this iterator, but its start position may be different to /// reflect the start of the next search to be executed. 
pub fn input<'i>(&'i self) -> &'i Input<'h> { self.it.input() } } impl<'h, F> Iterator for TryHalfMatchesIter<'h, F> where F: FnMut(&Input<'_>) -> Result, MatchError>, { type Item = Result; #[inline] fn next(&mut self) -> Option> { self.it.try_advance_half(&mut self.finder).transpose() } } impl<'h, F> core::fmt::Debug for TryHalfMatchesIter<'h, F> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("TryHalfMatchesIter") .field("it", &self.it) .field("finder", &"") .finish() } } /// An iterator over all non-overlapping half matches for an infallible search. /// /// The iterator yields a [`HalfMatch`] value until no more matches could be /// found. /// /// The type parameters are as follows: /// /// * `F` represents the type of a closure that executes the search. /// /// The lifetime parameters come from the [`Input`] type: /// /// * `'h` is the lifetime of the underlying haystack. /// /// When possible, prefer the iterators defined on the regex engine you're /// using. This tries to abstract over the regex engine and is thus a bit more /// unwieldy to use. /// /// This iterator is created by [`Searcher::into_half_matches_iter`] and /// then calling [`TryHalfMatchesIter::infallible`]. #[derive(Debug)] pub struct HalfMatchesIter<'h, F>(TryHalfMatchesIter<'h, F>); impl<'h, F> HalfMatchesIter<'h, F> { /// Returns the current `Input` used by this iterator. /// /// The `Input` returned is generally equivalent to the one used to /// construct this iterator, but its start position may be different to /// reflect the start of the next search to be executed. pub fn input<'i>(&'i self) -> &'i Input<'h> { self.0.it.input() } } impl<'h, F> Iterator for HalfMatchesIter<'h, F> where F: FnMut(&Input<'_>) -> Result, MatchError>, { type Item = HalfMatch; #[inline] fn next(&mut self) -> Option { match self.0.next()? 
{ Ok(m) => Some(m), Err(err) => panic!( "unexpected regex half find error: {}\n\ to handle find errors, use 'try' or 'search' methods", err, ), } } } /// An iterator over all non-overlapping matches for a fallible search. /// /// The iterator yields a `Result` value until no more /// matches could be found. /// /// The type parameters are as follows: /// /// * `F` represents the type of a closure that executes the search. /// /// The lifetime parameters come from the [`Input`] type: /// /// * `'h` is the lifetime of the underlying haystack. /// /// When possible, prefer the iterators defined on the regex engine you're /// using. This tries to abstract over the regex engine and is thus a bit more /// unwieldy to use. /// /// This iterator is created by [`Searcher::into_matches_iter`]. pub struct TryMatchesIter<'h, F> { it: Searcher<'h>, finder: F, } impl<'h, F> TryMatchesIter<'h, F> { /// Return an infallible version of this iterator. /// /// Any item yielded that corresponds to an error results in a panic. This /// is useful if your underlying regex engine is configured in a way that /// it is guaranteed to never return an error. pub fn infallible(self) -> MatchesIter<'h, F> { MatchesIter(self) } /// Returns the current `Input` used by this iterator. /// /// The `Input` returned is generally equivalent to the one used to /// construct this iterator, but its start position may be different to /// reflect the start of the next search to be executed. 
pub fn input<'i>(&'i self) -> &'i Input<'h> { self.it.input() } } impl<'h, F> Iterator for TryMatchesIter<'h, F> where F: FnMut(&Input<'_>) -> Result, MatchError>, { type Item = Result; #[inline] fn next(&mut self) -> Option> { self.it.try_advance(&mut self.finder).transpose() } } impl<'h, F> core::fmt::Debug for TryMatchesIter<'h, F> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("TryMatchesIter") .field("it", &self.it) .field("finder", &"") .finish() } } /// An iterator over all non-overlapping matches for an infallible search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// /// The type parameters are as follows: /// /// * `F` represents the type of a closure that executes the search. /// /// The lifetime parameters come from the [`Input`] type: /// /// * `'h` is the lifetime of the underlying haystack. /// /// When possible, prefer the iterators defined on the regex engine you're /// using. This tries to abstract over the regex engine and is thus a bit more /// unwieldy to use. /// /// This iterator is created by [`Searcher::into_matches_iter`] and /// then calling [`TryMatchesIter::infallible`]. #[derive(Debug)] pub struct MatchesIter<'h, F>(TryMatchesIter<'h, F>); impl<'h, F> MatchesIter<'h, F> { /// Returns the current `Input` used by this iterator. /// /// The `Input` returned is generally equivalent to the one used to /// construct this iterator, but its start position may be different to /// reflect the start of the next search to be executed. pub fn input<'i>(&'i self) -> &'i Input<'h> { self.0.it.input() } } impl<'h, F> Iterator for MatchesIter<'h, F> where F: FnMut(&Input<'_>) -> Result, MatchError>, { type Item = Match; #[inline] fn next(&mut self) -> Option { match self.0.next()? 
{ Ok(m) => Some(m), Err(err) => panic!( "unexpected regex find error: {}\n\ to handle find errors, use 'try' or 'search' methods", err, ), } } } /// An iterator over all non-overlapping captures for a fallible search. /// /// The iterator yields a `Result` value until no more /// matches could be found. /// /// The type parameters are as follows: /// /// * `F` represents the type of a closure that executes the search. /// /// The lifetime parameters come from the [`Input`] type: /// /// * `'h` is the lifetime of the underlying haystack. /// /// When possible, prefer the iterators defined on the regex engine you're /// using. This tries to abstract over the regex engine and is thus a bit more /// unwieldy to use. /// /// This iterator is created by [`Searcher::into_captures_iter`]. #[cfg(feature = "alloc")] pub struct TryCapturesIter<'h, F> { it: Searcher<'h>, caps: Captures, finder: F, } #[cfg(feature = "alloc")] impl<'h, F> TryCapturesIter<'h, F> { /// Return an infallible version of this iterator. /// /// Any item yielded that corresponds to an error results in a panic. This /// is useful if your underlying regex engine is configured in a way that /// it is guaranteed to never return an error. 
pub fn infallible(self) -> CapturesIter<'h, F> { CapturesIter(self) } } #[cfg(feature = "alloc")] impl<'h, F> Iterator for TryCapturesIter<'h, F> where F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, { type Item = Result; #[inline] fn next(&mut self) -> Option> { let TryCapturesIter { ref mut it, ref mut caps, ref mut finder } = *self; let result = it .try_advance(|input| { (finder)(input, caps)?; Ok(caps.get_match()) }) .transpose()?; match result { Ok(_) => Some(Ok(caps.clone())), Err(err) => Some(Err(err)), } } } #[cfg(feature = "alloc")] impl<'h, F> core::fmt::Debug for TryCapturesIter<'h, F> { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("TryCapturesIter") .field("it", &self.it) .field("caps", &self.caps) .field("finder", &"") .finish() } } /// An iterator over all non-overlapping captures for an infallible search. /// /// The iterator yields a [`Captures`] value until no more matches could be /// found. /// /// The type parameters are as follows: /// /// * `F` represents the type of a closure that executes the search. /// /// The lifetime parameters come from the [`Input`] type: /// /// * `'h` is the lifetime of the underlying haystack. /// /// When possible, prefer the iterators defined on the regex engine you're /// using. This tries to abstract over the regex engine and is thus a bit more /// unwieldy to use. /// /// This iterator is created by [`Searcher::into_captures_iter`] and then /// calling [`TryCapturesIter::infallible`]. #[cfg(feature = "alloc")] #[derive(Debug)] pub struct CapturesIter<'h, F>(TryCapturesIter<'h, F>); #[cfg(feature = "alloc")] impl<'h, F> Iterator for CapturesIter<'h, F> where F: FnMut(&Input<'_>, &mut Captures) -> Result<(), MatchError>, { type Item = Captures; #[inline] fn next(&mut self) -> Option { match self.0.next()? 
{ Ok(m) => Some(m), Err(err) => panic!( "unexpected regex captures error: {}\n\ to handle find errors, use 'try' or 'search' methods", err, ), } } } regex-automata-0.4.9/src/util/lazy.rs000064400000000000000000000447621046102023000156670ustar 00000000000000/*! A lazily initialized value for safe sharing between threads. The principal type in this module is `Lazy`, which makes it easy to construct values that are shared safely across multiple threads simultaneously. */ use core::fmt; /// A lazily initialized value that implements `Deref` for `T`. /// /// A `Lazy` takes an initialization function and permits callers from any /// thread to access the result of that initialization function in a safe /// manner. In effect, this permits one-time initialization of global resources /// in a (possibly) multi-threaded program. /// /// This type and its functionality are available even when neither the `alloc` /// nor the `std` features are enabled. In exchange, a `Lazy` does **not** /// guarantee that the given `create` function is called at most once. It /// might be called multiple times. Moreover, a call to `Lazy::get` (either /// explicitly or implicitly via `Lazy`'s `Deref` impl) may block until a `T` /// is available. /// /// This is very similar to `lazy_static` or `once_cell`, except it doesn't /// guarantee that the initialization function will be run once and it works /// in no-alloc no-std environments. With that said, if you need stronger /// guarantees or a more flexible API, then it is recommended to use either /// `lazy_static` or `once_cell`. /// /// # Warning: may use a spin lock /// /// When this crate is compiled _without_ the `alloc` feature, then this type /// may use a spin lock internally. This can have subtle effects that may /// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more /// thorough treatment of this topic.
/// /// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html /// /// # Example /// /// This type is useful for creating regexes once, and then using them from /// multiple threads simultaneously without worrying about synchronization. /// /// ``` /// use regex_automata::{dfa::regex::Regex, util::lazy::Lazy, Match}; /// /// static RE: Lazy = Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap()); /// /// let expected = Some(Match::must(0, 3..14)); /// assert_eq!(expected, RE.find(b"zzzfoo12345barzzz")); /// ``` pub struct Lazy T>(lazy::Lazy); impl Lazy { /// Create a new `Lazy` value that is initialized via the given function. /// /// The `T` type is automatically inferred from the return type of the /// `create` function given. pub const fn new(create: F) -> Lazy { Lazy(lazy::Lazy::new(create)) } } impl T> Lazy { /// Return a reference to the lazily initialized value. /// /// This routine may block if another thread is initializing a `T`. /// /// Note that given a `x` which has type `Lazy`, this must be called via /// `Lazy::get(x)` and not `x.get()`. This routine is defined this way /// because `Lazy` impls `Deref` with a target of `T`. /// /// # Panics /// /// This panics if the `create` function inside this lazy value panics. /// If the panic occurred in another thread, then this routine _may_ also /// panic (but is not guaranteed to do so). pub fn get(this: &Lazy) -> &T { this.0.get() } } impl T> core::ops::Deref for Lazy { type Target = T; fn deref(&self) -> &T { Lazy::get(self) } } impl T> fmt::Debug for Lazy { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { self.0.fmt(f) } } #[cfg(feature = "alloc")] mod lazy { use core::{ fmt, marker::PhantomData, sync::atomic::{AtomicPtr, Ordering}, }; use alloc::boxed::Box; /// A non-std lazy initialized value. /// /// This might run the initialization function more than once, but will /// never block. 
/// /// I wish I could get these semantics into the non-alloc non-std Lazy /// type below, but I'm not sure how to do it. If you can do an alloc, /// then the implementation becomes very simple if you don't care about /// redundant work precisely because a pointer can be atomically swapped. /// /// Perhaps making this approach work in the non-alloc non-std case /// requires asking the caller for a pointer? It would make the API less /// convenient I think. pub(super) struct Lazy { data: AtomicPtr, create: F, // This indicates to the compiler that this type can drop T. It's not // totally clear how the absence of this marker could lead to trouble, // but putting here doesn't have any downsides so we hedge until somone // can from the Unsafe Working Group can tell us definitively that we // don't need it. // // See: https://github.com/BurntSushi/regex-automata/issues/30 owned: PhantomData>, } // SAFETY: So long as T and &T (and F and &F) can themselves be safely // shared among threads, so to can a Lazy. Namely, the Lazy API only // permits accessing a &T and initialization is free of data races. So if T // is thread safe, then so to is Lazy. // // We specifically require that T: Send in order for Lazy to be Sync. // Without that requirement, it's possible to send a T from one thread to // another via Lazy's destructor. // // It's not clear whether we need F: Send+Sync for Lazy to be Sync. But // we're conservative for now and keep both. unsafe impl Sync for Lazy {} impl Lazy { /// Create a new alloc but non-std lazy value that is racily /// initialized. That is, the 'create' function may be called more than /// once. pub(super) const fn new(create: F) -> Lazy { Lazy { data: AtomicPtr::new(core::ptr::null_mut()), create, owned: PhantomData, } } } impl T> Lazy { /// Get the underlying lazy value. 
If it hasn't been initialized /// yet, then always attempt to initialize it (even if some other /// thread is initializing it) and atomically attach it to this lazy /// value before returning it. pub(super) fn get(&self) -> &T { if let Some(data) = self.poll() { return data; } let data = (self.create)(); let mut ptr = Box::into_raw(Box::new(data)); // We attempt to stuff our initialized value into our atomic // pointer. Upon success, we don't need to do anything. But if // someone else beat us to the punch, then we need to make sure // our newly created value is dropped. let result = self.data.compare_exchange( core::ptr::null_mut(), ptr, Ordering::AcqRel, Ordering::Acquire, ); if let Err(old) = result { // SAFETY: We created 'ptr' via Box::into_raw above, so turning // it back into a Box via from_raw is safe. drop(unsafe { Box::from_raw(ptr) }); ptr = old; } // SAFETY: We just set the pointer above to a non-null value, even // in the error case, and set it to a fully initialized value // returned by 'create'. unsafe { &*ptr } } /// If this lazy value has been initialized successfully, then return /// that value. Otherwise return None immediately. This never attempts /// to run initialization itself. fn poll(&self) -> Option<&T> { let ptr = self.data.load(Ordering::Acquire); if ptr.is_null() { return None; } // SAFETY: We just checked that the pointer is not null. Since it's // not null, it must have been fully initialized by 'get' at some // point. Some(unsafe { &*ptr }) } } impl T> fmt::Debug for Lazy { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Lazy").field("data", &self.poll()).finish() } } impl Drop for Lazy { fn drop(&mut self) { let ptr = *self.data.get_mut(); if !ptr.is_null() { // SAFETY: We just checked that 'ptr' is not null. And since // we have exclusive access, there are no races to worry about. 
drop(unsafe { Box::from_raw(ptr) }); } } } } #[cfg(not(feature = "alloc"))] mod lazy { use core::{ cell::Cell, fmt, mem::MaybeUninit, panic::{RefUnwindSafe, UnwindSafe}, sync::atomic::{AtomicU8, Ordering}, }; /// Our 'Lazy' value can be in one of three states: /// /// * INIT is where it starts, and also ends up back here if the /// 'create' routine panics. /// * BUSY is where it sits while initialization is running in exactly /// one thread. /// * DONE is where it sits after 'create' has completed and 'data' has /// been fully initialized. const LAZY_STATE_INIT: u8 = 0; const LAZY_STATE_BUSY: u8 = 1; const LAZY_STATE_DONE: u8 = 2; /// A non-alloc non-std lazy initialized value. /// /// This guarantees initialization only happens once, but uses a spinlock /// to block in the case of simultaneous access. Blocking occurs so that /// one thread waits while another thread initializes the value. /// /// I would much rather have the semantics of the 'alloc' Lazy type above. /// Namely, that we might run the initialization function more than once, /// but we never otherwise block. However, I don't know how to do that in /// a non-alloc non-std context. pub(super) struct Lazy { state: AtomicU8, create: Cell>, data: Cell>, } // SAFETY: So long as T and &T (and F and &F) can themselves be safely // shared among threads, so to can a Lazy. Namely, the Lazy API only // permits accessing a &T and initialization is free of data races. So if T // is thread safe, then so to is Lazy. unsafe impl Sync for Lazy {} // A reference to a Lazy is unwind safe because we specifically take // precautions to poison all accesses to a Lazy if the caller-provided // 'create' function panics. impl RefUnwindSafe for Lazy { } impl Lazy { /// Create a new non-alloc non-std lazy value that is initialized /// exactly once on first use using the given function. 
pub(super) const fn new(create: F) -> Lazy<T, F> {
            Lazy {
                state: AtomicU8::new(LAZY_STATE_INIT),
                create: Cell::new(Some(create)),
                data: Cell::new(MaybeUninit::uninit()),
            }
        }
    }

    impl<T, F: FnOnce() -> T> Lazy<T, F> {
        /// Get the underlying lazy value. If it hasn't been initialized
        /// yet, then either initialize it or block until some other thread
        /// initializes it. If the 'create' function given to Lazy::new panics
        /// (even in another thread), then this panics too.
        pub(super) fn get(&self) -> &T {
            // This is effectively a spinlock. We loop until we enter a DONE
            // state, and if possible, initialize it ourselves. The only way
            // we exit the loop is if 'create' panics, we initialize 'data' or
            // some other thread initializes 'data'.
            //
            // Yes, I have read spinlocks considered harmful[1]. And that
            // article is why this spinlock is only active when 'alloc' isn't
            // enabled. I did this because I don't think there is really
            // another choice without 'alloc', other than not providing this at
            // all. But I think that's a big bummer.
            //
            // [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html
            while self.state.load(Ordering::Acquire) != LAZY_STATE_DONE {
                // Check if we're the first ones to get here. If so, we'll be
                // the ones who initialize.
                let result = self.state.compare_exchange(
                    LAZY_STATE_INIT,
                    LAZY_STATE_BUSY,
                    Ordering::AcqRel,
                    Ordering::Acquire,
                );
                // This means we saw the INIT state and nobody else can. So we
                // must take responsibility for initializing. And by virtue of
                // observing INIT, we have also told anyone else trying to
                // get here that we are BUSY. If someone else sees BUSY, then
                // they will spin until we finish initialization.
                if result.is_ok() {
                    // Since we are guaranteed to be the only ones here, we
                    // know that 'create' is there... Unless someone else got
                    // here before us and 'create' panicked. In which case,
                    // 'self.create' is now 'None' and we forward the panic
                    // to the caller. (i.e., We implement poisoning.)
                    //
                    // SAFETY: Our use of 'self.state' guarantees that we are
                    // the only thread executing this line, and thus there are
                    // no races.
                    let create = unsafe {
                        (*self.create.as_ptr()).take().expect(
                            "Lazy's create function panicked, \
                             preventing initialization, \
                             poisoning current thread",
                        )
                    };
                    let guard = Guard { state: &self.state };
                    // SAFETY: Our use of 'self.state' guarantees that we are
                    // the only thread executing this line, and thus there are
                    // no races.
                    unsafe {
                        (*self.data.as_ptr()).as_mut_ptr().write(create());
                    }
                    // All is well. 'self.create' ran successfully, so we
                    // forget the guard.
                    core::mem::forget(guard);
                    // Everything is initialized, so we can declare success.
                    self.state.store(LAZY_STATE_DONE, Ordering::Release);
                    break;
                }
                core::hint::spin_loop();
            }
            // We only get here if data is fully initialized, and thus poll
            // will always return something.
            self.poll().unwrap()
        }

        /// If this lazy value has been initialized successfully, then return
        /// that value. Otherwise return None immediately. This never blocks.
        fn poll(&self) -> Option<&T> {
            if self.state.load(Ordering::Acquire) == LAZY_STATE_DONE {
                // SAFETY: The DONE state only occurs when data has been fully
                // initialized.
                Some(unsafe { &*(*self.data.as_ptr()).as_ptr() })
            } else {
                None
            }
        }
    }

    impl<T: fmt::Debug, F: FnMut() -> T> fmt::Debug for Lazy<T, F> {
        fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
            // Uses 'poll' so that formatting never triggers initialization.
            f.debug_struct("Lazy")
                .field("state", &self.state.load(Ordering::Acquire))
                .field("create", &"<closure>")
                .field("data", &self.poll())
                .finish()
        }
    }

    impl<T, F> Drop for Lazy<T, F> {
        fn drop(&mut self) {
            if *self.state.get_mut() == LAZY_STATE_DONE {
                // SAFETY: state is DONE if and only if data has been fully
                // initialized. At which point, it is safe to drop.
                unsafe {
                    self.data.get_mut().assume_init_drop();
                }
            }
        }
    }

    /// A guard that will reset a Lazy's state back to INIT when dropped. The
    /// idea here is to 'forget' this guard on success. On failure (when a
    /// panic occurs), the Drop impl runs and causes all in-progress and future
    /// 'get' calls to panic. Without this guard, all in-progress and future
    /// 'get' calls would spin forever. Crashing is much better than getting
    /// stuck in an infinite loop.
    struct Guard<'a> {
        state: &'a AtomicU8,
    }

    impl<'a> Drop for Guard<'a> {
        fn drop(&mut self) {
            // We force ourselves back into an INIT state. This will in turn
            // cause any future 'get' calls to attempt calling 'self.create'
            // again which will in turn panic because 'self.create' will now
            // be 'None'.
            self.state.store(LAZY_STATE_INIT, Ordering::Release);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    fn assert_send<T: Send>() {}
    fn assert_sync<T: Sync>() {}
    fn assert_unwind<T: core::panic::UnwindSafe>() {}
    fn assert_refunwind<T: core::panic::RefUnwindSafe>() {}

    #[test]
    fn oibits() {
        assert_send::<Lazy<u64>>();
        assert_sync::<Lazy<u64>>();
        assert_unwind::<Lazy<u64>>();
        assert_refunwind::<Lazy<u64>>();
    }

    // This is a regression test because we used to rely on the inferred Sync
    // impl for the Lazy type defined above (for 'alloc' mode). In the
    // inferred impl, it only requires that T: Sync for Lazy<T>: Sync. But
    // if we have that, we can actually make use of the fact that Lazy<T> drops
    // T to create a value on one thread and drop it on another. This *should*
    // require T: Send, but our missing bounds before let it sneak by.
    //
    // Basically, this test should not compile, so we... comment it out. We
    // don't have a great way of testing compile-fail tests right now.
    //
    // See: https://github.com/BurntSushi/regex-automata/issues/30
    /*
    #[test]
    fn sync_not_send() {
        #[allow(dead_code)]
        fn inner<T: Sync + Default>() {
            let lazy = Lazy::new(move || T::default());
            std::thread::scope(|scope| {
                scope.spawn(|| {
                    Lazy::get(&lazy); // We create T in this thread
                });
            });
            // And drop in this thread.
            drop(lazy);
            // So we have sent a !Send type over threads.
(with some more // legwork, it's possible to even sneak the value out of drop // through thread local) } } */ } regex-automata-0.4.9/src/util/look.rs000064400000000000000000002762111046102023000156500ustar 00000000000000/*! Types and routines for working with look-around assertions. This module principally defines three types: * [`Look`] enumerates all of the assertions supported by this crate. * [`LookSet`] provides a way to efficiently store a set of [`Look`] values. * [`LookMatcher`] provides routines for checking whether a `Look` or a `LookSet` matches at a particular position in a haystack. */ // LAMENTATION: Sadly, a lot of the API of `Look` and `LookSet` were basically // copied verbatim from the regex-syntax crate. I would have no problems using // the regex-syntax types and defining the matching routines (only found // in this crate) as free functions, except the `Look` and `LookSet` types // are used in lots of places. Including in places we expect to work when // regex-syntax is *not* enabled, such as in the definition of the NFA itself. // // Thankfully the code we copy is pretty simple and there isn't much of it. // Otherwise, the rest of this module deals with *matching* the assertions, // which is not something that regex-syntax handles. use crate::util::{escape::DebugByte, utf8}; /// A look-around assertion. /// /// An assertion matches at a position between characters in a haystack. /// Namely, it does not actually "consume" any input as most parts of a regular /// expression do. Assertions are a way of stating that some property must be /// true at a particular point during matching. /// /// For example, `(?m)^[a-z]+$` is a pattern that: /// /// * Scans the haystack for a position at which `(?m:^)` is satisfied. That /// occurs at either the beginning of the haystack, or immediately following /// a `\n` character. /// * Looks for one or more occurrences of `[a-z]`.
/// * Once `[a-z]+` has matched as much as it can, an overall match is only
/// reported when `[a-z]+` stops just before a `\n`.
///
/// So in this case, `abc` and `\nabc\n` match, but `\nabc1\n` does not.
///
/// Assertions are also called "look-around," "look-behind" and "look-ahead."
/// Specifically, some assertions are look-behind (like `^`), other assertions
/// are look-ahead (like `$`) and yet other assertions are both look-ahead and
/// look-behind (like `\b`).
///
/// # Assertions in an NFA
///
/// An assertion in a [`thompson::NFA`](crate::nfa::thompson::NFA) can be
/// thought of as a conditional epsilon transition. That is, a matching engine
/// like the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) only permits
/// moving through conditional epsilon transitions when their condition
/// is satisfied at whatever position the `PikeVM` is currently at in the
/// haystack.
///
/// How assertions are handled in a `DFA` is trickier, since a DFA does not
/// have epsilon transitions at all. In this case, they are compiled into the
/// automaton itself, at the expense of more states than what would be required
/// without an assertion.
///
/// # Representation
///
/// Every variant's discriminant is a distinct power of two (`1 << 0` through
/// `1 << 17`), so each assertion corresponds to exactly one bit.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Look {
    /// Match the beginning of text. Specifically, this matches at the starting
    /// position of the input.
    Start = 1 << 0,
    /// Match the end of text. Specifically, this matches at the ending
    /// position of the input.
    End = 1 << 1,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following a `\n` character.
    StartLF = 1 << 2,
    /// Match the end of a line or the end of text. Specifically, this matches
    /// at the end position of the input, or at the position immediately
    /// preceding a `\n` character.
    EndLF = 1 << 3,
    /// Match the beginning of a line or the beginning of text. Specifically,
    /// this matches at the starting position of the input, or at the position
    /// immediately following either a `\r` or `\n` character, but never after
    /// a `\r` when a `\n` follows.
    StartCRLF = 1 << 4,
    /// Match the end of a line or the end of text. Specifically, this matches
    /// at the end position of the input, or at the position immediately
    /// preceding a `\r` or `\n` character, but never before a `\n` when a `\r`
    /// precedes it.
    EndCRLF = 1 << 5,
    /// Match an ASCII-only word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    WordAscii = 1 << 6,
    /// Match an ASCII-only negation of a word boundary.
    WordAsciiNegate = 1 << 7,
    /// Match a Unicode-aware word boundary. That is, this matches a position
    /// where the left adjacent character and right adjacent character
    /// correspond to a word and non-word or a non-word and word character.
    WordUnicode = 1 << 8,
    /// Match a Unicode-aware negation of a word boundary.
    WordUnicodeNegate = 1 << 9,
    /// Match the start of an ASCII-only word boundary. That is, this matches a
    /// position at either the beginning of the haystack or where the previous
    /// character is not a word character and the following character is a word
    /// character.
    WordStartAscii = 1 << 10,
    /// Match the end of an ASCII-only word boundary. That is, this matches
    /// a position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a word
    /// character.
    WordEndAscii = 1 << 11,
    /// Match the start of a Unicode word boundary. That is, this matches a
    /// position at either the beginning of the haystack or where the previous
    /// character is not a word character and the following character is a word
    /// character.
    WordStartUnicode = 1 << 12,
    /// Match the end of a Unicode word boundary. That is, this matches a
    /// position at either the end of the haystack or where the previous
    /// character is a word character and the following character is not a word
    /// character.
    WordEndUnicode = 1 << 13,
    /// Match the start half of an ASCII-only word boundary. That is, this
    /// matches a position at either the beginning of the haystack or where the
    /// previous character is not a word character.
    WordStartHalfAscii = 1 << 14,
    /// Match the end half of an ASCII-only word boundary. That is, this
    /// matches a position at either the end of the haystack or where the
    /// following character is not a word character.
    WordEndHalfAscii = 1 << 15,
    /// Match the start half of a Unicode word boundary. That is, this matches
    /// a position at either the beginning of the haystack or where the
    /// previous character is not a word character.
    WordStartHalfUnicode = 1 << 16,
    /// Match the end half of a Unicode word boundary. That is, this matches
    /// a position at either the end of the haystack or where the following
    /// character is not a word character.
    WordEndHalfUnicode = 1 << 17,
}

impl Look {
    /// Flip the look-around assertion to its equivalent for reverse searches.
    /// For example, `StartLF` gets translated to `EndLF`.
    ///
    /// Some assertions, such as `WordUnicode`, remain the same since they
    /// match the same positions regardless of the direction of the search.
#[inline]
    pub const fn reversed(self) -> Look {
        match self {
            // Haystack and line anchors swap start for end.
            Look::Start => Look::End,
            Look::End => Look::Start,
            Look::StartLF => Look::EndLF,
            Look::EndLF => Look::StartLF,
            Look::StartCRLF => Look::EndCRLF,
            Look::EndCRLF => Look::StartCRLF,
            // Full word boundaries (and their negations) are symmetric.
            Look::WordAscii => Look::WordAscii,
            Look::WordAsciiNegate => Look::WordAsciiNegate,
            Look::WordUnicode => Look::WordUnicode,
            Look::WordUnicodeNegate => Look::WordUnicodeNegate,
            // Directional word boundaries swap start for end.
            Look::WordStartAscii => Look::WordEndAscii,
            Look::WordEndAscii => Look::WordStartAscii,
            Look::WordStartUnicode => Look::WordEndUnicode,
            Look::WordEndUnicode => Look::WordStartUnicode,
            Look::WordStartHalfAscii => Look::WordEndHalfAscii,
            Look::WordEndHalfAscii => Look::WordStartHalfAscii,
            Look::WordStartHalfUnicode => Look::WordEndHalfUnicode,
            Look::WordEndHalfUnicode => Look::WordStartHalfUnicode,
        }
    }

    /// Return the underlying representation of this look-around enumeration
    /// as an integer. Giving the return value to the [`Look::from_repr`]
    /// constructor is guaranteed to return the same look-around variant that
    /// one started with within a semver compatible release of this crate.
    #[inline]
    pub const fn as_repr(self) -> u32 {
        // AFAIK, 'as' is the only way to zero-cost convert an int enum to an
        // actual int.
        self as u32
    }

    /// Given the underlying representation of a `Look` value, return the
    /// corresponding `Look` value if the representation is valid. Otherwise
    /// `None` is returned.
    ///
    /// A representation is valid only if it has exactly one bit set and that
    /// bit belongs to a variant. In particular, `0` and any value with two or
    /// more bits set yield `None`.
    #[inline]
    pub const fn from_repr(repr: u32) -> Option<Look> {
        match repr {
            0b00_0000_0000_0000_0001 => Some(Look::Start),
            0b00_0000_0000_0000_0010 => Some(Look::End),
            0b00_0000_0000_0000_0100 => Some(Look::StartLF),
            0b00_0000_0000_0000_1000 => Some(Look::EndLF),
            0b00_0000_0000_0001_0000 => Some(Look::StartCRLF),
            0b00_0000_0000_0010_0000 => Some(Look::EndCRLF),
            0b00_0000_0000_0100_0000 => Some(Look::WordAscii),
            0b00_0000_0000_1000_0000 => Some(Look::WordAsciiNegate),
            0b00_0000_0001_0000_0000 => Some(Look::WordUnicode),
            0b00_0000_0010_0000_0000 => Some(Look::WordUnicodeNegate),
            0b00_0000_0100_0000_0000 => Some(Look::WordStartAscii),
            0b00_0000_1000_0000_0000 => Some(Look::WordEndAscii),
            0b00_0001_0000_0000_0000 => Some(Look::WordStartUnicode),
            0b00_0010_0000_0000_0000 => Some(Look::WordEndUnicode),
            0b00_0100_0000_0000_0000 => Some(Look::WordStartHalfAscii),
            0b00_1000_0000_0000_0000 => Some(Look::WordEndHalfAscii),
            0b01_0000_0000_0000_0000 => Some(Look::WordStartHalfUnicode),
            0b10_0000_0000_0000_0000 => Some(Look::WordEndHalfUnicode),
            _ => None,
        }
    }

    /// Returns a convenient single codepoint representation of this
    /// look-around assertion. Each assertion is guaranteed to be represented
    /// by a distinct character.
    ///
    /// This is useful for succinctly representing a look-around assertion in
    /// human friendly but succinct output intended for a programmer working on
    /// regex internals.
#[inline]
    pub const fn as_char(self) -> char {
        match self {
            Look::Start => 'A',
            Look::End => 'z',
            Look::StartLF => '^',
            Look::EndLF => '$',
            Look::StartCRLF => 'r',
            Look::EndCRLF => 'R',
            Look::WordAscii => 'b',
            Look::WordAsciiNegate => 'B',
            Look::WordUnicode => '𝛃',
            Look::WordUnicodeNegate => '𝚩',
            Look::WordStartAscii => '<',
            Look::WordEndAscii => '>',
            Look::WordStartUnicode => '〈',
            Look::WordEndUnicode => '〉',
            Look::WordStartHalfAscii => '◁',
            Look::WordEndHalfAscii => '▷',
            Look::WordStartHalfUnicode => '◀',
            Look::WordEndHalfUnicode => '▶',
        }
    }
}

/// LookSet is a memory-efficient set of look-around assertions.
///
/// This is useful for efficiently tracking look-around assertions. For
/// example, a [`thompson::NFA`](crate::nfa::thompson::NFA) provides properties
/// that return `LookSet`s.
#[derive(Clone, Copy, Default, Eq, PartialEq)]
pub struct LookSet {
    /// The underlying representation of this set is exposed to make it
    /// possible to store it somewhere efficiently. The representation is that
    /// of a bitset, where each assertion occupies the single bit that is set
    /// in the value returned by `Look::as_repr()`.
    ///
    /// Note that users of this internal representation must permit the full
    /// range of `u32` values to be represented. For example, even if the
    /// current implementation only makes use of the 18 least significant bits,
    /// it may use more bits in a future semver compatible release.
    pub bits: u32,
}

impl LookSet {
    /// Create an empty set of look-around assertions.
    #[inline]
    pub fn empty() -> LookSet {
        LookSet { bits: 0 }
    }

    /// Create a full set of look-around assertions.
    ///
    /// This set contains all possible look-around assertions.
    ///
    /// Note that every bit of the representation is set, including bits that
    /// do not currently correspond to any `Look` variant.
    #[inline]
    pub fn full() -> LookSet {
        LookSet { bits: !0 }
    }

    /// Create a look-around set containing the look-around assertion given.
    ///
    /// This is a convenience routine for creating an empty set and inserting
    /// one look-around assertion.
#[inline] pub fn singleton(look: Look) -> LookSet { LookSet::empty().insert(look) } /// Returns the total number of look-around assertions in this set. #[inline] pub fn len(self) -> usize { // OK because max value always fits in a u8, which in turn always // fits in a usize, regardless of target. usize::try_from(self.bits.count_ones()).unwrap() } /// Returns true if and only if this set is empty. #[inline] pub fn is_empty(self) -> bool { self.len() == 0 } /// Returns true if and only if the given look-around assertion is in this /// set. #[inline] pub fn contains(self, look: Look) -> bool { self.bits & look.as_repr() != 0 } /// Returns true if and only if this set contains any anchor assertions. /// This includes both "start/end of haystack" and "start/end of line." #[inline] pub fn contains_anchor(&self) -> bool { self.contains_anchor_haystack() || self.contains_anchor_line() } /// Returns true if and only if this set contains any "start/end of /// haystack" anchors. This doesn't include "start/end of line" anchors. #[inline] pub fn contains_anchor_haystack(&self) -> bool { self.contains(Look::Start) || self.contains(Look::End) } /// Returns true if and only if this set contains any "start/end of line" /// anchors. This doesn't include "start/end of haystack" anchors. This /// includes both `\n` line anchors and CRLF (`\r\n`) aware line anchors. #[inline] pub fn contains_anchor_line(&self) -> bool { self.contains(Look::StartLF) || self.contains(Look::EndLF) || self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) } /// Returns true if and only if this set contains any "start/end of line" /// anchors that only treat `\n` as line terminators. This does not include /// haystack anchors or CRLF aware line anchors. #[inline] pub fn contains_anchor_lf(&self) -> bool { self.contains(Look::StartLF) || self.contains(Look::EndLF) } /// Returns true if and only if this set contains any "start/end of line" /// anchors that are CRLF-aware. 
This doesn't include "start/end of /// haystack" or "start/end of line-feed" anchors. #[inline] pub fn contains_anchor_crlf(&self) -> bool { self.contains(Look::StartCRLF) || self.contains(Look::EndCRLF) } /// Returns true if and only if this set contains any word boundary or /// negated word boundary assertions. This include both Unicode and ASCII /// word boundaries. #[inline] pub fn contains_word(self) -> bool { self.contains_word_unicode() || self.contains_word_ascii() } /// Returns true if and only if this set contains any Unicode word boundary /// or negated Unicode word boundary assertions. #[inline] pub fn contains_word_unicode(self) -> bool { self.contains(Look::WordUnicode) || self.contains(Look::WordUnicodeNegate) || self.contains(Look::WordStartUnicode) || self.contains(Look::WordEndUnicode) || self.contains(Look::WordStartHalfUnicode) || self.contains(Look::WordEndHalfUnicode) } /// Returns true if and only if this set contains any ASCII word boundary /// or negated ASCII word boundary assertions. #[inline] pub fn contains_word_ascii(self) -> bool { self.contains(Look::WordAscii) || self.contains(Look::WordAsciiNegate) || self.contains(Look::WordStartAscii) || self.contains(Look::WordEndAscii) || self.contains(Look::WordStartHalfAscii) || self.contains(Look::WordEndHalfAscii) } /// Returns an iterator over all of the look-around assertions in this set. #[inline] pub fn iter(self) -> LookSetIter { LookSetIter { set: self } } /// Return a new set that is equivalent to the original, but with the given /// assertion added to it. If the assertion is already in the set, then the /// returned set is equivalent to the original. #[inline] pub fn insert(self, look: Look) -> LookSet { LookSet { bits: self.bits | look.as_repr() } } /// Updates this set in place with the result of inserting the given /// assertion into this set. 
#[inline]
pub fn set_insert(&mut self, look: Look) {
    self.bits |= look.as_repr();
}

/// Return a new set that is equivalent to the original, but with the
/// given assertion removed from it. If the assertion is not in the set,
/// then the returned set is equivalent to the original.
#[inline]
pub fn remove(self, look: Look) -> LookSet {
    let keep = !look.as_repr();
    LookSet { bits: self.bits & keep }
}

/// Updates this set in place with the result of removing the given
/// assertion from this set.
#[inline]
pub fn set_remove(&mut self, look: Look) {
    self.bits &= !look.as_repr();
}

/// Returns a new set that is the result of subtracting the given set
/// from this set.
#[inline]
pub fn subtract(self, other: LookSet) -> LookSet {
    let bits = self.bits & !other.bits;
    LookSet { bits }
}

/// Updates this set in place with the result of subtracting the given
/// set from this set.
#[inline]
pub fn set_subtract(&mut self, other: LookSet) {
    self.bits &= !other.bits;
}

/// Returns a new set that is the union of this and the one given.
#[inline]
pub fn union(self, other: LookSet) -> LookSet {
    let bits = self.bits | other.bits;
    LookSet { bits }
}

/// Updates this set in place with the result of unioning it with the one
/// given.
#[inline]
pub fn set_union(&mut self, other: LookSet) {
    self.bits |= other.bits;
}

/// Returns a new set that is the intersection of this and the one given.
#[inline]
pub fn intersect(self, other: LookSet) -> LookSet {
    let bits = self.bits & other.bits;
    LookSet { bits }
}

/// Updates this set in place with the result of intersecting it with the
/// one given.
#[inline]
pub fn set_intersect(&mut self, other: LookSet) {
    self.bits &= other.bits;
}

/// Return a `LookSet` from the slice given as a native endian 32-bit
/// integer.
///
/// # Panics
///
/// This panics if `slice.len() < 4`.
#[inline]
pub fn read_repr(slice: &[u8]) -> LookSet {
    // `slice[..4]` panics when fewer than 4 bytes are available. After that
    // the conversion to `[u8; 4]` cannot fail.
    let bits = u32::from_ne_bytes(slice[..4].try_into().unwrap());
    LookSet { bits }
}

/// Write a `LookSet` as a native endian 32-bit integer to the beginning
/// of the slice given.
///
/// # Panics
///
/// This panics if `slice.len() < 4`.
#[inline]
pub fn write_repr(self, slice: &mut [u8]) {
    // The length is checked once up front, so nothing is written when the
    // slice is too short.
    slice[..4].copy_from_slice(&self.bits.to_ne_bytes());
}

/// Checks that all assertions in this set can be matched.
///
/// Some assertions, such as Unicode word boundaries, require optional (but
/// enabled by default) tables that may not be available. If there are
/// assertions in this set that require tables that are not available, then
/// this will return an error.
///
/// Specifically, this returns an error when the
/// `unicode-word-boundary` feature is _not_ enabled _and_ this set
/// contains a Unicode word boundary assertion.
///
/// It can be useful to use this on the result of
/// [`NFA::look_set_any`](crate::nfa::thompson::NFA::look_set_any)
/// when building a matcher engine to ensure methods like
/// [`LookMatcher::matches_set`] do not panic at search time.
pub fn available(self) -> Result<(), UnicodeWordBoundaryError> {
    if self.contains_word_unicode() {
        UnicodeWordBoundaryError::check()?;
    }
    Ok(())
}
} // closes `impl LookSet`

impl core::fmt::Debug for LookSet {
    /// Formats the set as the concatenation of each member's
    /// `Look::as_char` character, or as `∅` when the set is empty.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        if self.is_empty() {
            return write!(f, "∅");
        }
        for look in self.iter() {
            write!(f, "{}", look.as_char())?;
        }
        Ok(())
    }
}

/// An iterator over all look-around assertions in a [`LookSet`].
///
/// This iterator is created by [`LookSet::iter`].
#[derive(Clone, Debug)]
pub struct LookSetIter {
    set: LookSet,
}

impl Iterator for LookSetIter {
    type Item = Look;

    /// Yields the assertion with the lowest set bit and removes it from
    /// the remaining set.
    #[inline]
    fn next(&mut self) -> Option<Look> {
        if self.set.is_empty() {
            return None;
        }
        // The set is non-empty, so `trailing_zeros` is the index of the
        // lowest set bit (at most 31), which always fits into a u16.
        let bit = u16::try_from(self.set.bits.trailing_zeros()).unwrap();
        // `from_repr` returns `None` for a bit that does not name an
        // assertion (e.g. the upper bits of `LookSet::full()`), which ends
        // the iteration.
        let look = Look::from_repr(1 << bit)?;
        self.set = self.set.remove(look);
        Some(look)
    }
}

/// A matcher for look-around assertions.
///
/// This matcher permits configuring aspects of how look-around assertions are
/// matched.
///
/// # Example
///
/// A `LookMatcher` can change the line terminator used for matching multi-line
/// anchors such as `(?m:^)` and `(?m:$)`.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::{self, pikevm::PikeVM},
///     util::look::LookMatcher,
///     Match, Input,
/// };
///
/// let mut lookm = LookMatcher::new();
/// lookm.set_line_terminator(b'\x00');
///
/// let re = PikeVM::builder()
///     .thompson(thompson::Config::new().look_matcher(lookm))
///     .build(r"(?m)^[a-z]+$")?;
/// let mut cache = re.create_cache();
///
/// // Multi-line assertions now use NUL as a terminator.
/// assert_eq!(
///     Some(Match::must(0, 1..4)),
///     re.find(&mut cache, b"\x00abc\x00"),
/// );
/// // ... and \n is no longer recognized as a terminator.
/// assert_eq!(
///     None,
///     re.find(&mut cache, b"\nabc\n"),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct LookMatcher {
    lineterm: DebugByte,
}

impl LookMatcher {
    /// Creates a new default matcher for look-around assertions.
    ///
    /// The default line terminator is `\n`.
    pub fn new() -> LookMatcher {
        LookMatcher { lineterm: DebugByte(b'\n') }
    }

    /// Sets the line terminator for use with `(?m:^)` and `(?m:$)`.
    ///
    /// Namely, instead of `^` matching after `\n` and `$` matching immediately
    /// before a `\n`, this will cause it to match after and before the byte
    /// given.
    ///
    /// It can occasionally be useful to use this to configure the line
    /// terminator to the NUL byte when searching binary data.
    ///
    /// Note that this does not apply to CRLF-aware line anchors such as
    /// `(?Rm:^)` and `(?Rm:$)`. CRLF-aware line anchors are hard-coded to
    /// use `\r` and `\n`.
pub fn set_line_terminator(&mut self, byte: u8) -> &mut LookMatcher { self.lineterm.0 = byte; self } /// Returns the line terminator that was configured for this matcher. /// /// If no line terminator was configured, then this returns `\n`. /// /// Note that the line terminator should only be used for matching `(?m:^)` /// and `(?m:$)` assertions. It specifically should _not_ be used for /// matching the CRLF aware assertions `(?Rm:^)` and `(?Rm:$)`. pub fn get_line_terminator(&self) -> u8 { self.lineterm.0 } /// Returns true when the position `at` in `haystack` satisfies the given /// look-around assertion. /// /// # Panics /// /// This panics when testing any Unicode word boundary assertion in this /// set and when the Unicode word data is not available. Specifically, this /// only occurs when the `unicode-word-boundary` feature is not enabled. /// /// Since it's generally expected that this routine is called inside of /// a matching engine, callers should check the error condition when /// building the matching engine. If there is a Unicode word boundary /// in the matcher and the data isn't available, then the matcher should /// fail to build. /// /// Callers can check the error condition with [`LookSet::available`]. /// /// This also may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn matches(&self, look: Look, haystack: &[u8], at: usize) -> bool { self.matches_inline(look, haystack, at) } /// Like `matches`, but forcefully inlined. /// /// # Panics /// /// This panics when testing any Unicode word boundary assertion in this /// set and when the Unicode word data is not available. Specifically, this /// only occurs when the `unicode-word-boundary` feature is not enabled. /// /// Since it's generally expected that this routine is called inside of /// a matching engine, callers should check the error condition when /// building the matching engine. 
If there is a Unicode word boundary /// in the matcher and the data isn't available, then the matcher should /// fail to build. /// /// Callers can check the error condition with [`LookSet::available`]. /// /// This also may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn matches_inline( &self, look: Look, haystack: &[u8], at: usize, ) -> bool { match look { Look::Start => self.is_start(haystack, at), Look::End => self.is_end(haystack, at), Look::StartLF => self.is_start_lf(haystack, at), Look::EndLF => self.is_end_lf(haystack, at), Look::StartCRLF => self.is_start_crlf(haystack, at), Look::EndCRLF => self.is_end_crlf(haystack, at), Look::WordAscii => self.is_word_ascii(haystack, at), Look::WordAsciiNegate => self.is_word_ascii_negate(haystack, at), Look::WordUnicode => self.is_word_unicode(haystack, at).unwrap(), Look::WordUnicodeNegate => { self.is_word_unicode_negate(haystack, at).unwrap() } Look::WordStartAscii => self.is_word_start_ascii(haystack, at), Look::WordEndAscii => self.is_word_end_ascii(haystack, at), Look::WordStartUnicode => { self.is_word_start_unicode(haystack, at).unwrap() } Look::WordEndUnicode => { self.is_word_end_unicode(haystack, at).unwrap() } Look::WordStartHalfAscii => { self.is_word_start_half_ascii(haystack, at) } Look::WordEndHalfAscii => { self.is_word_end_half_ascii(haystack, at) } Look::WordStartHalfUnicode => { self.is_word_start_half_unicode(haystack, at).unwrap() } Look::WordEndHalfUnicode => { self.is_word_end_half_unicode(haystack, at).unwrap() } } } /// Returns true when _all_ of the assertions in the given set match at the /// given position in the haystack. /// /// # Panics /// /// This panics when testing any Unicode word boundary assertion in this /// set and when the Unicode word data is not available. Specifically, this /// only occurs when the `unicode-word-boundary` feature is not enabled. 
/// /// Since it's generally expected that this routine is called inside of /// a matching engine, callers should check the error condition when /// building the matching engine. If there is a Unicode word boundary /// in the matcher and the data isn't available, then the matcher should /// fail to build. /// /// Callers can check the error condition with [`LookSet::available`]. /// /// This also may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn matches_set( &self, set: LookSet, haystack: &[u8], at: usize, ) -> bool { self.matches_set_inline(set, haystack, at) } /// Like `LookSet::matches`, but forcefully inlined for perf. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn matches_set_inline( &self, set: LookSet, haystack: &[u8], at: usize, ) -> bool { // This used to luse LookSet::iter with Look::matches on each element, // but that proved to be quite diastrous for perf. The manual "if // the set has this assertion, check it" turns out to be quite a bit // faster. 
if set.contains(Look::Start) { if !self.is_start(haystack, at) { return false; } } if set.contains(Look::End) { if !self.is_end(haystack, at) { return false; } } if set.contains(Look::StartLF) { if !self.is_start_lf(haystack, at) { return false; } } if set.contains(Look::EndLF) { if !self.is_end_lf(haystack, at) { return false; } } if set.contains(Look::StartCRLF) { if !self.is_start_crlf(haystack, at) { return false; } } if set.contains(Look::EndCRLF) { if !self.is_end_crlf(haystack, at) { return false; } } if set.contains(Look::WordAscii) { if !self.is_word_ascii(haystack, at) { return false; } } if set.contains(Look::WordAsciiNegate) { if !self.is_word_ascii_negate(haystack, at) { return false; } } if set.contains(Look::WordUnicode) { if !self.is_word_unicode(haystack, at).unwrap() { return false; } } if set.contains(Look::WordUnicodeNegate) { if !self.is_word_unicode_negate(haystack, at).unwrap() { return false; } } if set.contains(Look::WordStartAscii) { if !self.is_word_start_ascii(haystack, at) { return false; } } if set.contains(Look::WordEndAscii) { if !self.is_word_end_ascii(haystack, at) { return false; } } if set.contains(Look::WordStartUnicode) { if !self.is_word_start_unicode(haystack, at).unwrap() { return false; } } if set.contains(Look::WordEndUnicode) { if !self.is_word_end_unicode(haystack, at).unwrap() { return false; } } if set.contains(Look::WordStartHalfAscii) { if !self.is_word_start_half_ascii(haystack, at) { return false; } } if set.contains(Look::WordEndHalfAscii) { if !self.is_word_end_half_ascii(haystack, at) { return false; } } if set.contains(Look::WordStartHalfUnicode) { if !self.is_word_start_half_unicode(haystack, at).unwrap() { return false; } } if set.contains(Look::WordEndHalfUnicode) { if !self.is_word_end_half_unicode(haystack, at).unwrap() { return false; } } true } /// Split up the given byte classes into equivalence classes in a way that /// is consistent with this look-around assertion. 
#[cfg(feature = "alloc")] pub(crate) fn add_to_byteset( &self, look: Look, set: &mut crate::util::alphabet::ByteClassSet, ) { match look { Look::Start | Look::End => {} Look::StartLF | Look::EndLF => { set.set_range(self.lineterm.0, self.lineterm.0); } Look::StartCRLF | Look::EndCRLF => { set.set_range(b'\r', b'\r'); set.set_range(b'\n', b'\n'); } Look::WordAscii | Look::WordAsciiNegate | Look::WordUnicode | Look::WordUnicodeNegate | Look::WordStartAscii | Look::WordEndAscii | Look::WordStartUnicode | Look::WordEndUnicode | Look::WordStartHalfAscii | Look::WordEndHalfAscii | Look::WordStartHalfUnicode | Look::WordEndHalfUnicode => { // We need to mark all ranges of bytes whose pairs result in // evaluating \b differently. This isn't technically correct // for Unicode word boundaries, but DFAs can't handle those // anyway, and thus, the byte classes don't need to either // since they are themselves only used in DFAs. // // FIXME: It seems like the calls to 'set_range' here are // completely invariant, which means we could just hard-code // them here without needing to write a loop. And we only need // to do this dance at most once per regex. // // FIXME: Is this correct for \B? let iswb = utf8::is_word_byte; // This unwrap is OK because we guard every use of 'asu8' with // a check that the input is <= 255. let asu8 = |b: u16| u8::try_from(b).unwrap(); let mut b1: u16 = 0; let mut b2: u16; while b1 <= 255 { b2 = b1 + 1; while b2 <= 255 && iswb(asu8(b1)) == iswb(asu8(b2)) { b2 += 1; } // The guards above guarantee that b2 can never get any // bigger. assert!(b2 <= 256); // Subtracting 1 from b2 is always OK because it is always // at least 1 greater than b1, and the assert above // guarantees that the asu8 conversion will succeed. set.set_range(asu8(b1), asu8(b2.checked_sub(1).unwrap())); b1 = b2; } } } } /// Returns true when [`Look::Start`] is satisfied `at` the given position /// in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. 
Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_start(&self, _haystack: &[u8], at: usize) -> bool { at == 0 } /// Returns true when [`Look::End`] is satisfied `at` the given position in /// `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_end(&self, haystack: &[u8], at: usize) -> bool { at == haystack.len() } /// Returns true when [`Look::StartLF`] is satisfied `at` the given /// position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_start_lf(&self, haystack: &[u8], at: usize) -> bool { self.is_start(haystack, at) || haystack[at - 1] == self.lineterm.0 } /// Returns true when [`Look::EndLF`] is satisfied `at` the given position /// in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_end_lf(&self, haystack: &[u8], at: usize) -> bool { self.is_end(haystack, at) || haystack[at] == self.lineterm.0 } /// Returns true when [`Look::StartCRLF`] is satisfied `at` the given /// position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_start_crlf(&self, haystack: &[u8], at: usize) -> bool { self.is_start(haystack, at) || haystack[at - 1] == b'\n' || (haystack[at - 1] == b'\r' && (at >= haystack.len() || haystack[at] != b'\n')) } /// Returns true when [`Look::EndCRLF`] is satisfied `at` the given /// position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. 
#[inline] pub fn is_end_crlf(&self, haystack: &[u8], at: usize) -> bool { self.is_end(haystack, at) || haystack[at] == b'\r' || (haystack[at] == b'\n' && (at == 0 || haystack[at - 1] != b'\r')) } /// Returns true when [`Look::WordAscii`] is satisfied `at` the given /// position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_word_ascii(&self, haystack: &[u8], at: usize) -> bool { let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); let word_after = at < haystack.len() && utf8::is_word_byte(haystack[at]); word_before != word_after } /// Returns true when [`Look::WordAsciiNegate`] is satisfied `at` the given /// position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_word_ascii_negate(&self, haystack: &[u8], at: usize) -> bool { !self.is_word_ascii(haystack, at) } /// Returns true when [`Look::WordUnicode`] is satisfied `at` the given /// position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. /// /// # Errors /// /// This returns an error when Unicode word boundary tables /// are not available. Specifically, this only occurs when the /// `unicode-word-boundary` feature is not enabled. #[inline] pub fn is_word_unicode( &self, haystack: &[u8], at: usize, ) -> Result { let word_before = is_word_char::rev(haystack, at)?; let word_after = is_word_char::fwd(haystack, at)?; Ok(word_before != word_after) } /// Returns true when [`Look::WordUnicodeNegate`] is satisfied `at` the /// given position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. 
/// /// # Errors /// /// This returns an error when Unicode word boundary tables /// are not available. Specifically, this only occurs when the /// `unicode-word-boundary` feature is not enabled. #[inline] pub fn is_word_unicode_negate( &self, haystack: &[u8], at: usize, ) -> Result { // This is pretty subtle. Why do we need to do UTF-8 decoding here? // Well... at time of writing, the is_word_char_{fwd,rev} routines will // only return true if there is a valid UTF-8 encoding of a "word" // codepoint, and false in every other case (including invalid UTF-8). // This means that in regions of invalid UTF-8 (which might be a // subset of valid UTF-8!), it would result in \B matching. While this // would be questionable in the context of truly invalid UTF-8, it is // *certainly* wrong to report match boundaries that split the encoding // of a codepoint. So to work around this, we ensure that we can decode // a codepoint on either side of `at`. If either direction fails, then // we don't permit \B to match at all. // // Now, this isn't exactly optimal from a perf perspective. We could // try and detect this in is_word_char::{fwd,rev}, but it's not clear // if it's worth it. \B is, after all, rarely used. Even worse, // is_word_char::{fwd,rev} could do its own UTF-8 decoding, and so this // will wind up doing UTF-8 decoding twice. Owch. We could fix this // with more code complexity, but it just doesn't feel worth it for \B. // // And in particular, we do *not* have to do this with \b, because \b // *requires* that at least one side of `at` be a "word" codepoint, // which in turn implies one side of `at` must be valid UTF-8. This in // turn implies that \b can never split a valid UTF-8 encoding of a // codepoint. In the case where one side of `at` is truly invalid UTF-8 // and the other side IS a word codepoint, then we want \b to match // since it represents a valid UTF-8 boundary. It also makes sense. For // example, you'd want \b\w+\b to match 'abc' in '\xFFabc\xFF'. 
// // Note also that this is not just '!is_word_unicode(..)' like it is // for the ASCII case. For example, neither \b nor \B is satisfied // within invalid UTF-8 sequences. let word_before = at > 0 && match utf8::decode_last(&haystack[..at]) { None | Some(Err(_)) => return Ok(false), Some(Ok(_)) => is_word_char::rev(haystack, at)?, }; let word_after = at < haystack.len() && match utf8::decode(&haystack[at..]) { None | Some(Err(_)) => return Ok(false), Some(Ok(_)) => is_word_char::fwd(haystack, at)?, }; Ok(word_before == word_after) } /// Returns true when [`Look::WordStartAscii`] is satisfied `at` the given /// position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_word_start_ascii(&self, haystack: &[u8], at: usize) -> bool { let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); let word_after = at < haystack.len() && utf8::is_word_byte(haystack[at]); !word_before && word_after } /// Returns true when [`Look::WordEndAscii`] is satisfied `at` the given /// position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_word_end_ascii(&self, haystack: &[u8], at: usize) -> bool { let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); let word_after = at < haystack.len() && utf8::is_word_byte(haystack[at]); word_before && !word_after } /// Returns true when [`Look::WordStartUnicode`] is satisfied `at` the /// given position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. /// /// # Errors /// /// This returns an error when Unicode word boundary tables /// are not available. Specifically, this only occurs when the /// `unicode-word-boundary` feature is not enabled. 
#[inline] pub fn is_word_start_unicode( &self, haystack: &[u8], at: usize, ) -> Result { let word_before = is_word_char::rev(haystack, at)?; let word_after = is_word_char::fwd(haystack, at)?; Ok(!word_before && word_after) } /// Returns true when [`Look::WordEndUnicode`] is satisfied `at` the /// given position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. /// /// # Errors /// /// This returns an error when Unicode word boundary tables /// are not available. Specifically, this only occurs when the /// `unicode-word-boundary` feature is not enabled. #[inline] pub fn is_word_end_unicode( &self, haystack: &[u8], at: usize, ) -> Result { let word_before = is_word_char::rev(haystack, at)?; let word_after = is_word_char::fwd(haystack, at)?; Ok(word_before && !word_after) } /// Returns true when [`Look::WordStartHalfAscii`] is satisfied `at` the /// given position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_word_start_half_ascii( &self, haystack: &[u8], at: usize, ) -> bool { let word_before = at > 0 && utf8::is_word_byte(haystack[at - 1]); !word_before } /// Returns true when [`Look::WordEndHalfAscii`] is satisfied `at` the /// given position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. #[inline] pub fn is_word_end_half_ascii(&self, haystack: &[u8], at: usize) -> bool { let word_after = at < haystack.len() && utf8::is_word_byte(haystack[at]); !word_after } /// Returns true when [`Look::WordStartHalfUnicode`] is satisfied `at` the /// given position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. 
/// /// # Errors /// /// This returns an error when Unicode word boundary tables /// are not available. Specifically, this only occurs when the /// `unicode-word-boundary` feature is not enabled. #[inline] pub fn is_word_start_half_unicode( &self, haystack: &[u8], at: usize, ) -> Result { // See `is_word_unicode_negate` for why we need to do this. We don't // need to do it for `is_word_start_unicode` because that guarantees // that the position matched falls on a valid UTF-8 boundary given // that the right side must be in \w. let word_before = at > 0 && match utf8::decode_last(&haystack[..at]) { None | Some(Err(_)) => return Ok(false), Some(Ok(_)) => is_word_char::rev(haystack, at)?, }; Ok(!word_before) } /// Returns true when [`Look::WordEndHalfUnicode`] is satisfied `at` the /// given position in `haystack`. /// /// # Panics /// /// This may panic when `at > haystack.len()`. Note that `at == /// haystack.len()` is legal and guaranteed not to panic. /// /// # Errors /// /// This returns an error when Unicode word boundary tables /// are not available. Specifically, this only occurs when the /// `unicode-word-boundary` feature is not enabled. #[inline] pub fn is_word_end_half_unicode( &self, haystack: &[u8], at: usize, ) -> Result { // See `is_word_unicode_negate` for why we need to do this. We don't // need to do it for `is_word_end_unicode` because that guarantees // that the position matched falls on a valid UTF-8 boundary given // that the left side must be in \w. let word_after = at < haystack.len() && match utf8::decode(&haystack[at..]) { None | Some(Err(_)) => return Ok(false), Some(Ok(_)) => is_word_char::fwd(haystack, at)?, }; Ok(!word_after) } } impl Default for LookMatcher { fn default() -> LookMatcher { LookMatcher::new() } } /// An error that occurs when the Unicode-aware `\w` class is unavailable. /// /// This error can occur when the data tables necessary for the Unicode aware /// Perl character class `\w` are unavailable. 
The `\w` class is used to /// determine whether a codepoint is considered a word character or not when /// determining whether a Unicode aware `\b` (or `\B`) matches at a particular /// position. /// /// This error can only occur when the `unicode-word-boundary` feature is /// disabled. #[derive(Clone, Debug)] pub struct UnicodeWordBoundaryError(()); impl UnicodeWordBoundaryError { #[cfg(not(feature = "unicode-word-boundary"))] pub(crate) fn new() -> UnicodeWordBoundaryError { UnicodeWordBoundaryError(()) } /// Returns an error if and only if Unicode word boundary data is /// unavailable. pub fn check() -> Result<(), UnicodeWordBoundaryError> { is_word_char::check() } } #[cfg(feature = "std")] impl std::error::Error for UnicodeWordBoundaryError {} impl core::fmt::Display for UnicodeWordBoundaryError { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { write!( f, "Unicode-aware \\b and \\B are unavailable because the \ requisite data tables are missing, please enable the \ unicode-word-boundary feature" ) } } // Below are FOUR different ways for checking whether whether a "word" // codepoint exists at a particular position in the haystack. The four // different approaches are, in order of preference: // // 1. Parse '\w', convert to an NFA, convert to a fully compiled DFA on the // first call, and then use that DFA for all subsequent calls. // 2. Do UTF-8 decoding and use regex_syntax::is_word_character if available. // 3. Do UTF-8 decoding and use our own 'perl_word' table. // 4. Return an error. // // The reason for all of these approaches is a combination of perf and // permitting one to build regex-automata without the Unicode data necessary // for handling Unicode-aware word boundaries. (In which case, '(?-u:\b)' would // still work.) // // The DFA approach is the fastest, but it requires the regex parser, the // NFA compiler, the DFA builder and the DFA search runtime. 
That's a lot to // bring in, but if it's available, it's (probably) the best we can do. // // Approaches (2) and (3) are effectively equivalent, but (2) reuses the // data in regex-syntax and avoids duplicating it in regex-automata. // // Finally, (4) unconditionally returns an error since the requisite data isn't // available anywhere. // // There are actually more approaches possible that we didn't implement. For // example, if the DFA builder is available but the syntax parser is not, we // could technically hand construct our own NFA from the 'perl_word' data // table. But to avoid some pretty hairy code duplication, we would in turn // need to pull the UTF-8 compiler out of the NFA compiler. Yikes. // // A possibly more sensible alternative is to use a lazy DFA when the full // DFA builder isn't available... // // Yet another choice would be to build the full DFA and then embed it into the // source. Then we'd only need to bring in the DFA search runtime, which is // considerably smaller than the DFA builder code. The problem here is that the // Debian people have spooked me[1] into avoiding cyclic dependencies. Namely, // we'd need to build regex-cli, which depends on regex-automata in order to // build some part of regex-automata. But to be honest, something like this has // to be allowed somehow? I just don't know what the right process is. // // There are perhaps other choices as well. Why did I stop at these 4? Because // I wanted to preserve my sanity. I suspect I'll wind up adding the lazy DFA // approach eventually, as the benefits of the DFA approach are somewhat // compelling. The 'boundary-words-holmes' benchmark tests this. (Note that // the commands below no longer work. If necessary, we should re-capitulate // the benchmark from whole cloth in rebar.) 
// // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > dfa.csv // // Then I changed the code below so that the util/unicode_data/perl_word table // was used and re-ran the benchmark: // // $ regex-cli bench measure -f boundary-words-holmes -e pikevm > table.csv // // And compared them: // // $ regex-cli bench diff dfa.csv table.csv // benchmark engine dfa table // --------- ------ --- ----- // internal/count/boundary-words-holmes regex/automata/pikevm 18.6 MB/s 12.9 MB/s // // Which is a nice improvement. // // UPDATE: It turns out that it takes approximately 22ms to build the reverse // DFA for \w. (And about 3ms for the forward DFA.) It's probably not much in // the grand scheme of things, but that is a significant latency cost. So I'm not // sure that's a good idea. I then tried using a lazy DFA instead, and that // eliminated the overhead, but since the lazy DFA requires mutable working // memory, that requires introducing a 'Cache' for every simultaneous call. // // I ended up deciding for now to just keep the "UTF-8 decode and check the // table." The DFA and lazy DFA approaches are still below, but commented out. // // [1]: https://github.com/BurntSushi/ucd-generate/issues/11 /* /// A module that looks for word codepoints using lazy DFAs.
#[cfg(all( feature = "unicode-word-boundary", feature = "syntax", feature = "unicode-perl", feature = "hybrid" ))] mod is_word_char { use alloc::vec::Vec; use crate::{ hybrid::dfa::{Cache, DFA}, nfa::thompson::NFA, util::{lazy::Lazy, pool::Pool, primitives::StateID}, Anchored, Input, }; pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { Ok(()) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn fwd( haystack: &[u8], mut at: usize, ) -> Result { static WORD: Lazy = Lazy::new(|| DFA::new(r"\w").unwrap()); static CACHE: Lazy> = Lazy::new(|| Pool::new(|| WORD.create_cache())); let dfa = Lazy::get(&WORD); let mut cache = Lazy::get(&CACHE).get(); let mut sid = dfa .start_state_forward( &mut cache, &Input::new("").anchored(Anchored::Yes), ) .unwrap(); while at < haystack.len() { let byte = haystack[at]; sid = dfa.next_state(&mut cache, sid, byte).unwrap(); at += 1; if sid.is_tagged() { if sid.is_match() { return Ok(true); } else if sid.is_dead() { return Ok(false); } } } Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn rev( haystack: &[u8], mut at: usize, ) -> Result { static WORD: Lazy = Lazy::new(|| { DFA::builder() .thompson(NFA::config().reverse(true)) .build(r"\w") .unwrap() }); static CACHE: Lazy> = Lazy::new(|| Pool::new(|| WORD.create_cache())); let dfa = Lazy::get(&WORD); let mut cache = Lazy::get(&CACHE).get(); let mut sid = dfa .start_state_reverse( &mut cache, &Input::new("").anchored(Anchored::Yes), ) .unwrap(); while at > 0 { at -= 1; let byte = haystack[at]; sid = dfa.next_state(&mut cache, sid, byte).unwrap(); if sid.is_tagged() { if sid.is_match() { return Ok(true); } else if sid.is_dead() { return Ok(false); } } } Ok(dfa.next_eoi_state(&mut cache, sid).unwrap().is_match()) } } */ /* /// A module that looks for word codepoints using fully compiled DFAs. 
#[cfg(all( feature = "unicode-word-boundary", feature = "syntax", feature = "unicode-perl", feature = "dfa-build" ))] mod is_word_char { use alloc::vec::Vec; use crate::{ dfa::{dense::DFA, Automaton, StartKind}, nfa::thompson::NFA, util::{lazy::Lazy, primitives::StateID}, Anchored, Input, }; pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { Ok(()) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn fwd( haystack: &[u8], mut at: usize, ) -> Result { static WORD: Lazy<(DFA>, StateID)> = Lazy::new(|| { let dfa = DFA::builder() .configure(DFA::config().start_kind(StartKind::Anchored)) .build(r"\w") .unwrap(); // OK because our regex has no look-around. let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); (dfa, start_id) }); let &(ref dfa, mut sid) = Lazy::get(&WORD); while at < haystack.len() { let byte = haystack[at]; sid = dfa.next_state(sid, byte); at += 1; if dfa.is_special_state(sid) { if dfa.is_match_state(sid) { return Ok(true); } else if dfa.is_dead_state(sid) { return Ok(false); } } } Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn rev( haystack: &[u8], mut at: usize, ) -> Result { static WORD: Lazy<(DFA>, StateID)> = Lazy::new(|| { let dfa = DFA::builder() .configure(DFA::config().start_kind(StartKind::Anchored)) // From ad hoc measurements, it looks like setting // shrink==false is slightly faster than shrink==true. I kind // of feel like this indicates that shrinking is probably a // failure, although it can help in some cases. Sigh. .thompson(NFA::config().reverse(true).shrink(false)) .build(r"\w") .unwrap(); // OK because our regex has no look-around. 
let start_id = dfa.universal_start_state(Anchored::Yes).unwrap(); (dfa, start_id) }); let &(ref dfa, mut sid) = Lazy::get(&WORD); while at > 0 { at -= 1; let byte = haystack[at]; sid = dfa.next_state(sid, byte); if dfa.is_special_state(sid) { if dfa.is_match_state(sid) { return Ok(true); } else if dfa.is_dead_state(sid) { return Ok(false); } } } Ok(dfa.is_match_state(dfa.next_eoi_state(sid))) } } */ /// A module that looks for word codepoints using regex-syntax's data tables. #[cfg(all( feature = "unicode-word-boundary", feature = "syntax", feature = "unicode-perl", ))] mod is_word_char { use regex_syntax::try_is_word_character; use crate::util::utf8; /** Always returns `Ok(())` in this configuration. */ pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { Ok(()) } /** Feeds `haystack[at..]` to `utf8::decode` and reports whether the decoded char is a word character according to `try_is_word_character`. Yields `Ok(false)` when decoding produces nothing or an error. The return type carries the `bool` answer; its type arguments had been stripped from the text and are restored here. */ #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn fwd( haystack: &[u8], at: usize, ) -> Result<bool, super::UnicodeWordBoundaryError> { Ok(match utf8::decode(&haystack[at..]) { None | Some(Err(_)) => false, Some(Ok(ch)) => try_is_word_character(ch).expect( "since unicode-word-boundary, syntax and unicode-perl \ are all enabled, it is expected that \ try_is_word_character succeeds", ), }) } /** Same as `fwd`, but feeds `haystack[..at]` to `utf8::decode_last`. */ #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn rev( haystack: &[u8], at: usize, ) -> Result<bool, super::UnicodeWordBoundaryError> { Ok(match utf8::decode_last(&haystack[..at]) { None | Some(Err(_)) => false, Some(Ok(ch)) => try_is_word_character(ch).expect( "since unicode-word-boundary, syntax and unicode-perl \ are all enabled, it is expected that \ try_is_word_character succeeds", ), }) } } /// A module that looks for word codepoints using regex-automata's data tables /// (which are only compiled when regex-syntax's tables aren't available). /// /// Note that the cfg should match the one in src/util/unicode_data/mod.rs for /// perl_word.
#[cfg(all( feature = "unicode-word-boundary", not(all(feature = "syntax", feature = "unicode-perl")), ))] mod is_word_char { use crate::util::utf8; pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { Ok(()) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn fwd( haystack: &[u8], at: usize, ) -> Result { Ok(match utf8::decode(&haystack[at..]) { None | Some(Err(_)) => false, Some(Ok(ch)) => is_word_character(ch), }) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn rev( haystack: &[u8], at: usize, ) -> Result { Ok(match utf8::decode_last(&haystack[..at]) { None | Some(Err(_)) => false, Some(Ok(ch)) => is_word_character(ch), }) } #[cfg_attr(feature = "perf-inline", inline(always))] fn is_word_character(c: char) -> bool { use crate::util::{unicode_data::perl_word::PERL_WORD, utf8}; if u8::try_from(c).map_or(false, utf8::is_word_byte) { return true; } PERL_WORD .binary_search_by(|&(start, end)| { use core::cmp::Ordering; if start <= c && c <= end { Ordering::Equal } else if start > c { Ordering::Greater } else { Ordering::Less } }) .is_ok() } } /// A module that always returns an error if Unicode word boundaries are /// disabled. When this feature is disabled, then regex-automata will not /// include its own data tables even if regex-syntax is disabled. #[cfg(not(feature = "unicode-word-boundary"))] mod is_word_char { pub(super) fn check() -> Result<(), super::UnicodeWordBoundaryError> { Err(super::UnicodeWordBoundaryError::new()) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn fwd( _bytes: &[u8], _at: usize, ) -> Result { Err(super::UnicodeWordBoundaryError::new()) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(super) fn rev( _bytes: &[u8], _at: usize, ) -> Result { Err(super::UnicodeWordBoundaryError::new()) } } #[cfg(test)] mod tests { use super::*; macro_rules! 
testlook { ($look:expr, $haystack:expr, $at:expr) => { LookMatcher::default().matches($look, $haystack.as_bytes(), $at) }; } #[test] fn look_matches_start_line() { let look = Look::StartLF; assert!(testlook!(look, "", 0)); assert!(testlook!(look, "\n", 0)); assert!(testlook!(look, "\n", 1)); assert!(testlook!(look, "a", 0)); assert!(testlook!(look, "\na", 1)); assert!(!testlook!(look, "a", 1)); assert!(!testlook!(look, "a\na", 1)); } #[test] fn look_matches_end_line() { let look = Look::EndLF; assert!(testlook!(look, "", 0)); assert!(testlook!(look, "\n", 1)); assert!(testlook!(look, "\na", 0)); assert!(testlook!(look, "\na", 2)); assert!(testlook!(look, "a\na", 1)); assert!(!testlook!(look, "a", 0)); assert!(!testlook!(look, "\na", 1)); assert!(!testlook!(look, "a\na", 0)); assert!(!testlook!(look, "a\na", 2)); } #[test] fn look_matches_start_text() { let look = Look::Start; assert!(testlook!(look, "", 0)); assert!(testlook!(look, "\n", 0)); assert!(testlook!(look, "a", 0)); assert!(!testlook!(look, "\n", 1)); assert!(!testlook!(look, "\na", 1)); assert!(!testlook!(look, "a", 1)); assert!(!testlook!(look, "a\na", 1)); } #[test] fn look_matches_end_text() { let look = Look::End; assert!(testlook!(look, "", 0)); assert!(testlook!(look, "\n", 1)); assert!(testlook!(look, "\na", 2)); assert!(!testlook!(look, "\na", 0)); assert!(!testlook!(look, "a\na", 1)); assert!(!testlook!(look, "a", 0)); assert!(!testlook!(look, "\na", 1)); assert!(!testlook!(look, "a\na", 0)); assert!(!testlook!(look, "a\na", 2)); } #[test] #[cfg(all(not(miri), feature = "unicode-word-boundary"))] fn look_matches_word_unicode() { let look = Look::WordUnicode; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(testlook!(look, "a", 0)); assert!(testlook!(look, "a", 1)); assert!(testlook!(look, "a ", 1)); assert!(testlook!(look, " a ", 1)); assert!(testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. 
assert!(testlook!(look, "𝛃", 0)); assert!(testlook!(look, "𝛃", 4)); assert!(testlook!(look, "𝛃 ", 4)); assert!(testlook!(look, " 𝛃 ", 1)); assert!(testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. assert!(testlook!(look, "𝛃𐆀", 0)); assert!(testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(!testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(!testlook!(look, "a ", 2)); assert!(!testlook!(look, " a ", 0)); assert!(!testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃b", 4)); assert!(!testlook!(look, "𝛃 ", 5)); assert!(!testlook!(look, " 𝛃 ", 0)); assert!(!testlook!(look, " 𝛃 ", 6)); assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); assert!(!testlook!(look, "𝛃𐆀", 8)); } #[test] fn look_matches_word_ascii() { let look = Look::WordAscii; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(testlook!(look, "a", 0)); assert!(testlook!(look, "a", 1)); assert!(testlook!(look, "a ", 1)); assert!(testlook!(look, " a ", 1)); assert!(testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. Since this is // an ASCII word boundary, none of these match. assert!(!testlook!(look, "𝛃", 0)); assert!(!testlook!(look, "𝛃", 4)); assert!(!testlook!(look, "𝛃 ", 4)); assert!(!testlook!(look, " 𝛃 ", 1)); assert!(!testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. Again, since // this is an ASCII word boundary, none of these match. assert!(!testlook!(look, "𝛃𐆀", 0)); assert!(!testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. 
assert!(!testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(!testlook!(look, "a ", 2)); assert!(!testlook!(look, " a ", 0)); assert!(!testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(testlook!(look, "𝛃b", 4)); assert!(!testlook!(look, "𝛃 ", 5)); assert!(!testlook!(look, " 𝛃 ", 0)); assert!(!testlook!(look, " 𝛃 ", 6)); assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); assert!(!testlook!(look, "𝛃𐆀", 8)); } #[test] #[cfg(all(not(miri), feature = "unicode-word-boundary"))] fn look_matches_word_unicode_negate() { let look = Look::WordUnicodeNegate; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(!testlook!(look, "a", 0)); assert!(!testlook!(look, "a", 1)); assert!(!testlook!(look, "a ", 1)); assert!(!testlook!(look, " a ", 1)); assert!(!testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃", 0)); assert!(!testlook!(look, "𝛃", 4)); assert!(!testlook!(look, "𝛃 ", 4)); assert!(!testlook!(look, " 𝛃 ", 1)); assert!(!testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. assert!(!testlook!(look, "𝛃𐆀", 0)); assert!(!testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(testlook!(look, "", 0)); assert!(testlook!(look, "ab", 1)); assert!(testlook!(look, "a ", 2)); assert!(testlook!(look, " a ", 0)); assert!(testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. 
assert!(testlook!(look, "𝛃b", 4)); assert!(testlook!(look, "𝛃 ", 5)); assert!(testlook!(look, " 𝛃 ", 0)); assert!(testlook!(look, " 𝛃 ", 6)); // These don't match because they could otherwise return an offset that // splits the UTF-8 encoding of a codepoint. assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. These also don't // match because they could otherwise return an offset that splits the // UTF-8 encoding of a codepoint. assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); // But this one does, since 𐆀 isn't a word codepoint, and 8 is the end // of the haystack. So the "end" of the haystack isn't a word and 𐆀 // isn't a word, thus, \B matches. assert!(testlook!(look, "𝛃𐆀", 8)); } #[test] fn look_matches_word_ascii_negate() { let look = Look::WordAsciiNegate; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(!testlook!(look, "a", 0)); assert!(!testlook!(look, "a", 1)); assert!(!testlook!(look, "a ", 1)); assert!(!testlook!(look, " a ", 1)); assert!(!testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. Since this is // the negation of an ASCII word boundary, all of these match. assert!(testlook!(look, "𝛃", 0)); assert!(testlook!(look, "𝛃", 4)); assert!(testlook!(look, "𝛃 ", 4)); assert!(testlook!(look, " 𝛃 ", 1)); assert!(testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. Again, since // this is the negation of an ASCII word boundary, all of these match. assert!(testlook!(look, "𝛃𐆀", 0)); assert!(testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII.
assert!(testlook!(look, "", 0)); assert!(testlook!(look, "ab", 1)); assert!(testlook!(look, "a ", 2)); assert!(testlook!(look, " a ", 0)); assert!(testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃b", 4)); assert!(testlook!(look, "𝛃 ", 5)); assert!(testlook!(look, " 𝛃 ", 0)); assert!(testlook!(look, " 𝛃 ", 6)); assert!(testlook!(look, "𝛃", 1)); assert!(testlook!(look, "𝛃", 2)); assert!(testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. assert!(testlook!(look, "𝛃𐆀", 1)); assert!(testlook!(look, "𝛃𐆀", 2)); assert!(testlook!(look, "𝛃𐆀", 3)); assert!(testlook!(look, "𝛃𐆀", 5)); assert!(testlook!(look, "𝛃𐆀", 6)); assert!(testlook!(look, "𝛃𐆀", 7)); assert!(testlook!(look, "𝛃𐆀", 8)); } #[test] fn look_matches_word_start_ascii() { let look = Look::WordStartAscii; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(testlook!(look, "a", 0)); assert!(!testlook!(look, "a", 1)); assert!(!testlook!(look, "a ", 1)); assert!(testlook!(look, " a ", 1)); assert!(!testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. Since this is // an ASCII word boundary, none of these match. assert!(!testlook!(look, "𝛃", 0)); assert!(!testlook!(look, "𝛃", 4)); assert!(!testlook!(look, "𝛃 ", 4)); assert!(!testlook!(look, " 𝛃 ", 1)); assert!(!testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. Again, since // this is an ASCII word boundary, none of these match. assert!(!testlook!(look, "𝛃𐆀", 0)); assert!(!testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(!testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(!testlook!(look, "a ", 2)); assert!(!testlook!(look, " a ", 0)); assert!(!testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. 
assert!(testlook!(look, "𝛃b", 4)); assert!(!testlook!(look, "b𝛃", 1)); assert!(!testlook!(look, "𝛃 ", 5)); assert!(!testlook!(look, " 𝛃 ", 0)); assert!(!testlook!(look, " 𝛃 ", 6)); assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); assert!(!testlook!(look, "𝛃𐆀", 8)); } #[test] fn look_matches_word_end_ascii() { let look = Look::WordEndAscii; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(!testlook!(look, "a", 0)); assert!(testlook!(look, "a", 1)); assert!(testlook!(look, "a ", 1)); assert!(!testlook!(look, " a ", 1)); assert!(testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. Since this is // an ASCII word boundary, none of these match. assert!(!testlook!(look, "𝛃", 0)); assert!(!testlook!(look, "𝛃", 4)); assert!(!testlook!(look, "𝛃 ", 4)); assert!(!testlook!(look, " 𝛃 ", 1)); assert!(!testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. Again, since // this is an ASCII word boundary, none of these match. assert!(!testlook!(look, "𝛃𐆀", 0)); assert!(!testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(!testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(!testlook!(look, "a ", 2)); assert!(!testlook!(look, " a ", 0)); assert!(!testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. 
assert!(!testlook!(look, "𝛃b", 4)); assert!(testlook!(look, "b𝛃", 1)); assert!(!testlook!(look, "𝛃 ", 5)); assert!(!testlook!(look, " 𝛃 ", 0)); assert!(!testlook!(look, " 𝛃 ", 6)); assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); assert!(!testlook!(look, "𝛃𐆀", 8)); } #[test] #[cfg(all(not(miri), feature = "unicode-word-boundary"))] fn look_matches_word_start_unicode() { let look = Look::WordStartUnicode; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(testlook!(look, "a", 0)); assert!(!testlook!(look, "a", 1)); assert!(!testlook!(look, "a ", 1)); assert!(testlook!(look, " a ", 1)); assert!(!testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. assert!(testlook!(look, "𝛃", 0)); assert!(!testlook!(look, "𝛃", 4)); assert!(!testlook!(look, "𝛃 ", 4)); assert!(testlook!(look, " 𝛃 ", 1)); assert!(!testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. assert!(testlook!(look, "𝛃𐆀", 0)); assert!(!testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(!testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(!testlook!(look, "a ", 2)); assert!(!testlook!(look, " a ", 0)); assert!(!testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃b", 4)); assert!(!testlook!(look, "b𝛃", 1)); assert!(!testlook!(look, "𝛃 ", 5)); assert!(!testlook!(look, " 𝛃 ", 0)); assert!(!testlook!(look, " 𝛃 ", 6)); assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. 
assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); assert!(!testlook!(look, "𝛃𐆀", 8)); } #[test] #[cfg(all(not(miri), feature = "unicode-word-boundary"))] fn look_matches_word_end_unicode() { let look = Look::WordEndUnicode; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(!testlook!(look, "a", 0)); assert!(testlook!(look, "a", 1)); assert!(testlook!(look, "a ", 1)); assert!(!testlook!(look, " a ", 1)); assert!(testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃", 0)); assert!(testlook!(look, "𝛃", 4)); assert!(testlook!(look, "𝛃 ", 4)); assert!(!testlook!(look, " 𝛃 ", 1)); assert!(testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. assert!(!testlook!(look, "𝛃𐆀", 0)); assert!(testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(!testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(!testlook!(look, "a ", 2)); assert!(!testlook!(look, " a ", 0)); assert!(!testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃b", 4)); assert!(!testlook!(look, "b𝛃", 1)); assert!(!testlook!(look, "𝛃 ", 5)); assert!(!testlook!(look, " 𝛃 ", 0)); assert!(!testlook!(look, " 𝛃 ", 6)); assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. 
assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); assert!(!testlook!(look, "𝛃𐆀", 8)); } #[test] fn look_matches_word_start_half_ascii() { let look = Look::WordStartHalfAscii; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(testlook!(look, "a", 0)); assert!(!testlook!(look, "a", 1)); assert!(!testlook!(look, "a ", 1)); assert!(testlook!(look, " a ", 1)); assert!(!testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. Since this is // an ASCII half word boundary and these haystacks contain no ASCII // word bytes, all of these match. assert!(testlook!(look, "𝛃", 0)); assert!(testlook!(look, "𝛃", 4)); assert!(testlook!(look, "𝛃 ", 4)); assert!(testlook!(look, " 𝛃 ", 1)); assert!(testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. Again, since // this is an ASCII half word boundary and these haystacks contain no // ASCII word bytes, all of these match. assert!(testlook!(look, "𝛃𐆀", 0)); assert!(testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(testlook!(look, "a ", 2)); assert!(testlook!(look, " a ", 0)); assert!(testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(testlook!(look, "𝛃b", 4)); assert!(!testlook!(look, "b𝛃", 1)); assert!(testlook!(look, "𝛃 ", 5)); assert!(testlook!(look, " 𝛃 ", 0)); assert!(testlook!(look, " 𝛃 ", 6)); assert!(testlook!(look, "𝛃", 1)); assert!(testlook!(look, "𝛃", 2)); assert!(testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints.
assert!(testlook!(look, "𝛃𐆀", 1)); assert!(testlook!(look, "𝛃𐆀", 2)); assert!(testlook!(look, "𝛃𐆀", 3)); assert!(testlook!(look, "𝛃𐆀", 5)); assert!(testlook!(look, "𝛃𐆀", 6)); assert!(testlook!(look, "𝛃𐆀", 7)); assert!(testlook!(look, "𝛃𐆀", 8)); } #[test] fn look_matches_word_end_half_ascii() { let look = Look::WordEndHalfAscii; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(!testlook!(look, "a", 0)); assert!(testlook!(look, "a", 1)); assert!(testlook!(look, "a ", 1)); assert!(!testlook!(look, " a ", 1)); assert!(testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. Since this is // an ASCII half word boundary and these haystacks contain no ASCII // word bytes, all of these match. assert!(testlook!(look, "𝛃", 0)); assert!(testlook!(look, "𝛃", 4)); assert!(testlook!(look, "𝛃 ", 4)); assert!(testlook!(look, " 𝛃 ", 1)); assert!(testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. Again, since // this is an ASCII half word boundary and these haystacks contain no // ASCII word bytes, all of these match. assert!(testlook!(look, "𝛃𐆀", 0)); assert!(testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(testlook!(look, "a ", 2)); assert!(testlook!(look, " a ", 0)); assert!(testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃b", 4)); assert!(testlook!(look, "b𝛃", 1)); assert!(testlook!(look, "𝛃 ", 5)); assert!(testlook!(look, " 𝛃 ", 0)); assert!(testlook!(look, " 𝛃 ", 6)); assert!(testlook!(look, "𝛃", 1)); assert!(testlook!(look, "𝛃", 2)); assert!(testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints.
assert!(testlook!(look, "𝛃𐆀", 1)); assert!(testlook!(look, "𝛃𐆀", 2)); assert!(testlook!(look, "𝛃𐆀", 3)); assert!(testlook!(look, "𝛃𐆀", 5)); assert!(testlook!(look, "𝛃𐆀", 6)); assert!(testlook!(look, "𝛃𐆀", 7)); assert!(testlook!(look, "𝛃𐆀", 8)); } #[test] #[cfg(all(not(miri), feature = "unicode-word-boundary"))] fn look_matches_word_start_half_unicode() { let look = Look::WordStartHalfUnicode; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(testlook!(look, "a", 0)); assert!(!testlook!(look, "a", 1)); assert!(!testlook!(look, "a ", 1)); assert!(testlook!(look, " a ", 1)); assert!(!testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. assert!(testlook!(look, "𝛃", 0)); assert!(!testlook!(look, "𝛃", 4)); assert!(!testlook!(look, "𝛃 ", 4)); assert!(testlook!(look, " 𝛃 ", 1)); assert!(!testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. assert!(testlook!(look, "𝛃𐆀", 0)); assert!(!testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(testlook!(look, "a ", 2)); assert!(testlook!(look, " a ", 0)); assert!(testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃b", 4)); assert!(!testlook!(look, "b𝛃", 1)); assert!(testlook!(look, "𝛃 ", 5)); assert!(testlook!(look, " 𝛃 ", 0)); assert!(testlook!(look, " 𝛃 ", 6)); assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. 
assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); assert!(testlook!(look, "𝛃𐆀", 8)); } #[test] #[cfg(all(not(miri), feature = "unicode-word-boundary"))] fn look_matches_word_end_half_unicode() { let look = Look::WordEndHalfUnicode; // \xF0\x9D\x9B\x83 = 𝛃 (in \w) // \xF0\x90\x86\x80 = 𐆀 (not in \w) // Simple ASCII word boundaries. assert!(!testlook!(look, "a", 0)); assert!(testlook!(look, "a", 1)); assert!(testlook!(look, "a ", 1)); assert!(!testlook!(look, " a ", 1)); assert!(testlook!(look, " a ", 2)); // Unicode word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃", 0)); assert!(testlook!(look, "𝛃", 4)); assert!(testlook!(look, "𝛃 ", 4)); assert!(!testlook!(look, " 𝛃 ", 1)); assert!(testlook!(look, " 𝛃 ", 5)); // Unicode word boundaries between non-ASCII codepoints. assert!(!testlook!(look, "𝛃𐆀", 0)); assert!(testlook!(look, "𝛃𐆀", 4)); // Non word boundaries for ASCII. assert!(testlook!(look, "", 0)); assert!(!testlook!(look, "ab", 1)); assert!(testlook!(look, "a ", 2)); assert!(testlook!(look, " a ", 0)); assert!(testlook!(look, " a ", 3)); // Non word boundaries with a non-ASCII codepoint. assert!(!testlook!(look, "𝛃b", 4)); assert!(!testlook!(look, "b𝛃", 1)); assert!(testlook!(look, "𝛃 ", 5)); assert!(testlook!(look, " 𝛃 ", 0)); assert!(testlook!(look, " 𝛃 ", 6)); assert!(!testlook!(look, "𝛃", 1)); assert!(!testlook!(look, "𝛃", 2)); assert!(!testlook!(look, "𝛃", 3)); // Non word boundaries with non-ASCII codepoints. 
assert!(!testlook!(look, "𝛃𐆀", 1)); assert!(!testlook!(look, "𝛃𐆀", 2)); assert!(!testlook!(look, "𝛃𐆀", 3)); assert!(!testlook!(look, "𝛃𐆀", 5)); assert!(!testlook!(look, "𝛃𐆀", 6)); assert!(!testlook!(look, "𝛃𐆀", 7)); assert!(testlook!(look, "𝛃𐆀", 8)); } #[test] fn look_set() { let mut f = LookSet::default(); assert!(!f.contains(Look::Start)); assert!(!f.contains(Look::End)); assert!(!f.contains(Look::StartLF)); assert!(!f.contains(Look::EndLF)); assert!(!f.contains(Look::WordUnicode)); assert!(!f.contains(Look::WordUnicodeNegate)); assert!(!f.contains(Look::WordAscii)); assert!(!f.contains(Look::WordAsciiNegate)); f = f.insert(Look::Start); assert!(f.contains(Look::Start)); f = f.remove(Look::Start); assert!(!f.contains(Look::Start)); f = f.insert(Look::End); assert!(f.contains(Look::End)); f = f.remove(Look::End); assert!(!f.contains(Look::End)); f = f.insert(Look::StartLF); assert!(f.contains(Look::StartLF)); f = f.remove(Look::StartLF); assert!(!f.contains(Look::StartLF)); f = f.insert(Look::EndLF); assert!(f.contains(Look::EndLF)); f = f.remove(Look::EndLF); assert!(!f.contains(Look::EndLF)); f = f.insert(Look::StartCRLF); assert!(f.contains(Look::StartCRLF)); f = f.remove(Look::StartCRLF); assert!(!f.contains(Look::StartCRLF)); f = f.insert(Look::EndCRLF); assert!(f.contains(Look::EndCRLF)); f = f.remove(Look::EndCRLF); assert!(!f.contains(Look::EndCRLF)); f = f.insert(Look::WordUnicode); assert!(f.contains(Look::WordUnicode)); f = f.remove(Look::WordUnicode); assert!(!f.contains(Look::WordUnicode)); f = f.insert(Look::WordUnicodeNegate); assert!(f.contains(Look::WordUnicodeNegate)); f = f.remove(Look::WordUnicodeNegate); assert!(!f.contains(Look::WordUnicodeNegate)); f = f.insert(Look::WordAscii); assert!(f.contains(Look::WordAscii)); f = f.remove(Look::WordAscii); assert!(!f.contains(Look::WordAscii)); f = f.insert(Look::WordAsciiNegate); assert!(f.contains(Look::WordAsciiNegate)); f = f.remove(Look::WordAsciiNegate); 
assert!(!f.contains(Look::WordAsciiNegate)); f = f.insert(Look::WordStartAscii); assert!(f.contains(Look::WordStartAscii)); f = f.remove(Look::WordStartAscii); assert!(!f.contains(Look::WordStartAscii)); f = f.insert(Look::WordEndAscii); assert!(f.contains(Look::WordEndAscii)); f = f.remove(Look::WordEndAscii); assert!(!f.contains(Look::WordEndAscii)); f = f.insert(Look::WordStartUnicode); assert!(f.contains(Look::WordStartUnicode)); f = f.remove(Look::WordStartUnicode); assert!(!f.contains(Look::WordStartUnicode)); f = f.insert(Look::WordEndUnicode); assert!(f.contains(Look::WordEndUnicode)); f = f.remove(Look::WordEndUnicode); assert!(!f.contains(Look::WordEndUnicode)); f = f.insert(Look::WordStartHalfAscii); assert!(f.contains(Look::WordStartHalfAscii)); f = f.remove(Look::WordStartHalfAscii); assert!(!f.contains(Look::WordStartHalfAscii)); f = f.insert(Look::WordEndHalfAscii); assert!(f.contains(Look::WordEndHalfAscii)); f = f.remove(Look::WordEndHalfAscii); assert!(!f.contains(Look::WordEndHalfAscii)); f = f.insert(Look::WordStartHalfUnicode); assert!(f.contains(Look::WordStartHalfUnicode)); f = f.remove(Look::WordStartHalfUnicode); assert!(!f.contains(Look::WordStartHalfUnicode)); f = f.insert(Look::WordEndHalfUnicode); assert!(f.contains(Look::WordEndHalfUnicode)); f = f.remove(Look::WordEndHalfUnicode); assert!(!f.contains(Look::WordEndHalfUnicode)); } #[test] fn look_set_iter() { let set = LookSet::empty(); assert_eq!(0, set.iter().count()); let set = LookSet::full(); assert_eq!(18, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF).insert(Look::WordUnicode); assert_eq!(2, set.iter().count()); let set = LookSet::empty().insert(Look::StartLF); assert_eq!(1, set.iter().count()); let set = LookSet::empty().insert(Look::WordAsciiNegate); assert_eq!(1, set.iter().count()); let set = LookSet::empty().insert(Look::WordEndHalfUnicode); assert_eq!(1, set.iter().count()); } #[test] #[cfg(feature = "alloc")] fn look_set_debug() { let res = 
alloc::format!("{:?}", LookSet::empty()); assert_eq!("∅", res); let res = alloc::format!("{:?}", LookSet::full()); assert_eq!("Az^$rRbB𝛃𝚩<>〈〉◁▷◀▶", res); } } regex-automata-0.4.9/src/util/memchr.rs000064400000000000000000000054641046102023000161570ustar 00000000000000/*! This module defines simple wrapper routines for the memchr functions from the `memchr` crate. Basically, when the `memchr` crate is available, we use it, otherwise we use a naive implementation which is still pretty fast. */ pub(crate) use self::inner::*; /** Wrappers that forward directly to the `memchr` crate. Each returns `Option<usize>`; the type argument had been stripped from the text and is restored here. */ #[cfg(feature = "perf-literal-substring")] pub(super) mod inner { #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { memchr::memchr(n1, haystack) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { memchr::memchr2(n1, n2, haystack) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8], ) -> Option<usize> { memchr::memchr3(n1, n2, n3, haystack) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { memchr::memrchr(n1, haystack) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { memchr::memrchr2(n1, n2, haystack) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memrchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8], ) -> Option<usize> { memchr::memrchr3(n1, n2, n3, haystack) } } /** Fallbacks built on `Iterator::position` (forward) and `Iterator::rposition` (reverse) over the haystack bytes, returning the index of the first byte found equal to any needle. */ #[cfg(not(feature = "perf-literal-substring"))] pub(super) mod inner { #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memchr(n1: u8, haystack: &[u8]) -> Option<usize> { haystack.iter().position(|&b| b == n1) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { haystack.iter().position(|&b| b == n1 || b == n2) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate)
fn memchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8], ) -> Option<usize> { haystack.iter().position(|&b| b == n1 || b == n2 || b == n3) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memrchr(n1: u8, haystack: &[u8]) -> Option<usize> { haystack.iter().rposition(|&b| b == n1) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memrchr2(n1: u8, n2: u8, haystack: &[u8]) -> Option<usize> { haystack.iter().rposition(|&b| b == n1 || b == n2) } #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn memrchr3( n1: u8, n2: u8, n3: u8, haystack: &[u8], ) -> Option<usize> { haystack.iter().rposition(|&b| b == n1 || b == n2 || b == n3) } } regex-automata-0.4.9/src/util/mod.rs000064400000000000000000000036531046102023000154610ustar 00000000000000/*! A collection of modules that provide APIs that are useful across many regex engines. While one should explore the sub-modules directly to get a sense of what's there, here are some highlights that tie the sub-modules to higher level use cases: * `alphabet` contains APIs that are useful if you're doing low level things with the DFAs in this crate. For example, implementing determinization or walking its state graph directly. * `captures` contains APIs for dealing with capture group matches and their mapping to "slots" used inside an NFA graph. This is also where you can find iterators over capture group names. * `escape` contains types for pretty-printing raw byte slices as strings. * `iter` contains API helpers for writing regex iterators. * `lazy` contains a no-std and no-alloc variant of `lazy_static!` and `once_cell`. * `look` contains APIs for matching and configuring look-around assertions. * `pool` provides a way to reuse mutable memory allocated in a thread safe manner. * `prefilter` provides APIs for building prefilters and using them in searches. * `primitives` are what you might use if you're doing lower level work on automata, such as walking an NFA state graph.
* `syntax` provides some higher level convenience functions for interacting with the `regex-syntax` crate. * `wire` is useful if you're working with DFA serialization. */ pub mod alphabet; #[cfg(feature = "alloc")] pub mod captures; pub mod escape; #[cfg(feature = "alloc")] pub mod interpolate; pub mod iter; pub mod lazy; pub mod look; #[cfg(feature = "alloc")] pub mod pool; pub mod prefilter; pub mod primitives; pub mod start; #[cfg(feature = "syntax")] pub mod syntax; pub mod wire; #[cfg(any(feature = "dfa-build", feature = "hybrid"))] pub(crate) mod determinize; pub(crate) mod empty; pub(crate) mod int; pub(crate) mod memchr; pub(crate) mod search; #[cfg(feature = "alloc")] pub(crate) mod sparse_set; pub(crate) mod unicode_data; pub(crate) mod utf8; regex-automata-0.4.9/src/util/pool.rs000064400000000000000000001520411046102023000156470ustar 00000000000000// This module provides a relatively simple thread-safe pool of reusable // objects. For the most part, it's implemented by a stack represented by a // Mutex>. It has one small trick: because unlocking a mutex is somewhat // costly, in the case where a pool is accessed by the first thread that tried // to get a value, we bypass the mutex. Here are some benchmarks showing the // difference. // // 2022-10-15: These benchmarks are from the old regex crate and they aren't // easy to reproduce because some rely on older implementations of Pool that // are no longer around. I've left the results here for posterity, but any // enterprising individual should feel encouraged to re-litigate the way Pool // works. I am not at all certain it is the best approach. 
// // 1) misc::anchored_literal_long_non_match 21 (18571 MB/s) // 2) misc::anchored_literal_long_non_match 107 (3644 MB/s) // 3) misc::anchored_literal_long_non_match 45 (8666 MB/s) // 4) misc::anchored_literal_long_non_match 19 (20526 MB/s) // // (1) represents our baseline: the master branch at the time of writing when // using the 'thread_local' crate to implement the pool below. // // (2) represents a naive pool implemented completely via Mutex<Vec<T>>. There // is no special trick for bypassing the mutex. // // (3) is the same as (2), except it uses Mutex<Vec<Box<T>>>. It is twice as // fast because a Box<T> is much smaller than the T we use with a Pool in this // crate. So pushing and popping a Box<T> from a Vec is quite a bit faster // than for T. // // (4) is the same as (3), but with the trick for bypassing the mutex in the // case of the first-to-get thread. // // Why move off of thread_local? Even though (4) is a hair faster than (1) // above, this was not the main goal. The main goal was to move off of // thread_local and find a way to *simply* re-capture some of its speed for // regex's specific case. So again, why move off of it? The *primary* reason is // because of memory leaks. See https://github.com/rust-lang/regex/issues/362 // for example. (Why do I want it to be simple? Well, I suppose what I mean is, // "use as much safe code as possible to minimize risk and be as sure as I can // be that it is correct.") // // My guess is that the thread_local design is probably not appropriate for // regex since its memory usage scales to the number of active threads that // have used a regex, whereas the pool below scales to the number of threads // that simultaneously use a regex. While neither case permits contraction, // since we own the pool data structure below, we can add contraction if a // clear use case pops up in the wild.
More pressingly though, it seems that // there are at least some use case patterns where one might have many threads // sitting around that might have used a regex at one point. While thread_local // does try to reuse space previously used by a thread that has since stopped, // its maximal memory usage still scales with the total number of active // threads. In contrast, the pool below scales with the total number of threads // *simultaneously* using the pool. The hope is that this uses less memory // overall. And if it doesn't, we can hopefully tune it somehow. // // It seems that these sort of conditions happen frequently // in FFI inside of other more "managed" languages. This was // mentioned in the issue linked above, and also mentioned here: // https://github.com/BurntSushi/rure-go/issues/3. And in particular, users // confirm that disabling the use of thread_local resolves the leak. // // There were other weaker reasons for moving off of thread_local as well. // Namely, at the time, I was looking to reduce dependencies. And for something // like regex, maintenance can be simpler when we own the full dependency tree. // // Note that I am not entirely happy with this pool. It has some subtle // implementation details and is overall still observable (even with the // thread owner optimization) in benchmarks. If someone wants to take a crack // at building something better, please file an issue. Even if it means a // different API. The API exposed by this pool is not the minimal thing that // something like a 'Regex' actually needs. It could adapt to, for example, // an API more like what is found in the 'thread_local' crate. However, we do // really need to support the no-std alloc-only context, or else the regex // crate wouldn't be able to support no-std alloc-only. However, I'm generally // okay with making the alloc-only context slower (as it is here), although I // do find it unfortunate. /*! A thread safe memory pool. 
The principal type in this module is a [`Pool`]. Its main use case is for holding a thread safe collection of mutable scratch spaces (usually called `Cache` in this crate) that regex engines need to execute a search. This then permits sharing the same read-only regex object across multiple threads while having a quick way of reusing scratch space in a thread safe way. This avoids needing to re-create the scratch space for every search, which could wind up being quite expensive. */ /// A thread safe pool that works in an `alloc`-only context. /// /// Getting a value out comes with a guard. When that guard is dropped, the /// value is automatically put back in the pool. The guard provides both a /// `Deref` and a `DerefMut` implementation for easy access to an underlying /// `T`. /// /// A `Pool` impls `Sync` when `T` is `Send` (even if `T` is not `Sync`). This /// is possible because a pool is guaranteed to provide a value to exactly one /// thread at any time. /// /// Currently, a pool never contracts in size. Its size is proportional to the /// maximum number of simultaneous uses. This may change in the future. /// /// A `Pool` is a particularly useful data structure for this crate because /// many of the regex engines require a mutable "cache" in order to execute /// a search. Since regexes themselves tend to be global, the problem is then: /// how do you get a mutable cache to execute a search? You could: /// /// 1. Use a `thread_local!`, which requires the standard library and requires /// that the regex pattern be statically known. /// 2. Use a `Pool`. /// 3. Make the cache an explicit dependency in your code and pass it around. /// 4. Put the cache state in a `Mutex`, but this means only one search can /// execute at a time. /// 5. Create a new cache for every search. /// /// A `thread_local!` is perhaps the best choice if it works for your use case. /// Putting the cache in a mutex or creating a new cache for every search are /// perhaps the worst choices.
Of the remaining two choices, whether you use /// this `Pool` or thread through a cache explicitly in your code is a matter /// of taste and depends on your code architecture. /// /// # Warning: may use a spin lock /// /// When this crate is compiled _without_ the `std` feature, then this type /// may use a spin lock internally. This can have subtle effects that may /// be undesirable. See [Spinlocks Considered Harmful][spinharm] for a more /// thorough treatment of this topic. /// /// [spinharm]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html /// /// # Example /// /// This example shows how to share a single hybrid regex among multiple /// threads, while also safely getting exclusive access to a hybrid's /// [`Cache`](crate::hybrid::regex::Cache) without preventing other searches /// from running while your thread uses the `Cache`. /// /// ``` /// use regex_automata::{ /// hybrid::regex::{Cache, Regex}, /// util::{lazy::Lazy, pool::Pool}, /// Match, /// }; /// /// static RE: Lazy = /// Lazy::new(|| Regex::new("foo[0-9]+bar").unwrap()); /// static CACHE: Lazy> = /// Lazy::new(|| Pool::new(|| RE.create_cache())); /// /// let expected = Some(Match::must(0, 3..14)); /// assert_eq!(expected, RE.find(&mut CACHE.get(), b"zzzfoo12345barzzz")); /// ``` pub struct Pool T>(alloc::boxed::Box>); impl Pool { /// Create a new pool. The given closure is used to create values in /// the pool when necessary. pub fn new(create: F) -> Pool { Pool(alloc::boxed::Box::new(inner::Pool::new(create))) } } impl T> Pool { /// Get a value from the pool. The caller is guaranteed to have /// exclusive access to the given value. Namely, it is guaranteed that /// this will never return a value that was returned by another call to /// `get` but was not put back into the pool. /// /// When the guard goes out of scope and its destructor is called, then /// it will automatically be put back into the pool.
Alternatively, /// [`PoolGuard::put`] may be used to explicitly put it back in the pool /// without relying on its destructor. /// /// Note that there is no guarantee provided about which value in the /// pool is returned. That is, calling get, dropping the guard (causing /// the value to go back into the pool) and then calling get again is /// *not* guaranteed to return the same value received in the first `get` /// call. #[inline] pub fn get(&self) -> PoolGuard<'_, T, F> { PoolGuard(self.0.get()) } } impl core::fmt::Debug for Pool { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_tuple("Pool").field(&self.0).finish() } } /// A guard that is returned when a caller requests a value from the pool. /// /// The purpose of the guard is to use RAII to automatically put the value /// back in the pool once it's dropped. pub struct PoolGuard<'a, T: Send, F: Fn() -> T>(inner::PoolGuard<'a, T, F>); impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Consumes this guard and puts it back into the pool. /// /// This circumvents the guard's `Drop` implementation. This can be useful /// in circumstances where the automatic `Drop` results in poorer codegen, /// such as calling non-inlined functions. 
#[inline] pub fn put(this: PoolGuard<'_, T, F>) { inner::PoolGuard::put(this.0); } } impl<'a, T: Send, F: Fn() -> T> core::ops::Deref for PoolGuard<'a, T, F> { type Target = T; #[inline] fn deref(&self) -> &T { self.0.value() } } impl<'a, T: Send, F: Fn() -> T> core::ops::DerefMut for PoolGuard<'a, T, F> { #[inline] fn deref_mut(&mut self) -> &mut T { self.0.value_mut() } } impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug for PoolGuard<'a, T, F> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_tuple("PoolGuard").field(&self.0).finish() } } #[cfg(feature = "std")] mod inner { use core::{ cell::UnsafeCell, panic::{RefUnwindSafe, UnwindSafe}, sync::atomic::{AtomicUsize, Ordering}, }; use alloc::{boxed::Box, vec, vec::Vec}; use std::{sync::Mutex, thread_local}; /// An atomic counter used to allocate thread IDs. /// /// We specifically start our counter at 3 so that we can use the values /// less than it as sentinels. static COUNTER: AtomicUsize = AtomicUsize::new(3); /// A thread ID indicating that there is no owner. This is the initial /// state of a pool. Once a pool has an owner, there is no way to change /// it. static THREAD_ID_UNOWNED: usize = 0; /// A thread ID indicating that the special owner value is in use and not /// available. This state is useful for avoiding a case where the owner /// of a pool calls `get` before putting the result of a previous `get` /// call back into the pool. static THREAD_ID_INUSE: usize = 1; /// This sentinel is used to indicate that a guard has already been dropped /// and should not be re-dropped. We use this because our drop code can be /// called outside of Drop and thus there could be a bug in the internal /// implementation that results in trying to put the same guard back into /// the same pool multiple times, and *that* could result in UB if we /// didn't mark the guard as already having been put back in the pool. 
/// /// So this isn't strictly necessary, but this lets us define some /// routines as safe (like PoolGuard::put_imp) that we couldn't otherwise /// do. static THREAD_ID_DROPPED: usize = 2; /// The number of stacks we use inside of the pool. These are only used for /// non-owners. That is, these represent the "slow" path. /// /// In the original implementation of this pool, we only used a single /// stack. While this might be okay for a couple threads, the prevalence of /// 32, 64 and even 128 core CPUs has made it untenable. The contention /// such an environment introduces when threads are doing a lot of searches /// on short haystacks (a not uncommon use case) is palpable and leads to /// huge slowdowns. /// /// This constant reflects a change from using one stack to the number of /// stacks that this constant is set to. The stack for a particular thread /// is simply chosen by `thread_id % MAX_POOL_STACKS`. The idea behind /// this setup is that there should be a good chance that accesses to the /// pool will be distributed over several stacks instead of all of them /// converging to one. /// /// This is not a particularly smart or dynamic strategy. Fixing this to a /// specific number has at least two downsides. First is that it will help, /// say, an 8 core CPU more than it will a 128 core CPU. (But, crucially, /// it will still help the 128 core case.) Second is that this may wind /// up being a little wasteful with respect to memory usage. Namely, if a /// regex is used on one thread and then moved to another thread, then it /// could result in creating a new copy of the data in the pool even though /// only one is actually needed. /// /// And that memory usage bit is why this is set to 8 and not, say, 64. /// Keeping it at 8 limits, to an extent, how much unnecessary memory can /// be allocated. /// /// In an ideal world, we'd be able to have something like this: /// /// * Grow the number of stacks as the number of concurrent callers /// increases.
I spent a little time trying this, but even just adding an /// atomic addition/subtraction for each pop/push for tracking concurrent /// callers led to a big perf hit. Since even more work would seemingly be /// required than just an addition/subtraction, I abandoned this approach. /// * The maximum amount of memory used should scale with respect to the /// number of concurrent callers and *not* the total number of existing /// threads. This is primarily why the `thread_local` crate isn't used, as /// some environments spin up a lot of threads. This led to multiple /// reports of extremely high memory usage (often described as memory /// leaks). /// * Even more ideally, the pool should contract in size. That is, it /// should grow with bursts and then shrink. But this is a pretty thorny /// issue to tackle and it might be better to just not. /// * It would be nice to explore the use of, say, a lock-free stack /// instead of using a mutex to guard a `Vec` that is ultimately just /// treated as a stack. The main thing preventing me from exploring this /// is the ABA problem. The `crossbeam` crate has tools for dealing with /// this sort of problem (via its epoch based memory reclamation strategy), /// but I can't justify bringing in all of `crossbeam` as a dependency of /// `regex` for this. /// /// See this issue for more context and discussion: /// https://github.com/rust-lang/regex/issues/934 const MAX_POOL_STACKS: usize = 8; thread_local!( /// A thread local used to assign an ID to a thread. static THREAD_ID: usize = { let next = COUNTER.fetch_add(1, Ordering::Relaxed); // SAFETY: We cannot permit the reuse of thread IDs since reusing a // thread ID might result in more than one thread "owning" a pool, // and thus, permit accessing a mutable value from multiple threads // simultaneously without synchronization. The intent of this panic // is to be a sanity check. It is not expected that the thread ID // space will actually be exhausted in practice.
Even on a 32-bit // system, it would require spawning 2^32 threads (although they // wouldn't all need to run simultaneously, so it is in theory // possible). // // This checks that the counter never wraps around, since atomic // addition wraps around on overflow. if next == 0 { panic!("regex: thread ID allocation space exhausted"); } next }; ); /// This puts each stack in the pool below into its own cache line. This is /// an absolutely critical optimization that tends to have the most impact /// in high contention workloads. Without forcing each mutex protected /// stack into its own cache line, high contention exacerbates the performance /// problem by causing "false sharing." By putting each mutex in its own /// cache-line, we avoid the false sharing problem and the effects of /// contention are greatly reduced. #[derive(Debug)] #[repr(C, align(64))] struct CacheLine(T); /// A thread safe pool utilizing std-only features. /// /// The main difference between this and the simplistic alloc-only pool is /// the use of std::sync::Mutex and an "owner thread" optimization that /// makes accesses by the owner of a pool faster than all other threads. /// This makes the common case of running a regex within a single thread /// faster by avoiding mutex unlocking. pub(super) struct Pool { /// A function to create more T values when stack is empty and a caller /// has requested a T. create: F, /// Multiple stacks of T values to hand out. These are used when a Pool /// is accessed by a thread that didn't create it. /// /// Conceptually this is `Mutex<Vec<Box<T>>>`, but sharded out to make /// it scale better under high contention work-loads. We index into /// this sequence via `thread_id % stacks.len()`. stacks: Vec>>>>, /// The ID of the thread that owns this pool. The owner is the thread /// that makes the first call to 'get'. When the owner calls 'get', it /// gets 'owner_val' directly instead of returning a T from 'stack'.
/// See comments elsewhere for details, but this is intended to be an /// optimization for the common case that makes getting a T faster. /// /// It is initialized to a value of zero (an impossible thread ID) as a /// sentinel to indicate that it is unowned. owner: AtomicUsize, /// A value to return when the caller is in the same thread that /// first called `Pool::get`. /// /// This is set to None when a Pool is first created, and set to Some /// once the first thread calls Pool::get. owner_val: UnsafeCell>, } // SAFETY: Since we want to use a Pool from multiple threads simultaneously // behind an Arc, we need for it to be Sync. In cases where T is sync, // Pool would be Sync. However, since we use a Pool to store mutable // scratch space, we wind up using a T that has interior mutability and is // thus itself not Sync. So what we *really* want is for our Pool to be // Sync even when T is not Sync (but is at least Send). // // The only non-sync aspect of a Pool is its 'owner_val' field, which is // used to implement faster access to a pool value in the common case of // a pool being accessed in the same thread in which it was created. The // 'stack' field is also shared, but a Mutex<T> where T: Send is already // Sync. So we only need to worry about 'owner_val'. // // The key is to guarantee that 'owner_val' can only ever be accessed from // one thread. In our implementation below, we guarantee this by only // returning the 'owner_val' when the ID of the current thread matches the // ID of the thread that first called 'Pool::get'. Since this can only ever // be one thread, it follows that only one thread can access 'owner_val' at // any point in time. Thus, it is safe to declare that Pool is Sync when // T is Send. // // If there is a way to achieve our performance goals using safe code, then // I would very much welcome a patch. As it stands, the implementation // below tries to balance safety with performance.
The case where a Regex // is used from multiple threads simultaneously will suffer a bit since // getting a value out of the pool will require unlocking a mutex. // // We require `F: Send + Sync` because we call `F` at any point on demand, // potentially from multiple threads simultaneously. unsafe impl Sync for Pool {} // If T is UnwindSafe, then since we provide exclusive access to any // particular value in the pool, the pool should therefore also be // considered UnwindSafe. // // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any // point on demand, so it needs to be unwind safe on both dimensions for // the entire Pool to be unwind safe. impl UnwindSafe for Pool {} // If T is UnwindSafe, then since we provide exclusive access to any // particular value in the pool, the pool should therefore also be // considered RefUnwindSafe. // // We require `F: UnwindSafe + RefUnwindSafe` because we call `F` at any // point on demand, so it needs to be unwind safe on both dimensions for // the entire Pool to be unwind safe. impl RefUnwindSafe for Pool { } impl Pool { /// Create a new pool. The given closure is used to create values in /// the pool when necessary. pub(super) fn new(create: F) -> Pool { // FIXME: Now that we require 1.65+, Mutex::new is available as // const... So we can almost mark this function as const. But of // course, we're creating a Vec of stacks below (we didn't when I // originally wrote this code). It seems like the best way to work // around this would be to use a `[Stack; MAX_POOL_STACKS]` instead // of a `Vec`. I refrained from making this change at time // of writing (2023/10/08) because I was making a lot of other // changes at the same time and wanted to do this more carefully. // Namely, because of the cache line optimization, that `[Stack; // MAX_POOL_STACKS]` would be quite big. It's unclear how bad (if // at all) that would be. // // Another choice would be to lazily allocate the stacks, but... 
// I'm not so sure about that. Seems like a fair bit of complexity? // // Maybe there's a simple solution I'm missing. // // ... OK, I tried to fix this. First, I did it by putting `stacks` // in an `UnsafeCell` and using a `Once` to lazily initialize it. // I benchmarked it and everything looked okay. I then made this // function `const` and thought I was just about done. But the // public pool type wraps its inner pool in a `Box` to keep its // size down. Blech. // // So then I thought that I could push the box down into this // type (and leave the non-std version unboxed) and use the same // `UnsafeCell` technique to lazily initialize it. This has the // downside of the `Once` now needing to get hit in the owner fast // path, but maybe that's OK? However, I then realized that we can // only lazily initialize `stacks`, `owner` and `owner_val`. The // `create` function needs to be put somewhere outside of the box. // So now the pool is a `Box`, `Once` and a function. Now we're // starting to defeat the point of boxing in the first place. So I // backed out that change too. // // Back to square one. Maybe we just don't make a pool's // constructor const and live with it. It's probably not a huge // deal. let mut stacks = Vec::with_capacity(MAX_POOL_STACKS); for _ in 0..stacks.capacity() { stacks.push(CacheLine(Mutex::new(vec![]))); } let owner = AtomicUsize::new(THREAD_ID_UNOWNED); let owner_val = UnsafeCell::new(None); // init'd on first access Pool { create, stacks, owner, owner_val } } } impl T> Pool { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. #[inline] pub(super) fn get(&self) -> PoolGuard<'_, T, F> { // Our fast path checks if the caller is the thread that "owns" // this pool. Or stated differently, whether it is the first thread // that tried to extract a value from the pool. If it is, then we // can return a T to the caller without going through a mutex.
// // SAFETY: We must guarantee that only one thread gets access // to this value. Since a thread is uniquely identified by the // THREAD_ID thread local, it follows that if the caller's thread // ID is equal to the owner, then only one thread may receive this // value. This is also why we can get away with what looks like a // racy load and a store. We know that if 'owner == caller', then // only one thread can be here, so we don't need to worry about any // other thread setting the owner to something else. let caller = THREAD_ID.with(|id| *id); let owner = self.owner.load(Ordering::Acquire); if caller == owner { // N.B. We could also do a CAS here instead of a load/store, // but ad hoc benchmarking suggests it is slower. And a lot // slower in the case where `get_slow` is common. self.owner.store(THREAD_ID_INUSE, Ordering::Release); return self.guard_owned(caller); } self.get_slow(caller, owner) } /// This is the "slow" version that goes through a mutex to pop an /// allocated value off a stack to return to the caller. (Or, if the /// stack is empty, a new value is created.) /// /// If the pool has no owner, then this will set the owner. #[cold] fn get_slow( &self, caller: usize, owner: usize, ) -> PoolGuard<'_, T, F> { if owner == THREAD_ID_UNOWNED { // This sentinel means this pool is not yet owned. We try to // atomically set the owner. If we do, then this thread becomes // the owner and we can return a guard that represents the // special T for the owner. // // Note that we set the owner to a different sentinel that // indicates that the owned value is in use. The owner ID will // get updated to the actual ID of this thread once the guard // returned by this function is put back into the pool. let res = self.owner.compare_exchange( THREAD_ID_UNOWNED, THREAD_ID_INUSE, Ordering::AcqRel, Ordering::Acquire, ); if res.is_ok() { // SAFETY: A successful CAS above implies this thread is // the owner and that this is the only such thread that // can reach here. 
Thus, there is no data race. unsafe { *self.owner_val.get() = Some((self.create)()); } return self.guard_owned(caller); } } let stack_id = caller % self.stacks.len(); // We try to acquire exclusive access to this thread's stack, and // if so, grab a value from it if we can. We put this in a loop so // that it's easy to tweak and experiment with a different number // of tries. In the end, I couldn't see anything obviously better // than one attempt in ad hoc testing. for _ in 0..1 { let mut stack = match self.stacks[stack_id].0.try_lock() { Err(_) => continue, Ok(stack) => stack, }; if let Some(value) = stack.pop() { return self.guard_stack(value); } // Unlock the mutex guarding the stack before creating a fresh // value since we no longer need the stack. drop(stack); let value = Box::new((self.create)()); return self.guard_stack(value); } // We're only here if we could not get access to our stack, so just // create a new value. This seems like it could be wasteful, but // waiting for exclusive access to a stack when there's high // contention is brutal for perf. self.guard_stack_transient(Box::new((self.create)())) } /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. #[inline] fn put_value(&self, value: Box) { let caller = THREAD_ID.with(|id| *id); let stack_id = caller % self.stacks.len(); // As with trying to pop a value from this thread's stack, we // merely attempt to get access to push this value back on the // stack. If there's too much contention, we just give up and throw // the value away. // // Interestingly, in ad hoc benchmarking, it is beneficial to // attempt to push the value back more than once, unlike when // popping the value. I don't have a good theory for why this is. // I guess if we drop too many values then that winds up forcing // the pop operation to create new fresh values and thus leads to // less reuse.
There's definitely a balancing act here. for _ in 0..10 { let mut stack = match self.stacks[stack_id].0.try_lock() { Err(_) => continue, Ok(stack) => stack, }; stack.push(value); return; } } /// Create a guard that represents the special owned T. #[inline] fn guard_owned(&self, caller: usize) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Err(caller), discard: false } } /// Create a guard that contains a value from the pool's stack. #[inline] fn guard_stack(&self, value: Box) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Ok(value), discard: false } } /// Create a guard that contains a value from the pool's stack with an /// instruction to throw away the value instead of putting it back /// into the pool. #[inline] fn guard_stack_transient(&self, value: Box) -> PoolGuard<'_, T, F> { PoolGuard { pool: self, value: Ok(value), discard: true } } } impl core::fmt::Debug for Pool { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("Pool") .field("stacks", &self.stacks) .field("owner", &self.owner) .field("owner_val", &self.owner_val) .finish() } } /// A guard that is returned when a caller requests a value from the pool. pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> { /// The pool that this guard is attached to. pool: &'a Pool, /// This is Err when the guard represents the special "owned" value. /// In which case, the value is retrieved from 'pool.owner_val'. And /// in the special case of `Err(THREAD_ID_DROPPED)`, it means the /// guard has been put back into the pool and should no longer be used. value: Result, usize>, /// When true, the value should be discarded instead of being pushed /// back into the pool. We tend to use this under high contention, and /// this allows us to avoid inflating the size of the pool. (Because /// under contention, we tend to create more values instead of waiting /// for access to a stack of existing values.) 
discard: bool, } impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Return the underlying value. #[inline] pub(super) fn value(&self) -> &T { match self.value { Ok(ref v) => &**v, // SAFETY: This is safe because the only way a PoolGuard gets // created for self.value=Err is when the current thread // corresponds to the owning thread, of which there can only // be one. Thus, we are guaranteed to be providing exclusive // access here which makes this safe. // // Also, since 'owner_val' is guaranteed to be initialized // before an owned PoolGuard is created, the unchecked unwrap // is safe. Err(id) => unsafe { // This assert is *not* necessary for safety, since we // should never be here if the guard had been put back into // the pool. This is a sanity check to make sure we didn't // break an internal invariant. debug_assert_ne!(THREAD_ID_DROPPED, id); (*self.pool.owner_val.get()).as_ref().unwrap_unchecked() }, } } /// Return the underlying value as a mutable borrow. #[inline] pub(super) fn value_mut(&mut self) -> &mut T { match self.value { Ok(ref mut v) => &mut **v, // SAFETY: This is safe because the only way a PoolGuard gets // created for self.value=Err is when the current thread // corresponds to the owning thread, of which there can only // be one. Thus, we are guaranteed to be providing exclusive // access here which makes this safe. // // Also, since 'owner_val' is guaranteed to be initialized // before an owned PoolGuard is created, the unwrap_unchecked // is safe. Err(id) => unsafe { // This assert is *not* necessary for safety, since we // should never be here if the guard had been put back into // the pool. This is a sanity check to make sure we didn't // break an internal invariant. debug_assert_ne!(THREAD_ID_DROPPED, id); (*self.pool.owner_val.get()).as_mut().unwrap_unchecked() }, } } /// Consumes this guard and puts it back into the pool.
#[inline] pub(super) fn put(this: PoolGuard<'_, T, F>) { // Since this is effectively consuming the guard and putting the // value back into the pool, there's no reason to run its Drop // impl after doing this. I don't believe there is a correctness // problem with doing so, but there's definitely a perf problem // by redoing this work. So we avoid it. let mut this = core::mem::ManuallyDrop::new(this); this.put_imp(); } /// Puts this guard back into the pool by only borrowing the guard as /// mutable. This should be called at most once. #[inline(always)] fn put_imp(&mut self) { match core::mem::replace(&mut self.value, Err(THREAD_ID_DROPPED)) { Ok(value) => { // If we were told to discard this value then don't bother // trying to put it back into the pool. This occurs when // the pop operation failed to acquire a lock and we // decided to create a new value in lieu of contending for // the lock. if self.discard { return; } self.pool.put_value(value); } // If this guard has a value "owned" by the thread, then // the Pool guarantees that this is the ONLY such guard. // Therefore, in order to place it back into the pool and make // it available, we need to change the owner back to the owning // thread's ID. But note that we use the ID that was stored in // the guard, since a guard can be moved to another thread and // dropped. (A previous iteration of this code read from the // THREAD_ID thread local, which uses the ID of the current // thread which may not be the ID of the owning thread! This // also avoids the TLS access, which is likely a hair faster.) Err(owner) => { // If we hit this point, it implies 'put_imp' has been // called multiple times for the same guard which in turn // corresponds to a bug in this implementation. 
assert_ne!(THREAD_ID_DROPPED, owner); self.pool.owner.store(owner, Ordering::Release); } } } } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { #[inline] fn drop(&mut self) { self.put_imp(); } } impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug for PoolGuard<'a, T, F> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_struct("PoolGuard") .field("pool", &self.pool) .field("value", &self.value) .finish() } } } // FUTURE: We should consider using Mara Bos's nearly-lock-free version of this // here: https://gist.github.com/m-ou-se/5fdcbdf7dcf4585199ce2de697f367a4. // // One reason why I did things with a "mutex" below is that it isolates the // safety concerns to just the Mutex, whereas the safety of Mara's pool is a // bit more sprawling. I also expect this code to not be used that much, and // so is unlikely to get as much real world usage with which to test it. That // means the "obviously correct" lever is an important one. // // The specific reason to use Mara's pool is that it is likely faster and also // less likely to hit problems with spin-locks, although it is not completely // impervious to them. // // The best solution to this problem, probably, is a truly lock free pool. That // could be done with a lock free linked list. The issue is the ABA problem. It // is difficult to avoid, and doing so is complex. BUT, the upshot of that is // that if we had a truly lock free pool, then we could also use it above in // the 'std' pool instead of a Mutex because it should be completely free of the // problems that come from spin-locks. #[cfg(not(feature = "std"))] mod inner { use core::{ cell::UnsafeCell, panic::{RefUnwindSafe, UnwindSafe}, sync::atomic::{AtomicBool, Ordering}, }; use alloc::{boxed::Box, vec, vec::Vec}; /// A thread safe pool utilizing alloc-only features. /// /// Unlike the std version, it doesn't seem possible(?)
to implement the /// "thread owner" optimization because alloc-only doesn't have any concept /// of threads. So the best we can do is just a normal stack. This will /// increase latency in alloc-only environments. pub(super) struct Pool { /// A stack of T values to hand out. These are used when a Pool is /// accessed by a thread that didn't create it. stack: Mutex>>, /// A function to create more T values when stack is empty and a caller /// has requested a T. create: F, } // If T is UnwindSafe, then since we provide exclusive access to any // particular value in the pool, it should therefore also be considered // RefUnwindSafe. impl RefUnwindSafe for Pool {} impl Pool { /// Create a new pool. The given closure is used to create values in /// the pool when necessary. pub(super) const fn new(create: F) -> Pool { Pool { stack: Mutex::new(vec![]), create } } } impl T> Pool { /// Get a value from the pool. This may block if another thread is also /// attempting to retrieve a value from the pool. #[inline] pub(super) fn get(&self) -> PoolGuard<'_, T, F> { let mut stack = self.stack.lock(); let value = match stack.pop() { None => Box::new((self.create)()), Some(value) => value, }; PoolGuard { pool: self, value: Some(value) } } #[inline] fn put(&self, guard: PoolGuard<'_, T, F>) { let mut guard = core::mem::ManuallyDrop::new(guard); if let Some(value) = guard.value.take() { self.put_value(value); } } /// Puts a value back into the pool. Callers don't need to call this. /// Once the guard that's returned by 'get' is dropped, it is put back /// into the pool automatically. #[inline] fn put_value(&self, value: Box) { let mut stack = self.stack.lock(); stack.push(value); } } impl core::fmt::Debug for Pool { fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result { f.debug_struct("Pool").field("stack", &self.stack).finish() } } /// A guard that is returned when a caller requests a value from the pool. 
pub(super) struct PoolGuard<'a, T: Send, F: Fn() -> T> { /// The pool that this guard is attached to. pool: &'a Pool, /// This is None after the guard has been put back into the pool. value: Option>, } impl<'a, T: Send, F: Fn() -> T> PoolGuard<'a, T, F> { /// Return the underlying value. #[inline] pub(super) fn value(&self) -> &T { self.value.as_deref().unwrap() } /// Return the underlying value as a mutable borrow. #[inline] pub(super) fn value_mut(&mut self) -> &mut T { self.value.as_deref_mut().unwrap() } /// Consumes this guard and puts it back into the pool. #[inline] pub(super) fn put(this: PoolGuard<'_, T, F>) { // Since this is effectively consuming the guard and putting the // value back into the pool, there's no reason to run its Drop // impl after doing this. I don't believe there is a correctness // problem with doing so, but there's definitely a perf problem // by redoing this work. So we avoid it. let mut this = core::mem::ManuallyDrop::new(this); this.put_imp(); } /// Puts this guard back into the pool by only borrowing the guard as /// mutable. This should be called at most once. #[inline(always)] fn put_imp(&mut self) { if let Some(value) = self.value.take() { self.pool.put_value(value); } } } impl<'a, T: Send, F: Fn() -> T> Drop for PoolGuard<'a, T, F> { #[inline] fn drop(&mut self) { self.put_imp(); } } impl<'a, T: Send + core::fmt::Debug, F: Fn() -> T> core::fmt::Debug for PoolGuard<'a, T, F> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_struct("PoolGuard") .field("pool", &self.pool) .field("value", &self.value) .finish() } } /// A spin-lock based mutex. Yes, I have read spinlocks considered /// harmful[1], and if there's a reasonable alternative choice, I'll /// happily take it. /// /// I suspect the most likely alternative here is a Treiber stack, but /// implementing one correctly in a way that avoids the ABA problem looks /// subtle enough that I'm not sure I want to attempt that.
But otherwise, /// we only need a mutex in order to implement our pool, so if there's /// something simpler we can use that works for our `Pool` use case, then /// that would be great. /// /// Note that this mutex does not do poisoning. /// /// [1]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html #[derive(Debug)] struct Mutex { locked: AtomicBool, data: UnsafeCell, } // SAFETY: Since a Mutex guarantees exclusive access, as long as we can // send it across threads, it must also be Sync. unsafe impl Sync for Mutex {} impl Mutex { /// Create a new mutex for protecting access to the given value across /// multiple threads simultaneously. const fn new(value: T) -> Mutex { Mutex { locked: AtomicBool::new(false), data: UnsafeCell::new(value), } } /// Lock this mutex and return a guard providing exclusive access to /// `T`. This blocks if some other thread has already locked this /// mutex. #[inline] fn lock(&self) -> MutexGuard<'_, T> { while self .locked .compare_exchange( false, true, Ordering::AcqRel, Ordering::Acquire, ) .is_err() { core::hint::spin_loop(); } // SAFETY: The only way we're here is if we successfully set // 'locked' to true, which implies we must be the only thread here // and thus have exclusive access to 'data'. let data = unsafe { &mut *self.data.get() }; MutexGuard { locked: &self.locked, data } } } /// A guard that derefs to &T and &mut T. When it's dropped, the lock is /// released. #[derive(Debug)] struct MutexGuard<'a, T> { locked: &'a AtomicBool, data: &'a mut T, } impl<'a, T> core::ops::Deref for MutexGuard<'a, T> { type Target = T; #[inline] fn deref(&self) -> &T { self.data } } impl<'a, T> core::ops::DerefMut for MutexGuard<'a, T> { #[inline] fn deref_mut(&mut self) -> &mut T { self.data } } impl<'a, T> Drop for MutexGuard<'a, T> { #[inline] fn drop(&mut self) { // Drop means 'data' is no longer accessible, so we can unlock // the mutex. 
self.locked.store(false, Ordering::Release); } } } #[cfg(test)] mod tests { use core::panic::{RefUnwindSafe, UnwindSafe}; use alloc::{boxed::Box, vec, vec::Vec}; use super::*; #[test] fn oibits() { fn assert_oitbits() {} assert_oitbits::>>(); assert_oitbits::>>>(); assert_oitbits::< Pool< Vec, Box< dyn Fn() -> Vec + Send + Sync + UnwindSafe + RefUnwindSafe, >, >, >(); } // Tests that Pool implements the "single owner" optimization. That is, the // thread that first accesses the pool gets its own copy, while all other // threads get distinct copies. #[cfg(feature = "std")] #[test] fn thread_owner_optimization() { use std::{cell::RefCell, sync::Arc, vec}; let pool: Arc>>> = Arc::new(Pool::new(|| RefCell::new(vec!['a']))); pool.get().borrow_mut().push('x'); let pool1 = pool.clone(); let t1 = std::thread::spawn(move || { let guard = pool1.get(); guard.borrow_mut().push('y'); }); let pool2 = pool.clone(); let t2 = std::thread::spawn(move || { let guard = pool2.get(); guard.borrow_mut().push('z'); }); t1.join().unwrap(); t2.join().unwrap(); // If we didn't implement the single owner optimization, then one of // the threads above is likely to have mutated the [a, x] vec that // we stuffed in the pool before spawning the threads. But since // neither thread was first to access the pool, and because of the // optimization, we should be guaranteed that neither thread mutates // the special owned pool value. // // (Technically this is an implementation detail and not a contract of // Pool's API.) assert_eq!(vec!['a', 'x'], *pool.get().borrow()); } // This tests that if the "owner" of a pool asks for two values, then it // gets two distinct values and not the same one. This test failed in the // course of developing the pool, which in turn resulted in UB because it // permitted getting aliasing &mut borrows to the same place in memory. 
#[test] fn thread_owner_distinct() { let pool = Pool::new(|| vec!['a']); { let mut g1 = pool.get(); let v1 = &mut *g1; let mut g2 = pool.get(); let v2 = &mut *g2; v1.push('b'); v2.push('c'); assert_eq!(&mut vec!['a', 'b'], v1); assert_eq!(&mut vec!['a', 'c'], v2); } // This isn't technically guaranteed, but we // expect to now get the "owned" value (the first // call to 'get()' above) now that it's back in // the pool. assert_eq!(&mut vec!['a', 'b'], &mut *pool.get()); } // This tests that we can share a guard with another thread, mutate the // underlying value and everything works. This failed in the course of // developing a pool since the pool permitted 'get()' to return the same // value to the owner thread, even before the previous value was put back // into the pool. This in turn resulted in this test producing a data race. #[cfg(feature = "std")] #[test] fn thread_owner_sync() { let pool = Pool::new(|| vec!['a']); { let mut g1 = pool.get(); let mut g2 = pool.get(); std::thread::scope(|s| { s.spawn(|| { g1.push('b'); }); s.spawn(|| { g2.push('c'); }); }); let v1 = &mut *g1; let v2 = &mut *g2; assert_eq!(&mut vec!['a', 'b'], v1); assert_eq!(&mut vec!['a', 'c'], v2); } // This isn't technically guaranteed, but we // expect to now get the "owned" value (the first // call to 'get()' above) now that it's back in // the pool. assert_eq!(&mut vec!['a', 'b'], &mut *pool.get()); } // This tests that if we move a PoolGuard that is owned by the current // thread to another thread and drop it, then the thread owner doesn't // change. During development of the pool, this test failed because the // PoolGuard assumed it was dropped in the same thread from which it was // created, and thus used the current thread's ID as the owner, which could // be different than the actual owner of the pool. #[cfg(feature = "std")] #[test] fn thread_owner_send_drop() { let pool = Pool::new(|| vec!['a']); // Establishes this thread as the owner. 
{ pool.get().push('b'); } std::thread::scope(|s| { // Sanity check that we get the same value back. // (Not technically guaranteed.) let mut g = pool.get(); assert_eq!(&vec!['a', 'b'], &*g); // Now push it to another thread and drop it. s.spawn(move || { g.push('c'); }) .join() .unwrap(); }); // Now check that we're still the owner. This is not technically // guaranteed by the API, but is true in practice given the thread // owner optimization. assert_eq!(&vec!['a', 'b', 'c'], &*pool.get()); } } regex-automata-0.4.9/src/util/prefilter/aho_corasick.rs000064400000000000000000000137471046102023000213300ustar 00000000000000use crate::util::{ prefilter::PrefilterI, search::{MatchKind, Span}, }; #[derive(Clone, Debug)] pub(crate) struct AhoCorasick { #[cfg(not(feature = "perf-literal-multisubstring"))] _unused: (), #[cfg(feature = "perf-literal-multisubstring")] ac: aho_corasick::AhoCorasick, } impl AhoCorasick { pub(crate) fn new>( kind: MatchKind, needles: &[B], ) -> Option { #[cfg(not(feature = "perf-literal-multisubstring"))] { None } #[cfg(feature = "perf-literal-multisubstring")] { // We used to use `aho_corasick::MatchKind::Standard` here when // `kind` was `MatchKind::All`, but this is not correct. The // "standard" Aho-Corasick match semantics are to report a match // immediately as soon as it is seen, but `All` isn't like that. // In particular, with "standard" semantics, given the needles // "abc" and "b" and the haystack "abc," it would report a match // at offset 1 before a match at offset 0. This is never what we // want in the context of the regex engine, regardless of whether // we have leftmost-first or 'all' semantics. Namely, we always // want the leftmost match. let ac_match_kind = match kind { MatchKind::LeftmostFirst | MatchKind::All => { aho_corasick::MatchKind::LeftmostFirst } }; // This is kind of just an arbitrary number, but basically, if we // have a small enough set of literals, then we try to use the VERY // memory hungry DFA. 
Otherwise, we wimp out and use an NFA. The // upshot is that the NFA is quite lean and decently fast. Faster // than a naive Aho-Corasick NFA anyway. let ac_kind = if needles.len() <= 500 { aho_corasick::AhoCorasickKind::DFA } else { aho_corasick::AhoCorasickKind::ContiguousNFA }; let result = aho_corasick::AhoCorasick::builder() .kind(Some(ac_kind)) .match_kind(ac_match_kind) .start_kind(aho_corasick::StartKind::Both) // We try to handle all of the prefilter cases in the super // module, and only use Aho-Corasick for the actual automaton. // The aho-corasick crate does have some extra prefilters, // namely, looking for rare bytes to feed to memchr{,2,3} // instead of just the first byte. If we end up wanting // those---and they are somewhat tricky to implement---then // we could port them to this crate. // // The main reason for doing things this way is so we have a // complete and easy to understand picture of which prefilters // are available and how they work. Otherwise it seems too // easy to get into a situation where we have a prefilter // layered on top of prefilter, and that might have unintended // consequences.
.prefilter(false) .build(needles); let ac = match result { Ok(ac) => ac, Err(_err) => { debug!("aho-corasick prefilter failed to build: {}", _err); return None; } }; Some(AhoCorasick { ac }) } } } impl PrefilterI for AhoCorasick { fn find(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(feature = "perf-literal-multisubstring"))] { unreachable!() } #[cfg(feature = "perf-literal-multisubstring")] { let input = aho_corasick::Input::new(haystack).span(span.start..span.end); self.ac .find(input) .map(|m| Span { start: m.start(), end: m.end() }) } } fn prefix(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(feature = "perf-literal-multisubstring"))] { unreachable!() } #[cfg(feature = "perf-literal-multisubstring")] { let input = aho_corasick::Input::new(haystack) .anchored(aho_corasick::Anchored::Yes) .span(span.start..span.end); self.ac .find(input) .map(|m| Span { start: m.start(), end: m.end() }) } } fn memory_usage(&self) -> usize { #[cfg(not(feature = "perf-literal-multisubstring"))] { unreachable!() } #[cfg(feature = "perf-literal-multisubstring")] { self.ac.memory_usage() } } fn is_fast(&self) -> bool { #[cfg(not(feature = "perf-literal-multisubstring"))] { unreachable!() } #[cfg(feature = "perf-literal-multisubstring")] { // Aho-Corasick is never considered "fast" because it's never // going to be even close to an order of magnitude faster than the // regex engine itself (assuming a DFA is used). In fact, it is // usually slower. The magic of Aho-Corasick is that it can search // a *large* number of literals with a relatively small amount of // memory. The regex engines are far more wasteful. // // Aho-Corasick may be "fast" when the regex engine corresponds // to, say, the PikeVM. That happens when the lazy DFA couldn't be // built or used for some reason. But in these cases, the regex // itself is likely quite big and we're probably hosed no matter // what we do. 
(In this case, the best bet is for the caller to // increase some of the memory limits on the hybrid cache capacity // and hope that's enough.) false } } } regex-automata-0.4.9/src/util/prefilter/byteset.rs000064400000000000000000000026461046102023000203560ustar 00000000000000use crate::util::{ prefilter::PrefilterI, search::{MatchKind, Span}, }; #[derive(Clone, Debug)] pub(crate) struct ByteSet([bool; 256]); impl ByteSet { pub(crate) fn new>( _kind: MatchKind, needles: &[B], ) -> Option { #[cfg(not(feature = "perf-literal-multisubstring"))] { None } #[cfg(feature = "perf-literal-multisubstring")] { let mut set = [false; 256]; for needle in needles.iter() { let needle = needle.as_ref(); if needle.len() != 1 { return None; } set[usize::from(needle[0])] = true; } Some(ByteSet(set)) } } } impl PrefilterI for ByteSet { fn find(&self, haystack: &[u8], span: Span) -> Option { haystack[span].iter().position(|&b| self.0[usize::from(b)]).map(|i| { let start = span.start + i; let end = start + 1; Span { start, end } }) } fn prefix(&self, haystack: &[u8], span: Span) -> Option { let b = *haystack.get(span.start)?; if self.0[usize::from(b)] { Some(Span { start: span.start, end: span.start + 1 }) } else { None } } fn memory_usage(&self) -> usize { 0 } fn is_fast(&self) -> bool { false } } regex-automata-0.4.9/src/util/prefilter/memchr.rs000064400000000000000000000110221046102023000201360ustar 00000000000000use crate::util::{ prefilter::PrefilterI, search::{MatchKind, Span}, }; #[derive(Clone, Debug)] pub(crate) struct Memchr(u8); impl Memchr { pub(crate) fn new>( _kind: MatchKind, needles: &[B], ) -> Option { #[cfg(not(feature = "perf-literal-substring"))] { None } #[cfg(feature = "perf-literal-substring")] { if needles.len() != 1 { return None; } if needles[0].as_ref().len() != 1 { return None; } Some(Memchr(needles[0].as_ref()[0])) } } } impl PrefilterI for Memchr { fn find(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(feature = "perf-literal-substring"))] { 
unreachable!() } #[cfg(feature = "perf-literal-substring")] { memchr::memchr(self.0, &haystack[span]).map(|i| { let start = span.start + i; let end = start + 1; Span { start, end } }) } } fn prefix(&self, haystack: &[u8], span: Span) -> Option { let b = *haystack.get(span.start)?; if self.0 == b { Some(Span { start: span.start, end: span.start + 1 }) } else { None } } fn memory_usage(&self) -> usize { 0 } fn is_fast(&self) -> bool { true } } #[derive(Clone, Debug)] pub(crate) struct Memchr2(u8, u8); impl Memchr2 { pub(crate) fn new>( _kind: MatchKind, needles: &[B], ) -> Option { #[cfg(not(feature = "perf-literal-substring"))] { None } #[cfg(feature = "perf-literal-substring")] { if needles.len() != 2 { return None; } if !needles.iter().all(|n| n.as_ref().len() == 1) { return None; } let b1 = needles[0].as_ref()[0]; let b2 = needles[1].as_ref()[0]; Some(Memchr2(b1, b2)) } } } impl PrefilterI for Memchr2 { fn find(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(feature = "perf-literal-substring"))] { unreachable!() } #[cfg(feature = "perf-literal-substring")] { memchr::memchr2(self.0, self.1, &haystack[span]).map(|i| { let start = span.start + i; let end = start + 1; Span { start, end } }) } } fn prefix(&self, haystack: &[u8], span: Span) -> Option { let b = *haystack.get(span.start)?; if self.0 == b || self.1 == b { Some(Span { start: span.start, end: span.start + 1 }) } else { None } } fn memory_usage(&self) -> usize { 0 } fn is_fast(&self) -> bool { true } } #[derive(Clone, Debug)] pub(crate) struct Memchr3(u8, u8, u8); impl Memchr3 { pub(crate) fn new>( _kind: MatchKind, needles: &[B], ) -> Option { #[cfg(not(feature = "perf-literal-substring"))] { None } #[cfg(feature = "perf-literal-substring")] { if needles.len() != 3 { return None; } if !needles.iter().all(|n| n.as_ref().len() == 1) { return None; } let b1 = needles[0].as_ref()[0]; let b2 = needles[1].as_ref()[0]; let b3 = needles[2].as_ref()[0]; Some(Memchr3(b1, b2, b3)) } } } impl PrefilterI for 
Memchr3 { fn find(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(feature = "perf-literal-substring"))] { unreachable!() } #[cfg(feature = "perf-literal-substring")] { memchr::memchr3(self.0, self.1, self.2, &haystack[span]).map(|i| { let start = span.start + i; let end = start + 1; Span { start, end } }) } } fn prefix(&self, haystack: &[u8], span: Span) -> Option { let b = *haystack.get(span.start)?; if self.0 == b || self.1 == b || self.2 == b { Some(Span { start: span.start, end: span.start + 1 }) } else { None } } fn memory_usage(&self) -> usize { 0 } fn is_fast(&self) -> bool { true } } regex-automata-0.4.9/src/util/prefilter/memmem.rs000064400000000000000000000047651046102023000201600ustar 00000000000000use crate::util::{ prefilter::PrefilterI, search::{MatchKind, Span}, }; #[derive(Clone, Debug)] pub(crate) struct Memmem { #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] _unused: (), #[cfg(all(feature = "std", feature = "perf-literal-substring"))] finder: memchr::memmem::Finder<'static>, } impl Memmem { pub(crate) fn new>( _kind: MatchKind, needles: &[B], ) -> Option { #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] { None } #[cfg(all(feature = "std", feature = "perf-literal-substring"))] { if needles.len() != 1 { return None; } let needle = needles[0].as_ref(); let finder = memchr::memmem::Finder::new(needle).into_owned(); Some(Memmem { finder }) } } } impl PrefilterI for Memmem { fn find(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] { unreachable!() } #[cfg(all(feature = "std", feature = "perf-literal-substring"))] { self.finder.find(&haystack[span]).map(|i| { let start = span.start + i; let end = start + self.finder.needle().len(); Span { start, end } }) } } fn prefix(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] { unreachable!() } #[cfg(all(feature = "std", 
feature = "perf-literal-substring"))] { let needle = self.finder.needle(); if haystack[span].starts_with(needle) { Some(Span { end: span.start + needle.len(), ..span }) } else { None } } } fn memory_usage(&self) -> usize { #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] { unreachable!() } #[cfg(all(feature = "std", feature = "perf-literal-substring"))] { self.finder.needle().len() } } fn is_fast(&self) -> bool { #[cfg(not(all(feature = "std", feature = "perf-literal-substring")))] { unreachable!() } #[cfg(all(feature = "std", feature = "perf-literal-substring"))] { true } } } regex-automata-0.4.9/src/util/prefilter/mod.rs000064400000000000000000000635701046102023000174610ustar 00000000000000/*! Defines a prefilter for accelerating regex searches. A prefilter can be created by building a [`Prefilter`] value. A prefilter represents one of the most important optimizations available for accelerating regex searches. The idea of a prefilter is to very quickly find candidate locations in a haystack where a regex _could_ match. Once a candidate is found, it is then intended for the regex engine to run at that position to determine whether the candidate is a match or a false positive. In the aforementioned description of the prefilter optimization also lay its demise. Namely, if a prefilter has a high false positive rate and it produces lots of candidates, then a prefilter can overall make a regex search slower. It can run more slowly because more time is spent ping-ponging between the prefilter search and the regex engine attempting to confirm each candidate as a match. This ping-ponging has overhead that adds up, and is exacerbated by a high false positive rate. Nevertheless, the optimization is still generally worth performing in most cases. Particularly given just how much throughput can be improved. (It is not uncommon for prefilter optimizations to improve throughput by one or two orders of magnitude.) 
Typically a prefilter is used to find occurrences of literal prefixes from a regex pattern, but this isn't required. A prefilter can be used to look for suffixes or even inner literals. Note that as of now, prefilters throw away information about which pattern each literal comes from. In other words, when a prefilter finds a match, there's no way to know which pattern (or patterns) it came from. Therefore, in order to confirm a match, you'll have to check all of the patterns by running the full regex engine. */ mod aho_corasick; mod byteset; mod memchr; mod memmem; mod teddy; use core::{ borrow::Borrow, fmt::Debug, panic::{RefUnwindSafe, UnwindSafe}, }; #[cfg(feature = "alloc")] use alloc::sync::Arc; #[cfg(feature = "syntax")] use regex_syntax::hir::{literal, Hir}; use crate::util::search::{MatchKind, Span}; pub(crate) use crate::util::prefilter::{ aho_corasick::AhoCorasick, byteset::ByteSet, memchr::{Memchr, Memchr2, Memchr3}, memmem::Memmem, teddy::Teddy, }; /// A prefilter for accelerating regex searches. /// /// If you already have your literals that you want to search with, /// then the vanilla [`Prefilter::new`] constructor is for you. But /// if you have an [`Hir`] value from the `regex-syntax` crate, then /// [`Prefilter::from_hir_prefix`] might be more convenient. Namely, it uses /// the [`regex-syntax::hir::literal`](regex_syntax::hir::literal) module to /// extract literal prefixes for you, optimize them and then select and build a /// prefilter matcher. /// /// A prefilter must have **zero false negatives**. However, by its very /// nature, it may produce false positives. That is, a prefilter will never /// skip over a position in the haystack that corresponds to a match of the /// original regex pattern, but it *may* produce a match for a position /// in the haystack that does *not* correspond to a match of the original /// regex pattern. 
If you use either the [`Prefilter::from_hir_prefix`] or /// [`Prefilter::from_hirs_prefix`] constructors, then this guarantee is /// upheld for you automatically. This guarantee is not preserved if you use /// [`Prefilter::new`] though, since it is up to the caller to provide correct /// literal strings with respect to the original regex pattern. /// /// # Cloning /// /// It is an API guarantee that cloning a prefilter is cheap. That is, cloning /// it will not duplicate whatever heap memory is used to represent the /// underlying matcher. /// /// # Example /// /// This example shows how to attach a `Prefilter` to the /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) in order to accelerate /// searches. /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::prefilter::Prefilter, /// Match, MatchKind, /// }; /// /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Bruce "]) /// .expect("a prefilter"); /// let re = PikeVM::builder() /// .configure(PikeVM::config().prefilter(Some(pre))) /// .build(r"Bruce \w+")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Match::must(0, 6..23)), /// re.find(&mut cache, "Hello Bruce Springsteen!"), /// ); /// # Ok::<(), Box>(()) /// ``` /// /// But note that if you get your prefilter incorrect, it could lead to an /// incorrect result! /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// util::prefilter::Prefilter, /// Match, MatchKind, /// }; /// /// // This prefilter is wrong! /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Patti "]) /// .expect("a prefilter"); /// let re = PikeVM::builder() /// .configure(PikeVM::config().prefilter(Some(pre))) /// .build(r"Bruce \w+")?; /// let mut cache = re.create_cache(); /// // We find no match even though the regex does match. 
// All real fields exist only with the `alloc` feature; without it the struct
// keeps a unit placeholder and `from_choice` never constructs a value.
#[derive(Clone, Debug)]
pub struct Prefilter {
    /// Placeholder so the struct is well-formed without `alloc`.
    #[cfg(not(feature = "alloc"))]
    _unused: (),
    /// The selected matcher, type-erased behind a shared trait object.
    #[cfg(feature = "alloc")]
    pre: Arc<dyn PrefilterI>,
    /// The value `pre.is_fast()` returned when this prefilter was built.
    #[cfg(feature = "alloc")]
    is_fast: bool,
    /// The needle length recorded at construction; `Prefilter::new` passes
    /// the length of the longest needle (0 if there were none).
    #[cfg(feature = "alloc")]
    max_needle_len: usize,
}
/// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["sam", "samwise"]) /// .expect("a prefilter"); /// assert_eq!( /// Some(Span::from(6..9)), /// pre.find(hay.as_bytes(), Span::from(0..hay.len())), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn new>( kind: MatchKind, needles: &[B], ) -> Option { Choice::new(kind, needles).and_then(|choice| { let max_needle_len = needles.iter().map(|b| b.as_ref().len()).max().unwrap_or(0); Prefilter::from_choice(choice, max_needle_len) }) } /// This turns a prefilter selection into a `Prefilter`. That is, in turns /// the enum given into a trait object. fn from_choice( choice: Choice, max_needle_len: usize, ) -> Option { #[cfg(not(feature = "alloc"))] { None } #[cfg(feature = "alloc")] { let pre: Arc = match choice { Choice::Memchr(p) => Arc::new(p), Choice::Memchr2(p) => Arc::new(p), Choice::Memchr3(p) => Arc::new(p), Choice::Memmem(p) => Arc::new(p), Choice::Teddy(p) => Arc::new(p), Choice::ByteSet(p) => Arc::new(p), Choice::AhoCorasick(p) => Arc::new(p), }; let is_fast = pre.is_fast(); Some(Prefilter { pre, is_fast, max_needle_len }) } } /// This attempts to extract prefixes from the given `Hir` expression for /// the given match semantics, and if possible, builds a prefilter for /// them. /// /// # Example /// /// This example shows how to build a prefilter directly from an [`Hir`] /// expression, and use to find an occurrence of a prefix from the regex /// pattern. 
/// Extracts prefix literals from a single `Hir` expression under the given
/// match semantics and, if possible, builds a prefilter from them.
///
/// This is `from_hirs_prefix` applied to a one-element slice, so it returns
/// `None` in exactly the cases that function does.
#[cfg(feature = "syntax")]
pub fn from_hir_prefix(kind: MatchKind, hir: &Hir) -> Option<Prefilter> {
    let hirs = [hir];
    Prefilter::from_hirs_prefix(kind, &hirs)
}
/// Extracts prefix literals from all of the given `Hir` expressions under
/// the given match semantics and, if possible, builds one prefilter for
/// them.
///
/// Returns `None` when the extracted sequence has no literals to offer
/// (`Seq::literals` returns `None`) or when `Prefilter::new` declines to
/// build a matcher for them.
#[cfg(feature = "syntax")]
pub fn from_hirs_prefix<H: Borrow<Hir>>(
    kind: MatchKind,
    hirs: &[H],
) -> Option<Prefilter> {
    let seq = prefixes(kind, hirs);
    let lits = seq.literals()?;
    Prefilter::new(kind, lits)
}
/// /// ``` /// use regex_automata::{ /// util::{prefilter::Prefilter, syntax}, /// MatchKind, Span, /// }; /// /// let hir = syntax::parse(r"Bruce \w+")?; /// let pre = Prefilter::from_hir_prefix(MatchKind::LeftmostFirst, &hir) /// .expect("a prefilter"); /// let hay = "Hello Bruce Springsteen!"; /// // Nothing is found here because 'Bruce' does /// // not occur at the beginning of our search. /// assert_eq!( /// None, /// pre.prefix(hay.as_bytes(), Span::from(0..hay.len())), /// ); /// // But if we change where we start the search /// // to begin where 'Bruce ' begins, then a /// // match will be found. /// assert_eq!( /// Some(Span::from(6..12)), /// pre.prefix(hay.as_bytes(), Span::from(6..hay.len())), /// ); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn prefix(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(feature = "alloc"))] { unreachable!() } #[cfg(feature = "alloc")] { self.pre.prefix(haystack, span) } } /// Returns the heap memory, in bytes, used by the underlying prefilter. #[inline] pub fn memory_usage(&self) -> usize { #[cfg(not(feature = "alloc"))] { unreachable!() } #[cfg(feature = "alloc")] { self.pre.memory_usage() } } /// Return the length of the longest needle /// in this Prefilter #[inline] pub fn max_needle_len(&self) -> usize { #[cfg(not(feature = "alloc"))] { unreachable!() } #[cfg(feature = "alloc")] { self.max_needle_len } } /// Implementations might return true here if they believe themselves to /// be "fast." The concept of "fast" is deliberately left vague, but in /// practice this usually corresponds to whether it's believed that SIMD /// will be used. /// /// Why do we care about this? Well, some prefilter tricks tend to come /// with their own bits of overhead, and so might only make sense if we /// know that a scan will be *much* faster than the regex engine itself. /// Otherwise, the trick may not be worth doing. 
Whether something is /// "much" faster than the regex engine generally boils down to whether /// SIMD is used. (But not always. Even a SIMD matcher with a high false /// positive rate can become quite slow.) /// /// Even if this returns true, it is still possible for the prefilter to /// be "slow." Remember, prefilters are just heuristics. We can't really /// *know* a prefilter will be fast without actually trying the prefilter. /// (Which of course we cannot afford to do.) #[inline] pub fn is_fast(&self) -> bool { #[cfg(not(feature = "alloc"))] { unreachable!() } #[cfg(feature = "alloc")] { self.is_fast } } } /// A trait for abstracting over prefilters. Basically, a prefilter is /// something that do an unanchored *and* an anchored search in a haystack /// within a given span. /// /// This exists pretty much only so that we can use prefilters as a trait /// object (which is what `Prefilter` is). If we ever move off of trait objects /// and to an enum, then it's likely this trait could be removed. pub(crate) trait PrefilterI: Debug + Send + Sync + RefUnwindSafe + UnwindSafe + 'static { /// Run this prefilter on `haystack[span.start..end]` and return a matching /// span if one exists. /// /// The span returned is guaranteed to have a start position greater than /// or equal to the one given, and an end position less than or equal to /// the one given. fn find(&self, haystack: &[u8], span: Span) -> Option; /// Returns the span of a prefix of `haystack[span.start..span.end]` if /// the prefilter matches. /// /// The span returned is guaranteed to have a start position equivalent to /// the one given, and an end position less than or equal to the one given. fn prefix(&self, haystack: &[u8], span: Span) -> Option; /// Returns the heap memory, in bytes, used by the underlying prefilter. fn memory_usage(&self) -> usize; /// Implementations might return true here if they believe themselves to /// be "fast." See [`Prefilter::is_fast`] for more details. 
fn is_fast(&self) -> bool; } #[cfg(feature = "alloc")] impl PrefilterI for Arc

{ #[cfg_attr(feature = "perf-inline", inline(always))] fn find(&self, haystack: &[u8], span: Span) -> Option { (&**self).find(haystack, span) } #[cfg_attr(feature = "perf-inline", inline(always))] fn prefix(&self, haystack: &[u8], span: Span) -> Option { (&**self).prefix(haystack, span) } #[cfg_attr(feature = "perf-inline", inline(always))] fn memory_usage(&self) -> usize { (&**self).memory_usage() } #[cfg_attr(feature = "perf-inline", inline(always))] fn is_fast(&self) -> bool { (&**self).is_fast() } } /// A type that encapsulates the selection of a prefilter algorithm from a /// sequence of needles. /// /// The existence of this type is a little tricky, because we don't (currently) /// use it for performing a search. Instead, we really only consume it by /// converting the underlying prefilter into a trait object, whether that be /// `dyn PrefilterI` or `dyn Strategy` (for the meta regex engine). In order /// to avoid re-copying the prefilter selection logic, we isolate it here, and /// then force anything downstream that wants to convert it to a trait object /// to do trivial case analysis on it. /// /// One wonders whether we *should* use an enum instead of a trait object. /// At time of writing, I chose trait objects based on instinct because 1) I /// knew I wasn't going to inline anything and 2) there would potentially be /// many different choices. However, as of time of writing, I haven't actually /// compared the trait object approach to the enum approach. That probably /// should be litigated, but I ran out of steam. /// /// Note that if the `alloc` feature is disabled, then values of this type /// are (and should) never be constructed. Also, in practice, for any of the /// prefilters to be selected, you'll need at least one of the `perf-literal-*` /// features enabled. 
#[derive(Clone, Debug)] pub(crate) enum Choice { Memchr(Memchr), Memchr2(Memchr2), Memchr3(Memchr3), Memmem(Memmem), Teddy(Teddy), ByteSet(ByteSet), AhoCorasick(AhoCorasick), } impl Choice { /// Select what is believed to be the best prefilter algorithm for the /// match semantics and sequence of needles given. /// /// This selection algorithm uses the needles as given without any /// modification. For example, if `[bar]` is given, then this doesn't /// try to select `memchr` for `b`. Instead, it would select `memmem` /// for `bar`. If callers would want `memchr` selected for `[bar]`, then /// callers should massages the literals themselves. That is, callers are /// responsible for heuristics surrounding which sequence of literals is /// best. /// /// What this selection algorithm does is attempt to use the fastest /// prefilter that works for the literals given. So if `[a, b]`, is given, /// then `memchr2` is selected. /// /// Of course, which prefilter is selected is also subject to what /// is available. For example, if `alloc` isn't enabled, then /// that limits which prefilters can be selected. Similarly, if /// `perf-literal-substring` isn't enabled, then nothing from the `memchr` /// crate can be returned. pub(crate) fn new>( kind: MatchKind, needles: &[B], ) -> Option { // An empty set means the regex matches nothing, so no sense in // building a prefilter. if needles.len() == 0 { debug!("prefilter building failed: found empty set of literals"); return None; } // If the regex can match the empty string, then the prefilter // will by definition match at every position. This is obviously // completely ineffective. if needles.iter().any(|n| n.as_ref().is_empty()) { debug!("prefilter building failed: literals match empty string"); return None; } // BREADCRUMBS: Perhaps the literal optimizer should special case // sequences of length two or three if the leading bytes of each are // "rare"? 
Or perhaps, if there are two or three total possible leading // bytes, regardless of the number of literals, and all are rare... // Then well, perhaps we should use memchr2 or memchr3 in those cases? if let Some(pre) = Memchr::new(kind, needles) { debug!("prefilter built: memchr"); return Some(Choice::Memchr(pre)); } if let Some(pre) = Memchr2::new(kind, needles) { debug!("prefilter built: memchr2"); return Some(Choice::Memchr2(pre)); } if let Some(pre) = Memchr3::new(kind, needles) { debug!("prefilter built: memchr3"); return Some(Choice::Memchr3(pre)); } if let Some(pre) = Memmem::new(kind, needles) { debug!("prefilter built: memmem"); return Some(Choice::Memmem(pre)); } if let Some(pre) = Teddy::new(kind, needles) { debug!("prefilter built: teddy"); return Some(Choice::Teddy(pre)); } if let Some(pre) = ByteSet::new(kind, needles) { debug!("prefilter built: byteset"); return Some(Choice::ByteSet(pre)); } if let Some(pre) = AhoCorasick::new(kind, needles) { debug!("prefilter built: aho-corasick"); return Some(Choice::AhoCorasick(pre)); } debug!("prefilter building failed: no strategy could be found"); None } } /// Extracts all of the prefix literals from the given HIR expressions into a /// single `Seq`. The literals in the sequence are ordered with respect to the /// order of the given HIR expressions and consistent with the match semantics /// given. /// /// The sequence returned is "optimized." That is, they may be shrunk or even /// truncated according to heuristics with the intent of making them more /// useful as a prefilter. (Which translates to both using faster algorithms /// and minimizing the false positive rate.) /// /// Note that this erases any connection between the literals and which pattern /// (or patterns) they came from. /// /// The match kind given must correspond to the match semantics of the regex /// that is represented by the HIRs given. The match semantics may change the /// literal sequence returned. 
/// Extracts the prefix literals of all the given HIR expressions into one
/// `Seq`, in the order the expressions were given.
///
/// After extraction the sequence is tidied according to `kind`: for
/// `MatchKind::All` it is sorted and de-duplicated, and for
/// `MatchKind::LeftmostFirst` it is run through
/// `optimize_for_prefix_by_preference`. The returned literals no longer
/// record which pattern they came from.
#[cfg(feature = "syntax")]
pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
where
    H: core::borrow::Borrow<Hir>,
{
    let mut extractor = literal::Extractor::new();
    extractor.kind(literal::ExtractKind::Prefix);

    // Union the per-pattern sequences into a single sequence.
    let mut prefixes =
        hirs.iter().fold(literal::Seq::empty(), |mut seq, hir| {
            seq.union(&mut extractor.extract(hir.borrow()));
            seq
        });
    debug!(
        "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
        prefixes.len(),
        prefixes.is_exact(),
        prefixes
    );
    match kind {
        MatchKind::All => {
            prefixes.sort();
            prefixes.dedup();
        }
        MatchKind::LeftmostFirst => {
            prefixes.optimize_for_prefix_by_preference();
        }
    }
    debug!(
        "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
        prefixes.len(),
        prefixes.is_exact(),
        prefixes
    );
    prefixes
}
/// /// Technically, it's possible that Teddy doesn't actually get used, since /// Teddy does require its haystack to at least be of a certain size /// (usually around the size of whatever vector is being used, so ~16 /// or ~32 bytes). For haystacks shorter than that, the implementation /// currently uses Rabin-Karp. #[cfg(feature = "perf-literal-multisubstring")] searcher: aho_corasick::packed::Searcher, /// When running an anchored search, the packed searcher can't handle it so /// we defer to Aho-Corasick itself. Kind of sad, but changing the packed /// searchers to support anchored search would be difficult at worst and /// annoying at best. Since packed searchers only apply to small numbers of /// literals, we content ourselves that this is not much of an added cost. /// (That packed searchers only work with a small number of literals is /// also why we use a DFA here. Otherwise, the memory usage of a DFA would /// likely be unacceptable.) #[cfg(feature = "perf-literal-multisubstring")] anchored_ac: aho_corasick::dfa::DFA, /// The length of the smallest literal we look for. /// /// We use this as a heuristic to figure out whether this will be "fast" or /// not. Generally, the longer the better, because longer needles are more /// discriminating and thus reduce false positive rate. #[cfg(feature = "perf-literal-multisubstring")] minimum_len: usize, } impl Teddy { pub(crate) fn new>( kind: MatchKind, needles: &[B], ) -> Option { #[cfg(not(feature = "perf-literal-multisubstring"))] { None } #[cfg(feature = "perf-literal-multisubstring")] { // We only really support leftmost-first semantics. In // theory we could at least support leftmost-longest, as the // aho-corasick crate does, but regex-automata doesn't know about // leftmost-longest currently. // // And like the aho-corasick prefilter, if we're using `All` // semantics, then we can still use leftmost semantics for a // prefilter. 
(This might be a suspicious choice for the literal // engine, which uses a prefilter as a regex engine directly, but // that only happens when using leftmost-first semantics.) let (packed_match_kind, ac_match_kind) = match kind { MatchKind::LeftmostFirst | MatchKind::All => ( aho_corasick::packed::MatchKind::LeftmostFirst, aho_corasick::MatchKind::LeftmostFirst, ), }; let minimum_len = needles.iter().map(|n| n.as_ref().len()).min().unwrap_or(0); let packed = aho_corasick::packed::Config::new() .match_kind(packed_match_kind) .builder() .extend(needles) .build()?; let anchored_ac = aho_corasick::dfa::DFA::builder() .match_kind(ac_match_kind) .start_kind(aho_corasick::StartKind::Anchored) .prefilter(false) .build(needles) .ok()?; Some(Teddy { searcher: packed, anchored_ac, minimum_len }) } } } impl PrefilterI for Teddy { fn find(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(feature = "perf-literal-multisubstring"))] { unreachable!() } #[cfg(feature = "perf-literal-multisubstring")] { let ac_span = aho_corasick::Span { start: span.start, end: span.end }; self.searcher .find_in(haystack, ac_span) .map(|m| Span { start: m.start(), end: m.end() }) } } fn prefix(&self, haystack: &[u8], span: Span) -> Option { #[cfg(not(feature = "perf-literal-multisubstring"))] { unreachable!() } #[cfg(feature = "perf-literal-multisubstring")] { use aho_corasick::automaton::Automaton; let input = aho_corasick::Input::new(haystack) .anchored(aho_corasick::Anchored::Yes) .span(span.start..span.end); self.anchored_ac .try_find(&input) // OK because we build the DFA with anchored support. 
.expect("aho-corasick DFA should never fail") .map(|m| Span { start: m.start(), end: m.end() }) } } fn memory_usage(&self) -> usize { #[cfg(not(feature = "perf-literal-multisubstring"))] { unreachable!() } #[cfg(feature = "perf-literal-multisubstring")] { use aho_corasick::automaton::Automaton; self.searcher.memory_usage() + self.anchored_ac.memory_usage() } } fn is_fast(&self) -> bool { #[cfg(not(feature = "perf-literal-multisubstring"))] { unreachable!() } #[cfg(feature = "perf-literal-multisubstring")] { // Teddy is usually quite fast, but I have seen some cases where // a large number of literals can overwhelm it and make it not so // fast. We make an educated but conservative guess at a limit, at // which point, we're not so comfortable thinking Teddy is "fast." // // Well... this used to incorporate a "limit" on the *number* // of literals, but I have since changed it to a minimum on the // *smallest* literal. Namely, when there is a very small literal // (1 or 2 bytes), it is far more likely that it leads to a higher // false positive rate. (Although, of course, not always. For // example, 'zq' is likely to have a very low false positive rate.) // But when we have 3 bytes, we have a really good chance of being // quite discriminatory and thus fast. // // We may still want to add some kind of limit on the number of // literals here, but keep in mind that Teddy already has its own // somewhat small limit (64 at time of writing). The main issue // here is that if 'is_fast' is false, it opens the door for the // reverse inner optimization to kick in. We really only want to // resort to the reverse inner optimization if we absolutely must. self.minimum_len >= 3 } } } regex-automata-0.4.9/src/util/primitives.rs000064400000000000000000000660531046102023000171000ustar 00000000000000/*! Lower level primitive types that are useful in a variety of circumstances. 
# Overview This list represents the principal types in this module and briefly describes when you might want to use them. * [`PatternID`] - A type that represents the identifier of a regex pattern. This is probably the most widely used type in this module (which is why it's also re-exported in the crate root). * [`StateID`] - A type that represents the identifier of a finite automaton state. This is used for both NFAs and DFAs, with the notable exception of the hybrid NFA/DFA. (The hybrid NFA/DFA uses a special purpose "lazy" state identifier.) * [`SmallIndex`] - The internal representation of both a `PatternID` and a `StateID`. Its purpose is to serve as a type that can index memory without being as big as a `usize` on 64-bit targets. The main idea behind this type is that there are many things in regex engines that will, in practice, never overflow a 32-bit integer. (For example, like the number of patterns in a regex or the number of states in an NFA.) Thus, a `SmallIndex` can be used to index memory without peppering `as` casts everywhere. Moreover, it forces callers to handle errors in the case where, somehow, the value would otherwise overflow either a 32-bit integer or a `usize` (e.g., on 16-bit targets). * [`NonMaxUsize`] - Represents a `usize` that cannot be `usize::MAX`. As a result, `Option<NonMaxUsize>` has the same size in memory as a `usize`. This is useful, for example, when representing the offsets of submatches since it reduces memory usage by a factor of 2. It is a legal optimization since Rust guarantees that slices never have a length that exceeds `isize::MAX`. */ use core::num::NonZeroUsize; #[cfg(feature = "alloc")] use alloc::vec::Vec; use crate::util::int::{Usize, U16, U32, U64}; /// A `usize` that can never be `usize::MAX`. /// /// This is similar to `core::num::NonZeroUsize`, but instead of not permitting /// a zero value, this does not permit a max value.
///
/// This is useful in certain contexts where one wants to optimize the memory
/// usage of things that contain match offsets. Namely, since Rust slices
/// are guaranteed to never have a length exceeding `isize::MAX`, we can use
/// `usize::MAX` as a sentinel to indicate that no match was found. Indeed,
/// types like `Option<NonMaxUsize>` have exactly the same size in memory as a
/// `usize`.
///
/// This type is defined to be `repr(transparent)` for
/// `core::num::NonZeroUsize`, which is in turn defined to be
/// `repr(transparent)` for `usize`.
#[derive(Clone, Copy, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct NonMaxUsize(NonZeroUsize);

impl NonMaxUsize {
    /// Create a new `NonMaxUsize` from the given value.
    ///
    /// This returns `None` only when the given value is equal to `usize::MAX`.
    #[inline]
    pub fn new(value: usize) -> Option<NonMaxUsize> {
        // The value is stored shifted up by one. `usize::MAX` wraps around to
        // zero, which `NonZeroUsize::new` rejects, so it is the only input
        // that yields `None`.
        NonZeroUsize::new(value.wrapping_add(1)).map(NonMaxUsize)
    }

    /// Return the underlying `usize` value. The returned value is guaranteed
    /// to not equal `usize::MAX`.
    #[inline]
    pub fn get(self) -> usize {
        // Undo the shift applied in `new`. The stored value is never zero, so
        // this subtraction never actually wraps.
        self.0.get().wrapping_sub(1)
    }
}

// We provide our own Debug impl because seeing the internal repr can be quite
// surprising if you aren't expecting it. e.g., 'NonMaxUsize(5)' vs just '5'.
impl core::fmt::Debug for NonMaxUsize {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        write!(f, "{:?}", self.get())
    }
}

/// A type that represents a "small" index.
///
/// The main idea of this type is to provide something that can index memory,
/// but uses less memory than `usize` on 64-bit systems. Specifically, its
/// representation is always a `u32` and has `repr(transparent)` enabled. (So
/// it is safe to transmute between a `u32` and a `SmallIndex`.)
///
/// A small index is typically useful in cases where there is no practical way
/// that the index will overflow a 32-bit integer. A good example of this is
/// an NFA state.
If you could somehow build an NFA with `2^30` states, its /// memory usage would be exorbitant and its runtime execution would be so /// slow as to be completely worthless. Therefore, this crate generally deems /// it acceptable to return an error if it would otherwise build an NFA that /// requires a slice longer than what a 32-bit integer can index. In exchange, /// we can use 32-bit indices instead of 64-bit indices in various places. /// /// This type ensures this by providing a constructor that will return an error /// if its argument cannot fit into the type. This makes it much easier to /// handle these sorts of boundary cases that are otherwise extremely subtle. /// /// On all targets, this type guarantees that its value will fit in a `u32`, /// `i32`, `usize` and an `isize`. This means that on 16-bit targets, for /// example, this type's maximum value will never overflow an `isize`, /// which means it will never overflow a `i16` even though its internal /// representation is still a `u32`. /// /// The purpose for making the type fit into even signed integer types like /// `isize` is to guarantee that the difference between any two small indices /// is itself also a small index. This is useful in certain contexts, e.g., /// for delta encoding. /// /// # Other types /// /// The following types wrap `SmallIndex` to provide a more focused use case: /// /// * [`PatternID`] is for representing the identifiers of patterns. /// * [`StateID`] is for representing the identifiers of states in finite /// automata. It is used for both NFAs and DFAs. /// /// # Representation /// /// This type is always represented internally by a `u32` and is marked as /// `repr(transparent)`. Thus, this type always has the same representation as /// a `u32`. It is thus safe to transmute between a `u32` and a `SmallIndex`. /// /// # Indexing /// /// For convenience, callers may use a `SmallIndex` to index slices. 
/// /// # Safety /// /// While a `SmallIndex` is meant to guarantee that its value fits into `usize` /// without using as much space as a `usize` on all targets, callers must /// not rely on this property for safety. Callers may choose to rely on this /// property for correctness however. For example, creating a `SmallIndex` with /// an invalid value can be done in entirely safe code. This may in turn result /// in panics or silent logical errors. #[derive( Clone, Copy, Debug, Default, Eq, Hash, PartialEq, PartialOrd, Ord, )] #[repr(transparent)] pub struct SmallIndex(u32); impl SmallIndex { /// The maximum index value. #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] pub const MAX: SmallIndex = // FIXME: Use as_usize() once const functions in traits are stable. SmallIndex::new_unchecked(core::i32::MAX as usize - 1); /// The maximum index value. #[cfg(target_pointer_width = "16")] pub const MAX: SmallIndex = SmallIndex::new_unchecked(core::isize::MAX - 1); /// The total number of values that can be represented as a small index. pub const LIMIT: usize = SmallIndex::MAX.as_usize() + 1; /// The zero index value. pub const ZERO: SmallIndex = SmallIndex::new_unchecked(0); /// The number of bytes that a single small index uses in memory. pub const SIZE: usize = core::mem::size_of::(); /// Create a new small index. /// /// If the given index exceeds [`SmallIndex::MAX`], then this returns /// an error. #[inline] pub fn new(index: usize) -> Result { SmallIndex::try_from(index) } /// Create a new small index without checking whether the given value /// exceeds [`SmallIndex::MAX`]. /// /// Using this routine with an invalid index value will result in /// unspecified behavior, but *not* undefined behavior. In particular, an /// invalid index value is likely to cause panics or possibly even silent /// logical errors. /// /// Callers must never rely on a `SmallIndex` to be within a certain range /// for memory safety. 
#[inline] pub const fn new_unchecked(index: usize) -> SmallIndex { // FIXME: Use as_u32() once const functions in traits are stable. SmallIndex(index as u32) } /// Like [`SmallIndex::new`], but panics if the given index is not valid. #[inline] pub fn must(index: usize) -> SmallIndex { SmallIndex::new(index).expect("invalid small index") } /// Return this small index as a `usize`. This is guaranteed to never /// overflow `usize`. #[inline] pub const fn as_usize(&self) -> usize { // FIXME: Use as_usize() once const functions in traits are stable. self.0 as usize } /// Return this small index as a `u64`. This is guaranteed to never /// overflow. #[inline] pub const fn as_u64(&self) -> u64 { // FIXME: Use u64::from() once const functions in traits are stable. self.0 as u64 } /// Return the internal `u32` of this small index. This is guaranteed to /// never overflow `u32`. #[inline] pub const fn as_u32(&self) -> u32 { self.0 } /// Return the internal `u32` of this small index represented as an `i32`. /// This is guaranteed to never overflow an `i32`. #[inline] pub const fn as_i32(&self) -> i32 { // This is OK because we guarantee that our max value is <= i32::MAX. self.0 as i32 } /// Returns one more than this small index as a usize. /// /// Since a small index has constraints on its maximum value, adding `1` to /// it will always fit in a `usize`, `u32` and a `i32`. #[inline] pub fn one_more(&self) -> usize { self.as_usize() + 1 } /// Decode this small index from the bytes given using the native endian /// byte order for the current target. /// /// If the decoded integer is not representable as a small index for the /// current target, then this returns an error. 
#[inline] pub fn from_ne_bytes( bytes: [u8; 4], ) -> Result { let id = u32::from_ne_bytes(bytes); if id > SmallIndex::MAX.as_u32() { return Err(SmallIndexError { attempted: u64::from(id) }); } Ok(SmallIndex::new_unchecked(id.as_usize())) } /// Decode this small index from the bytes given using the native endian /// byte order for the current target. /// /// This is analogous to [`SmallIndex::new_unchecked`] in that is does not /// check whether the decoded integer is representable as a small index. #[inline] pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> SmallIndex { SmallIndex::new_unchecked(u32::from_ne_bytes(bytes).as_usize()) } /// Return the underlying small index integer as raw bytes in native endian /// format. #[inline] pub fn to_ne_bytes(&self) -> [u8; 4] { self.0.to_ne_bytes() } } impl core::ops::Index for [T] { type Output = T; #[inline] fn index(&self, index: SmallIndex) -> &T { &self[index.as_usize()] } } impl core::ops::IndexMut for [T] { #[inline] fn index_mut(&mut self, index: SmallIndex) -> &mut T { &mut self[index.as_usize()] } } #[cfg(feature = "alloc")] impl core::ops::Index for Vec { type Output = T; #[inline] fn index(&self, index: SmallIndex) -> &T { &self[index.as_usize()] } } #[cfg(feature = "alloc")] impl core::ops::IndexMut for Vec { #[inline] fn index_mut(&mut self, index: SmallIndex) -> &mut T { &mut self[index.as_usize()] } } impl From for SmallIndex { fn from(index: u8) -> SmallIndex { SmallIndex::new_unchecked(usize::from(index)) } } impl TryFrom for SmallIndex { type Error = SmallIndexError; fn try_from(index: u16) -> Result { if u32::from(index) > SmallIndex::MAX.as_u32() { return Err(SmallIndexError { attempted: u64::from(index) }); } Ok(SmallIndex::new_unchecked(index.as_usize())) } } impl TryFrom for SmallIndex { type Error = SmallIndexError; fn try_from(index: u32) -> Result { if index > SmallIndex::MAX.as_u32() { return Err(SmallIndexError { attempted: u64::from(index) }); } Ok(SmallIndex::new_unchecked(index.as_usize())) } 
} impl TryFrom for SmallIndex { type Error = SmallIndexError; fn try_from(index: u64) -> Result { if index > SmallIndex::MAX.as_u64() { return Err(SmallIndexError { attempted: index }); } Ok(SmallIndex::new_unchecked(index.as_usize())) } } impl TryFrom for SmallIndex { type Error = SmallIndexError; fn try_from(index: usize) -> Result { if index > SmallIndex::MAX.as_usize() { return Err(SmallIndexError { attempted: index.as_u64() }); } Ok(SmallIndex::new_unchecked(index)) } } #[cfg(test)] impl quickcheck::Arbitrary for SmallIndex { fn arbitrary(gen: &mut quickcheck::Gen) -> SmallIndex { use core::cmp::max; let id = max(i32::MIN + 1, i32::arbitrary(gen)).abs(); if id > SmallIndex::MAX.as_i32() { SmallIndex::MAX } else { SmallIndex::new(usize::try_from(id).unwrap()).unwrap() } } } /// This error occurs when a small index could not be constructed. /// /// This occurs when given an integer exceeding the maximum small index value. /// /// When the `std` feature is enabled, this implements the `Error` trait. #[derive(Clone, Debug, Eq, PartialEq)] pub struct SmallIndexError { attempted: u64, } impl SmallIndexError { /// Returns the value that could not be converted to a small index. pub fn attempted(&self) -> u64 { self.attempted } } #[cfg(feature = "std")] impl std::error::Error for SmallIndexError {} impl core::fmt::Display for SmallIndexError { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!( f, "failed to create small index from {:?}, which exceeds {:?}", self.attempted(), SmallIndex::MAX, ) } } #[derive(Clone, Debug)] pub(crate) struct SmallIndexIter { rng: core::ops::Range, } impl Iterator for SmallIndexIter { type Item = SmallIndex; fn next(&mut self) -> Option { if self.rng.start >= self.rng.end { return None; } let next_id = self.rng.start + 1; let id = core::mem::replace(&mut self.rng.start, next_id); // new_unchecked is OK since we asserted that the number of // elements in this iterator will fit in an ID at construction. 
Some(SmallIndex::new_unchecked(id)) } } macro_rules! index_type_impls { ($name:ident, $err:ident, $iter:ident, $withiter:ident) => { impl $name { /// The maximum value. pub const MAX: $name = $name(SmallIndex::MAX); /// The total number of values that can be represented. pub const LIMIT: usize = SmallIndex::LIMIT; /// The zero value. pub const ZERO: $name = $name(SmallIndex::ZERO); /// The number of bytes that a single value uses in memory. pub const SIZE: usize = SmallIndex::SIZE; /// Create a new value that is represented by a "small index." /// /// If the given index exceeds the maximum allowed value, then this /// returns an error. #[inline] pub fn new(value: usize) -> Result<$name, $err> { SmallIndex::new(value).map($name).map_err($err) } /// Create a new value without checking whether the given argument /// exceeds the maximum. /// /// Using this routine with an invalid value will result in /// unspecified behavior, but *not* undefined behavior. In /// particular, an invalid ID value is likely to cause panics or /// possibly even silent logical errors. /// /// Callers must never rely on this type to be within a certain /// range for memory safety. #[inline] pub const fn new_unchecked(value: usize) -> $name { $name(SmallIndex::new_unchecked(value)) } /// Like `new`, but panics if the given value is not valid. #[inline] pub fn must(value: usize) -> $name { $name::new(value).expect(concat!( "invalid ", stringify!($name), " value" )) } /// Return the internal value as a `usize`. This is guaranteed to /// never overflow `usize`. #[inline] pub const fn as_usize(&self) -> usize { self.0.as_usize() } /// Return the internal value as a `u64`. This is guaranteed to /// never overflow. #[inline] pub const fn as_u64(&self) -> u64 { self.0.as_u64() } /// Return the internal value as a `u32`. This is guaranteed to /// never overflow `u32`. #[inline] pub const fn as_u32(&self) -> u32 { self.0.as_u32() } /// Return the internal value as a i32`. 
This is guaranteed to /// never overflow an `i32`. #[inline] pub const fn as_i32(&self) -> i32 { self.0.as_i32() } /// Returns one more than this value as a usize. /// /// Since values represented by a "small index" have constraints /// on their maximum value, adding `1` to it will always fit in a /// `usize`, `u32` and a `i32`. #[inline] pub fn one_more(&self) -> usize { self.0.one_more() } /// Decode this value from the bytes given using the native endian /// byte order for the current target. /// /// If the decoded integer is not representable as a small index /// for the current target, then this returns an error. #[inline] pub fn from_ne_bytes(bytes: [u8; 4]) -> Result<$name, $err> { SmallIndex::from_ne_bytes(bytes).map($name).map_err($err) } /// Decode this value from the bytes given using the native endian /// byte order for the current target. /// /// This is analogous to `new_unchecked` in that is does not check /// whether the decoded integer is representable as a small index. #[inline] pub fn from_ne_bytes_unchecked(bytes: [u8; 4]) -> $name { $name(SmallIndex::from_ne_bytes_unchecked(bytes)) } /// Return the underlying integer as raw bytes in native endian /// format. #[inline] pub fn to_ne_bytes(&self) -> [u8; 4] { self.0.to_ne_bytes() } /// Returns an iterator over all values from 0 up to and not /// including the given length. /// /// If the given length exceeds this type's limit, then this /// panics. pub(crate) fn iter(len: usize) -> $iter { $iter::new(len) } } // We write our own Debug impl so that we get things like PatternID(5) // instead of PatternID(SmallIndex(5)). 
impl core::fmt::Debug for $name { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { f.debug_tuple(stringify!($name)).field(&self.as_u32()).finish() } } impl core::ops::Index<$name> for [T] { type Output = T; #[inline] fn index(&self, index: $name) -> &T { &self[index.as_usize()] } } impl core::ops::IndexMut<$name> for [T] { #[inline] fn index_mut(&mut self, index: $name) -> &mut T { &mut self[index.as_usize()] } } #[cfg(feature = "alloc")] impl core::ops::Index<$name> for Vec { type Output = T; #[inline] fn index(&self, index: $name) -> &T { &self[index.as_usize()] } } #[cfg(feature = "alloc")] impl core::ops::IndexMut<$name> for Vec { #[inline] fn index_mut(&mut self, index: $name) -> &mut T { &mut self[index.as_usize()] } } impl From for $name { fn from(value: u8) -> $name { $name(SmallIndex::from(value)) } } impl TryFrom for $name { type Error = $err; fn try_from(value: u16) -> Result<$name, $err> { SmallIndex::try_from(value).map($name).map_err($err) } } impl TryFrom for $name { type Error = $err; fn try_from(value: u32) -> Result<$name, $err> { SmallIndex::try_from(value).map($name).map_err($err) } } impl TryFrom for $name { type Error = $err; fn try_from(value: u64) -> Result<$name, $err> { SmallIndex::try_from(value).map($name).map_err($err) } } impl TryFrom for $name { type Error = $err; fn try_from(value: usize) -> Result<$name, $err> { SmallIndex::try_from(value).map($name).map_err($err) } } #[cfg(test)] impl quickcheck::Arbitrary for $name { fn arbitrary(gen: &mut quickcheck::Gen) -> $name { $name(SmallIndex::arbitrary(gen)) } } /// This error occurs when a value could not be constructed. /// /// This occurs when given an integer exceeding the maximum allowed /// value. /// /// When the `std` feature is enabled, this implements the `Error` /// trait. #[derive(Clone, Debug, Eq, PartialEq)] pub struct $err(SmallIndexError); impl $err { /// Returns the value that could not be converted to an ID. 
pub fn attempted(&self) -> u64 { self.0.attempted() } } #[cfg(feature = "std")] impl std::error::Error for $err {} impl core::fmt::Display for $err { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!( f, "failed to create {} from {:?}, which exceeds {:?}", stringify!($name), self.attempted(), $name::MAX, ) } } #[derive(Clone, Debug)] pub(crate) struct $iter(SmallIndexIter); impl $iter { fn new(len: usize) -> $iter { assert!( len <= $name::LIMIT, "cannot create iterator for {} when number of \ elements exceed {:?}", stringify!($name), $name::LIMIT, ); $iter(SmallIndexIter { rng: 0..len }) } } impl Iterator for $iter { type Item = $name; fn next(&mut self) -> Option<$name> { self.0.next().map($name) } } /// An iterator adapter that is like std::iter::Enumerate, but attaches /// small index values instead. It requires `ExactSizeIterator`. At /// construction, it ensures that the index of each element in the /// iterator is representable in the corresponding small index type. #[derive(Clone, Debug)] pub(crate) struct $withiter { it: I, ids: $iter, } impl $withiter { fn new(it: I) -> $withiter { let ids = $name::iter(it.len()); $withiter { it, ids } } } impl Iterator for $withiter { type Item = ($name, I::Item); fn next(&mut self) -> Option<($name, I::Item)> { let item = self.it.next()?; // Number of elements in this iterator must match, according // to contract of ExactSizeIterator. let id = self.ids.next().unwrap(); Some((id, item)) } } }; } /// The identifier of a regex pattern, represented by a [`SmallIndex`]. /// /// The identifier for a pattern corresponds to its relative position among /// other patterns in a single finite state machine. Namely, when building /// a multi-pattern regex engine, one must supply a sequence of patterns to /// match. The position (starting at 0) of each pattern in that sequence /// represents its identifier. This identifier is in turn used to identify and /// report matches of that pattern in various APIs. 
///
/// See the [`SmallIndex`] type for more information about what it means for
/// a pattern ID to be a "small index."
///
/// Note that this type is defined in the
/// [`util::primitives`](crate::util::primitives) module, but it is also
/// re-exported at the crate root due to how common it is.
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct PatternID(SmallIndex);

/// The identifier of a finite automaton state, represented by a
/// [`SmallIndex`].
///
/// Most regex engines in this crate are built on top of finite automata. Each
/// state in a finite automaton defines transitions from its state to another.
/// Those transitions point to other states via their identifiers, i.e., a
/// `StateID`. Since finite automata tend to contain many transitions, it is
/// much more memory efficient to define state IDs as small indices.
///
/// See the [`SmallIndex`] type for more information about what it means for
/// a state ID to be a "small index."
#[derive(Clone, Copy, Default, Eq, Hash, PartialEq, PartialOrd, Ord)]
#[repr(transparent)]
pub struct StateID(SmallIndex);

// Each invocation passes the ID type itself followed by the names to use for
// its error type, its iterator type and its enumerate-like adapter type.
index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);

/// A utility trait that defines a couple of adapters for making it convenient
/// to access indices as "small index" types. We require ExactSizeIterator so
/// that iterator construction can do a single check to make sure the index of
/// each element is representable by its small index type.
pub(crate) trait IteratorIndexExt: Iterator {
    /// Wraps this iterator so that each item is yielded together with a
    /// `PatternID`, in the manner of `Iterator::enumerate`.
    ///
    /// Construction goes through `WithPatternIDIter::new`.
    fn with_pattern_ids(self) -> WithPatternIDIter<Self>
    where
        Self: Sized + ExactSizeIterator,
    {
        WithPatternIDIter::new(self)
    }

    /// Wraps this iterator so that each item is yielded together with a
    /// `StateID`, in the manner of `Iterator::enumerate`.
    ///
    /// Construction goes through `WithStateIDIter::new`.
    fn with_state_ids(self) -> WithStateIDIter<Self>
    where
        Self: Sized + ExactSizeIterator,
    {
        WithStateIDIter::new(self)
    }
}

// Blanket impl: every iterator gets the adapters. The `ExactSizeIterator`
// requirement is enforced on the individual methods instead.
impl<I: Iterator> IteratorIndexExt for I {}
regex-automata-0.4.9/src/util/search.rs000064400000000000000000002202501046102023000161410ustar 00000000000000/*!
Types and routines that support the search APIs of most regex engines.

This sub-module isn't exposed directly, but rather, its contents are exported
at the crate root due to the universality of most of the types and routines in
this module.
*/

use core::ops::{Range, RangeBounds};

use crate::util::{escape::DebugByte, primitives::PatternID, utf8};

/// The parameters for a regex search including the haystack to search.
///
/// It turns out that regex searches have a few parameters, and in most cases,
/// those parameters have defaults that work in the vast majority of cases.
/// This `Input` type exists to make that common case seamless while also
/// providing an avenue for changing the parameters of a search. In particular,
/// this type enables doing so without a combinatorial explosion of different
/// methods and/or superfluous parameters in the common cases.
///
/// An `Input` permits configuring the following things:
///
/// * Search only a substring of a haystack, while taking the broader context
/// into account for resolving look-around assertions.
/// * Indicating whether to search for all patterns in a regex, or to
/// only search for one pattern in particular.
/// * Whether to perform an anchored or unanchored search.
/// * Whether to report a match as early as possible.
///
/// All of these parameters, except for the haystack, have sensible default
/// values. This means that the minimal search configuration is simply a call
/// to [`Input::new`] with your haystack. Setting any other parameter is
/// optional.
/// /// Moreover, for any `H` that implements `AsRef<[u8]>`, there exists a /// `From<H> for Input` implementation. This is useful because many of the /// search APIs in this crate accept an `Into<Input>`. This means you can /// provide string or byte strings to these routines directly, and they'll /// automatically get converted into an `Input` for you. /// /// The lifetime parameter `'h` refers to the lifetime of the haystack. /// /// # Organization /// /// The API of `Input` is split into a few different parts: /// /// * A builder-like API that transforms an `Input` by value. Examples: /// [`Input::span`] and [`Input::anchored`]. /// * A setter API that permits mutating parameters in place. Examples: /// [`Input::set_span`] and [`Input::set_anchored`]. /// * A getter API that permits retrieving any of the search parameters. /// Examples: [`Input::get_span`] and [`Input::get_anchored`]. /// * A few convenience getter routines that don't conform to the above naming /// pattern due to how common they are. Examples: [`Input::haystack`], /// [`Input::start`] and [`Input::end`]. /// * Miscellaneous predicates and other helper routines that are useful /// in some contexts. Examples: [`Input::is_char_boundary`]. /// /// An `Input` exposes so much because it is meant to be used by both callers of /// regex engines _and_ implementors of regex engines. A constraining factor is /// that regex engines should accept a `&Input` as its lowest level API, which /// means that implementors should only use the "getter" APIs of an `Input`. /// /// # Valid bounds and search termination /// /// An `Input` permits setting the bounds of a search via either /// [`Input::span`] or [`Input::range`]. The bounds set must be valid, or /// else a panic will occur. Bounds are valid if and only if: /// /// * The bounds represent a valid range into the input's haystack. /// * **or** the end bound is a valid ending bound for the haystack *and* /// the start bound is exactly one greater than the end bound.
/// /// In the latter case, [`Input::is_done`] will return true and indicates any /// search receiving such an input should immediately return with no match. /// /// Note that while `Input` is used for reverse searches in this crate, the /// `Input::is_done` predicate assumes a forward search. Because unsigned /// offsets are used internally, there is no way to tell from only the offsets /// whether a reverse search is done or not. /// /// # Regex engine support /// /// Any regex engine accepting an `Input` must support at least the following /// things: /// /// * Searching a `&[u8]` for matches. /// * Searching a substring of `&[u8]` for a match, such that any match /// reported must appear entirely within that substring. /// * For a forwards search, a match should never be reported when /// [`Input::is_done`] returns true. (For reverse searches, termination should /// be handled outside of `Input`.) /// /// Supporting other aspects of an `Input` are optional, but regex engines /// should handle aspects they don't support gracefully. How this is done is /// generally up to the regex engine. This crate generally treats unsupported /// anchored modes as an error to report for example, but for simplicity, in /// the meta regex engine, trying to search with an invalid pattern ID just /// results in no match being reported. #[derive(Clone)] pub struct Input<'h> { haystack: &'h [u8], span: Span, anchored: Anchored, earliest: bool, } impl<'h> Input<'h> { /// Create a new search configuration for the given haystack. #[inline] pub fn new>(haystack: &'h H) -> Input<'h> { // Perform only one call to `haystack.as_ref()` to protect from incorrect // implementations that return different values from multiple calls. // This is important because there's code that relies on `span` not being // out of bounds with respect to the stored `haystack`. 
let haystack = haystack.as_ref(); Input { haystack, span: Span { start: 0, end: haystack.len() }, anchored: Anchored::No, earliest: false, } } /// Set the span for this search. /// /// This routine does not panic if the span given is not a valid range for /// this search's haystack. If this search is run with an invalid range, /// then the most likely outcome is that the actual search execution will /// panic. /// /// This routine is generic over how a span is provided. While /// a [`Span`] may be given directly, one may also provide a /// `std::ops::Range`. To provide anything supported by range /// syntax, use the [`Input::range`] method. /// /// The default span is the entire haystack. /// /// Note that [`Input::range`] overrides this method and vice versa. /// /// # Panics /// /// This panics if the given span does not correspond to valid bounds in /// the haystack or the termination of a search. /// /// # Example /// /// This example shows how the span of the search can impact whether a /// match is reported or not. This is particularly relevant for look-around /// operators, which might take things outside of the span into account /// when determining whether they match. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Match, Input, /// }; /// /// // Look for 'at', but as a distinct word. /// let re = PikeVM::new(r"\bat\b")?; /// let mut cache = re.create_cache(); /// let mut caps = re.create_captures(); /// /// // Our haystack contains 'at', but not as a distinct word. /// let haystack = "batter"; /// /// // A standard search finds nothing, as expected. /// let input = Input::new(haystack); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(None, caps.get_match()); /// /// // But if we wanted to search starting at position '1', we might /// // slice the haystack. 
If we do this, it's impossible for the \b /// // anchors to take the surrounding context into account! And thus, /// // a match is produced. /// let input = Input::new(&haystack[1..3]); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match()); /// /// // But if we specify the span of the search instead of slicing the /// // haystack, then the regex engine can "see" outside of the span /// // and resolve the anchors correctly. /// let input = Input::new(haystack).span(1..3); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(None, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// This may seem a little ham-fisted, but this scenario tends to come up /// if some other regex engine found the match span and now you need to /// re-process that span to look for capturing groups. (e.g., Run a faster /// DFA first, find a match, then run the PikeVM on just the match span to /// resolve capturing groups.) In order to implement that sort of logic /// correctly, you need to set the span on the search instead of slicing /// the haystack directly. /// /// The other advantage of using this routine to specify the bounds of the /// search is that the match offsets are still reported in terms of the /// original haystack. For example, the second search in the example above /// reported a match at position `0`, even though `at` starts at offset /// `1` because we sliced the haystack. #[inline] pub fn span>(mut self, span: S) -> Input<'h> { self.set_span(span); self } /// Like `Input::span`, but accepts any range instead. /// /// This routine does not panic if the range given is not a valid range for /// this search's haystack. If this search is run with an invalid range, /// then the most likely outcome is that the actual search execution will /// panic. /// /// The default range is the entire haystack. /// /// Note that [`Input::span`] overrides this method and vice versa. 
///
/// # Panics
///
/// This routine will panic if the given range could not be converted
/// to a valid [`Range`]. For example, this would panic when given
/// `0..=usize::MAX` since it cannot be represented using a half-open
/// interval in terms of `usize`.
///
/// This also panics if the given range does not correspond to valid bounds
/// in the haystack or the termination of a search.
///
/// # Example
///
/// ```
/// use regex_automata::Input;
///
/// let input = Input::new("foobar");
/// assert_eq!(0..6, input.get_range());
///
/// let input = Input::new("foobar").range(2..=4);
/// assert_eq!(2..5, input.get_range());
/// ```
#[inline]
pub fn range<R: core::ops::RangeBounds<usize>>(
    mut self,
    range: R,
) -> Input<'h> {
    // Builder form of `set_range`: mutate the owned value and hand it back.
    self.set_range(range);
    self
}

/// Sets the anchor mode of a search.
///
/// When a search is anchored (so that's [`Anchored::Yes`] or
/// [`Anchored::Pattern`]), a match must begin at the start of a search.
/// When a search is not anchored (that's [`Anchored::No`]), regex engines
/// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix
/// permits a match to appear anywhere.
///
/// By default, the anchored mode is [`Anchored::No`].
///
/// **WARNING:** this is subtly different than using a `^` at the start of
/// your regex. A `^` forces a regex to match exclusively at the start of
/// a haystack, regardless of where you begin your search. In contrast,
/// anchoring a search will allow your regex to match anywhere in your
/// haystack, but the match must start at the beginning of a search.
///
/// For example, consider the haystack `aba` and the following searches:
///
/// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
///    starting at position `2`. Since `^` requires the match to start at
///    the beginning of the haystack and `2 > 0`, no match is found.
/// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
///    starting at position `2`.
This reports a match at `[2, 3]` since /// the match starts where the search started. Since there is no `^`, /// there is no requirement for the match to start at the beginning of /// the haystack. /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba` /// starting at position `1`. Since `b` corresponds to position `1` and /// since the search is anchored, it finds no match. While the regex /// matches at other positions, configuring the search to be anchored /// requires that it only report a match that begins at the same offset /// as the beginning of the search. /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba` /// starting at position `1`. Since the search is not anchored and /// the regex does not start with `^`, the search executes as if there /// is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it /// reports a match at `[2, 3]`. /// /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`, /// except it only reports matches for a particular pattern. /// /// # Example /// /// This demonstrates the differences between an anchored search and /// a pattern that begins with `^` (as described in the above warning /// message). /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Anchored, Match, Input, /// }; /// /// let haystack = "aba"; /// /// let re = PikeVM::new(r"^a")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let input = Input::new(haystack).span(2..3).anchored(Anchored::No); /// re.search(&mut cache, &input, &mut caps); /// // No match is found because 2 is not the beginning of the haystack, /// // which is what ^ requires. 
/// assert_eq!(None, caps.get_match()); /// /// let re = PikeVM::new(r"a")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let input = Input::new(haystack).span(2..3).anchored(Anchored::Yes); /// re.search(&mut cache, &input, &mut caps); /// // An anchored search can still match anywhere in the haystack, it just /// // must begin at the start of the search which is '2' in this case. /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match()); /// /// let re = PikeVM::new(r"a")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let input = Input::new(haystack).span(1..3).anchored(Anchored::Yes); /// re.search(&mut cache, &input, &mut caps); /// // No match is found since we start searching at offset 1 which /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match /// // is found. /// assert_eq!(None, caps.get_match()); /// /// let re = PikeVM::new(r"a")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let input = Input::new(haystack).span(1..3).anchored(Anchored::No); /// re.search(&mut cache, &input, &mut caps); /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the /// // pattern. Even though the search starts at 'b', the 'match anything' /// // prefix allows the search to match 'a'. /// let expected = Some(Match::must(0, 2..3)); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn anchored(mut self, mode: Anchored) -> Input<'h> { self.set_anchored(mode); self } /// Whether to execute an "earliest" search or not. /// /// When running a non-overlapping search, an "earliest" search will return /// the match location as early as possible. For example, given a pattern /// of `foo[0-9]+` and a haystack of `foo12345`, a normal leftmost search /// will return `foo12345` as a match. 
But an "earliest" search for regex /// engines that support "earliest" semantics will return `foo1` as a /// match, since as soon as the first digit following `foo` is seen, it is /// known to have found a match. /// /// Note that "earliest" semantics generally depend on the regex engine. /// Different regex engines may determine there is a match at different /// points. So there is no guarantee that "earliest" matches will always /// return the same offsets for all regex engines. The "earliest" notion /// is really about when the particular regex engine determines there is /// a match rather than a consistent semantic unto itself. This is often /// useful for implementing "did a match occur or not" predicates, but /// sometimes the offset is useful as well. /// /// This is disabled by default. /// /// # Example /// /// This example shows the difference between "earliest" searching and /// normal searching. /// /// ``` /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input}; /// /// let re = PikeVM::new(r"foo[0-9]+")?; /// let mut cache = re.create_cache(); /// let mut caps = re.create_captures(); /// /// // A normal search implements greediness like you expect. /// let input = Input::new("foo12345"); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match()); /// /// // When 'earliest' is enabled and the regex engine supports /// // it, the search will bail once it knows a match has been /// // found. /// let input = Input::new("foo12345").earliest(true); /// re.search(&mut cache, &input, &mut caps); /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match()); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn earliest(mut self, yes: bool) -> Input<'h> { self.set_earliest(yes); self } /// Set the span for this search configuration. /// /// This is like the [`Input::span`] method, except this mutates the /// span in place. /// /// This routine is generic over how a span is provided. 
While /// a [`Span`] may be given directly, one may also provide a /// `std::ops::Range`. /// /// # Panics /// /// This panics if the given span does not correspond to valid bounds in /// the haystack or the termination of a search. /// /// # Example /// /// ``` /// use regex_automata::Input; /// /// let mut input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// input.set_span(2..4); /// assert_eq!(2..4, input.get_range()); /// ``` #[inline] pub fn set_span>(&mut self, span: S) { let span = span.into(); assert!( span.end <= self.haystack.len() && span.start <= span.end.wrapping_add(1), "invalid span {:?} for haystack of length {}", span, self.haystack.len(), ); self.span = span; } /// Set the span for this search configuration given any range. /// /// This is like the [`Input::range`] method, except this mutates the /// span in place. /// /// This routine does not panic if the range given is not a valid range for /// this search's haystack. If this search is run with an invalid range, /// then the most likely outcome is that the actual search execution will /// panic. /// /// # Panics /// /// This routine will panic if the given range could not be converted /// to a valid [`Range`]. For example, this would panic when given /// `0..=usize::MAX` since it cannot be represented using a half-open /// interval in terms of `usize`. /// /// This also panics if the given span does not correspond to valid bounds /// in the haystack or the termination of a search. /// /// # Example /// /// ``` /// use regex_automata::Input; /// /// let mut input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// input.set_range(2..=4); /// assert_eq!(2..5, input.get_range()); /// ``` #[inline] pub fn set_range>(&mut self, range: R) { use core::ops::Bound; // It's a little weird to convert ranges into spans, and then spans // back into ranges when we actually slice the haystack. 
Because // of that process, we always represent everything as a half-open // internal. Therefore, handling things like m..=n is a little awkward. let start = match range.start_bound() { Bound::Included(&i) => i, // Can this case ever happen? Range syntax doesn't support it... Bound::Excluded(&i) => i.checked_add(1).unwrap(), Bound::Unbounded => 0, }; let end = match range.end_bound() { Bound::Included(&i) => i.checked_add(1).unwrap(), Bound::Excluded(&i) => i, Bound::Unbounded => self.haystack().len(), }; self.set_span(Span { start, end }); } /// Set the starting offset for the span for this search configuration. /// /// This is a convenience routine for only mutating the start of a span /// without having to set the entire span. /// /// # Panics /// /// This panics if the span resulting from the new start position does not /// correspond to valid bounds in the haystack or the termination of a /// search. /// /// # Example /// /// ``` /// use regex_automata::Input; /// /// let mut input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// input.set_start(5); /// assert_eq!(5..6, input.get_range()); /// ``` #[inline] pub fn set_start(&mut self, start: usize) { self.set_span(Span { start, ..self.get_span() }); } /// Set the ending offset for the span for this search configuration. /// /// This is a convenience routine for only mutating the end of a span /// without having to set the entire span. /// /// # Panics /// /// This panics if the span resulting from the new end position does not /// correspond to valid bounds in the haystack or the termination of a /// search. /// /// # Example /// /// ``` /// use regex_automata::Input; /// /// let mut input = Input::new("foobar"); /// assert_eq!(0..6, input.get_range()); /// input.set_end(5); /// assert_eq!(0..5, input.get_range()); /// ``` #[inline] pub fn set_end(&mut self, end: usize) { self.set_span(Span { end, ..self.get_span() }); } /// Set the anchor mode of a search. 
///
/// Behaves like [`Input::anchored`], but updates this search
/// configuration in place instead of consuming and returning it.
///
/// # Example
///
/// ```
/// use regex_automata::{Anchored, Input, PatternID};
///
/// let mut input = Input::new("foobar");
/// assert_eq!(Anchored::No, input.get_anchored());
///
/// let pid = PatternID::must(5);
/// input.set_anchored(Anchored::Pattern(pid));
/// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
/// ```
#[inline]
pub fn set_anchored(&mut self, mode: Anchored) {
    self.anchored = mode;
}

/// Toggle "earliest" mode for this search.
///
/// Behaves like [`Input::earliest`], but updates this search
/// configuration in place instead of consuming and returning it.
///
/// # Example
///
/// ```
/// use regex_automata::Input;
///
/// let mut input = Input::new("foobar");
/// assert!(!input.get_earliest());
/// input.set_earliest(true);
/// assert!(input.get_earliest());
/// ```
#[inline]
pub fn set_earliest(&mut self, yes: bool) {
    self.earliest = yes;
}

/// Borrow the haystack being searched as a byte slice.
///
/// # Example
///
/// ```
/// use regex_automata::Input;
///
/// let input = Input::new("foobar");
/// assert_eq!(b"foobar", input.haystack());
/// ```
#[inline]
pub fn haystack(&self) -> &[u8] {
    self.haystack
}

/// The offset at which this search begins.
///
/// Shorthand for reading the `start` field of [`Input::get_span`].
///
/// While [`Input::is_done`] is `false`, the returned offset is never
/// greater than [`Input::end`]. Once the search is done, it is exactly
/// one past [`Input::end`].
///
/// # Example
///
/// ```
/// use regex_automata::Input;
///
/// let input = Input::new("foobar");
/// assert_eq!(0, input.start());
///
/// let input = Input::new("foobar").span(2..4);
/// assert_eq!(2, input.start());
/// ```
#[inline]
pub fn start(&self) -> usize {
    let span = self.get_span();
    span.start
}

/// Return the end position of this search.
///
/// This is a convenience routine for `search.get_span().end()`.
///
/// This is guaranteed to return an offset that is a valid exclusive end
/// bound for this input's haystack.
///
/// # Example
///
/// ```
/// use regex_automata::Input;
///
/// let input = Input::new("foobar");
/// assert_eq!(6, input.end());
///
/// let input = Input::new("foobar").span(2..4);
/// assert_eq!(4, input.end());
/// ```
#[inline]
pub fn end(&self) -> usize {
    self.get_span().end
}

/// Return the span for this search configuration.
///
/// If one was not explicitly set, then the span corresponds to the entire
/// range of the haystack.
///
/// When [`Input::is_done`] is `false`, the span returned is guaranteed
/// to correspond to valid bounds for this input's haystack.
///
/// # Example
///
/// ```
/// use regex_automata::{Input, Span};
///
/// let input = Input::new("foobar");
/// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
/// ```
#[inline]
pub fn get_span(&self) -> Span {
    self.span
}

/// Return the span as a range for this search configuration.
///
/// If one was not explicitly set, then the span corresponds to the entire
/// range of the haystack.
///
/// When [`Input::is_done`] is `false`, the range returned is guaranteed
/// to correspond to valid bounds for this input's haystack.
///
/// # Example
///
/// ```
/// use regex_automata::Input;
///
/// let input = Input::new("foobar");
/// assert_eq!(0..6, input.get_range());
/// ```
#[inline]
pub fn get_range(&self) -> Range<usize> {
    self.get_span().range()
}

/// Return the anchored mode for this search configuration.
///
/// If no anchored mode was set, then it defaults to [`Anchored::No`].
///
/// # Example
///
/// ```
/// use regex_automata::{Anchored, Input, PatternID};
///
/// let mut input = Input::new("foobar");
/// assert_eq!(Anchored::No, input.get_anchored());
///
/// let pid = PatternID::must(5);
/// input.set_anchored(Anchored::Pattern(pid));
/// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
/// ```
#[inline]
pub fn get_anchored(&self) -> Anchored {
    self.anchored
}

/// Report whether "earliest" mode is enabled for this search.
///
/// # Example
///
/// ```
/// use regex_automata::Input;
///
/// let input = Input::new("foobar");
/// assert!(!input.get_earliest());
/// ```
#[inline]
pub fn get_earliest(&self) -> bool {
    self.earliest
}

/// Report whether this search is exhausted, i.e. whether it can never
/// produce another match.
///
/// A search is exhausted exactly when its start offset has moved past its
/// end offset. A start equal to the end is *not* exhausted.
///
/// # Example
///
/// ```
/// use regex_automata::Input;
///
/// let mut input = Input::new("foobar");
/// assert!(!input.is_done());
/// input.set_start(6);
/// assert!(!input.is_done());
/// input.set_start(7);
/// assert!(input.is_done());
/// ```
#[inline]
pub fn is_done(&self) -> bool {
    let Span { start, end } = self.get_span();
    start > end
}

/// Returns true if and only if the given offset in this search's haystack
/// falls on a valid UTF-8 encoded codepoint boundary.
///
/// If the haystack is not valid UTF-8, then the behavior of this routine
/// is unspecified.
///
/// # Example
///
/// This shows where codepoint boundaries do and don't exist in valid
/// UTF-8.
/// /// ``` /// use regex_automata::Input; /// /// let input = Input::new("☃"); /// assert!(input.is_char_boundary(0)); /// assert!(!input.is_char_boundary(1)); /// assert!(!input.is_char_boundary(2)); /// assert!(input.is_char_boundary(3)); /// assert!(!input.is_char_boundary(4)); /// ``` #[inline] pub fn is_char_boundary(&self, offset: usize) -> bool { utf8::is_boundary(self.haystack(), offset) } } impl<'h> core::fmt::Debug for Input<'h> { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { use crate::util::escape::DebugHaystack; f.debug_struct("Input") .field("haystack", &DebugHaystack(self.haystack())) .field("span", &self.span) .field("anchored", &self.anchored) .field("earliest", &self.earliest) .finish() } } impl<'h, H: ?Sized + AsRef<[u8]>> From<&'h H> for Input<'h> { fn from(haystack: &'h H) -> Input<'h> { Input::new(haystack) } } /// A representation of a span reported by a regex engine. /// /// A span corresponds to the starting and ending _byte offsets_ of a /// contiguous region of bytes. The starting offset is inclusive while the /// ending offset is exclusive. That is, a span is a half-open interval. /// /// A span is used to report the offsets of a match, but it is also used to /// convey which region of a haystack should be searched via routines like /// [`Input::span`]. /// /// This is basically equivalent to a `std::ops::Range`, except this /// type implements `Copy` which makes it more ergonomic to use in the context /// of this crate. Like a range, this implements `Index` for `[u8]` and `str`, /// and `IndexMut` for `[u8]`. For convenience, this also impls `From`, /// which means things like `Span::from(5..10)` work. #[derive(Clone, Copy, Eq, Hash, PartialEq)] pub struct Span { /// The start offset of the span, inclusive. pub start: usize, /// The end offset of the span, exclusive. pub end: usize, } impl Span { /// Returns this span as a range. 
#[inline] pub fn range(&self) -> Range { Range::from(*self) } /// Returns true when this span is empty. That is, when `start >= end`. #[inline] pub fn is_empty(&self) -> bool { self.start >= self.end } /// Returns the length of this span. /// /// This returns `0` in precisely the cases that `is_empty` returns `true`. #[inline] pub fn len(&self) -> usize { self.end.saturating_sub(self.start) } /// Returns true when the given offset is contained within this span. /// /// Note that an empty span contains no offsets and will always return /// false. #[inline] pub fn contains(&self, offset: usize) -> bool { !self.is_empty() && self.start <= offset && offset <= self.end } /// Returns a new span with `offset` added to this span's `start` and `end` /// values. #[inline] pub fn offset(&self, offset: usize) -> Span { Span { start: self.start + offset, end: self.end + offset } } } impl core::fmt::Debug for Span { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "{}..{}", self.start, self.end) } } impl core::ops::Index for [u8] { type Output = [u8]; #[inline] fn index(&self, index: Span) -> &[u8] { &self[index.range()] } } impl core::ops::IndexMut for [u8] { #[inline] fn index_mut(&mut self, index: Span) -> &mut [u8] { &mut self[index.range()] } } impl core::ops::Index for str { type Output = str; #[inline] fn index(&self, index: Span) -> &str { &self[index.range()] } } impl From> for Span { #[inline] fn from(range: Range) -> Span { Span { start: range.start, end: range.end } } } impl From for Range { #[inline] fn from(span: Span) -> Range { Range { start: span.start, end: span.end } } } impl PartialEq> for Span { #[inline] fn eq(&self, range: &Range) -> bool { self.start == range.start && self.end == range.end } } impl PartialEq for Range { #[inline] fn eq(&self, span: &Span) -> bool { self.start == span.start && self.end == span.end } } /// A representation of "half" of a match reported by a DFA. 
/// /// This is called a "half" match because it only includes the end location (or /// start location for a reverse search) of a match. This corresponds to the /// information that a single DFA scan can report. Getting the other half of /// the match requires a second scan with a reversed DFA. /// /// A half match also includes the pattern that matched. The pattern is /// identified by an ID, which corresponds to its position (starting from `0`) /// relative to other patterns used to construct the corresponding DFA. If only /// a single pattern is provided to the DFA, then all matches are guaranteed to /// have a pattern ID of `0`. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub struct HalfMatch { /// The pattern ID. pattern: PatternID, /// The offset of the match. /// /// For forward searches, the offset is exclusive. For reverse searches, /// the offset is inclusive. offset: usize, } impl HalfMatch { /// Create a new half match from a pattern ID and a byte offset. #[inline] pub fn new(pattern: PatternID, offset: usize) -> HalfMatch { HalfMatch { pattern, offset } } /// Create a new half match from a pattern ID and a byte offset. /// /// This is like [`HalfMatch::new`], but accepts a `usize` instead of a /// [`PatternID`]. This panics if the given `usize` is not representable /// as a `PatternID`. #[inline] pub fn must(pattern: usize, offset: usize) -> HalfMatch { HalfMatch::new(PatternID::new(pattern).unwrap(), offset) } /// Returns the ID of the pattern that matched. /// /// The ID of a pattern is derived from the position in which it was /// originally inserted into the corresponding DFA. The first pattern has /// identifier `0`, and each subsequent pattern is `1`, `2` and so on. #[inline] pub fn pattern(&self) -> PatternID { self.pattern } /// The position of the match. /// /// If this match was produced by a forward search, then the offset is /// exclusive. If this match was produced by a reverse search, then the /// offset is inclusive. 
#[inline] pub fn offset(&self) -> usize { self.offset } } /// A representation of a match reported by a regex engine. /// /// A match has two essential pieces of information: the [`PatternID`] that /// matches, and the [`Span`] of the match in a haystack. /// /// The pattern is identified by an ID, which corresponds to its position /// (starting from `0`) relative to other patterns used to construct the /// corresponding regex engine. If only a single pattern is provided, then all /// matches are guaranteed to have a pattern ID of `0`. /// /// Every match reported by a regex engine guarantees that its span has its /// start offset as less than or equal to its end offset. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub struct Match { /// The pattern ID. pattern: PatternID, /// The underlying match span. span: Span, } impl Match { /// Create a new match from a pattern ID and a span. /// /// This constructor is generic over how a span is provided. While /// a [`Span`] may be given directly, one may also provide a /// `std::ops::Range`. /// /// # Panics /// /// This panics if `end < start`. /// /// # Example /// /// This shows how to create a match for the first pattern in a regex /// object using convenient range syntax. /// /// ``` /// use regex_automata::{Match, PatternID}; /// /// let m = Match::new(PatternID::ZERO, 5..10); /// assert_eq!(0, m.pattern().as_usize()); /// assert_eq!(5, m.start()); /// assert_eq!(10, m.end()); /// ``` #[inline] pub fn new>(pattern: PatternID, span: S) -> Match { let span: Span = span.into(); assert!(span.start <= span.end, "invalid match span"); Match { pattern, span } } /// Create a new match from a pattern ID and a byte offset span. /// /// This constructor is generic over how a span is provided. While /// a [`Span`] may be given directly, one may also provide a /// `std::ops::Range`. /// /// This is like [`Match::new`], but accepts a `usize` instead of a /// [`PatternID`]. 
This panics if the given `usize` is not representable /// as a `PatternID`. /// /// # Panics /// /// This panics if `end < start` or if `pattern > PatternID::MAX`. /// /// # Example /// /// This shows how to create a match for the third pattern in a regex /// object using convenient range syntax. /// /// ``` /// use regex_automata::Match; /// /// let m = Match::must(3, 5..10); /// assert_eq!(3, m.pattern().as_usize()); /// assert_eq!(5, m.start()); /// assert_eq!(10, m.end()); /// ``` #[inline] pub fn must>(pattern: usize, span: S) -> Match { Match::new(PatternID::must(pattern), span) } /// Returns the ID of the pattern that matched. /// /// The ID of a pattern is derived from the position in which it was /// originally inserted into the corresponding regex engine. The first /// pattern has identifier `0`, and each subsequent pattern is `1`, `2` and /// so on. #[inline] pub fn pattern(&self) -> PatternID { self.pattern } /// The starting position of the match. /// /// This is a convenience routine for `Match::span().start`. #[inline] pub fn start(&self) -> usize { self.span().start } /// The ending position of the match. /// /// This is a convenience routine for `Match::span().end`. #[inline] pub fn end(&self) -> usize { self.span().end } /// Returns the match span as a range. /// /// This is a convenience routine for `Match::span().range()`. #[inline] pub fn range(&self) -> core::ops::Range { self.span().range() } /// Returns the span for this match. #[inline] pub fn span(&self) -> Span { self.span } /// Returns true when the span in this match is empty. /// /// An empty match can only be returned when the regex itself can match /// the empty string. #[inline] pub fn is_empty(&self) -> bool { self.span().is_empty() } /// Returns the length of this match. /// /// This returns `0` in precisely the cases that `is_empty` returns `true`. #[inline] pub fn len(&self) -> usize { self.span().len() } } /// A set of `PatternID`s. 
/// /// A set of pattern identifiers is useful for recording which patterns have /// matched a particular haystack. A pattern set _only_ includes pattern /// identifiers. It does not include offset information. /// /// # Example /// /// This shows basic usage of a set. /// /// ``` /// use regex_automata::{PatternID, PatternSet}; /// /// let pid1 = PatternID::must(5); /// let pid2 = PatternID::must(8); /// // Create a new empty set. /// let mut set = PatternSet::new(10); /// // Insert pattern IDs. /// set.insert(pid1); /// set.insert(pid2); /// // Test membership. /// assert!(set.contains(pid1)); /// assert!(set.contains(pid2)); /// // Get all members. /// assert_eq!( /// vec![5, 8], /// set.iter().map(|p| p.as_usize()).collect::>(), /// ); /// // Clear the set. /// set.clear(); /// // Test that it is indeed empty. /// assert!(set.is_empty()); /// ``` #[cfg(feature = "alloc")] #[derive(Clone, Debug, Eq, PartialEq)] pub struct PatternSet { /// The number of patterns set to 'true' in this set. len: usize, /// A map from PatternID to boolean of whether a pattern matches or not. /// /// This should probably be a bitset, but it's probably unlikely to matter /// much in practice. /// /// The main downside of this representation (and similarly for a bitset) /// is that iteration scales with the capacity of the set instead of /// the length of the set. This doesn't seem likely to be a problem in /// practice. /// /// Another alternative is to just use a 'SparseSet' for this. It does use /// more memory (quite a bit more), but that seems fine I think compared /// to the memory being used by the regex engine. The real hiccup with /// it is that it yields pattern IDs in the order they were inserted. /// Which is actually kind of nice, but at the time of writing, pattern /// IDs are yielded in ascending order in the regex crate RegexSet API. 
/// If we did change to 'SparseSet', we could provide an additional /// 'iter_match_order' iterator, but keep the ascending order one for /// compatibility. which: alloc::boxed::Box<[bool]>, } #[cfg(feature = "alloc")] impl PatternSet { /// Create a new set of pattern identifiers with the given capacity. /// /// The given capacity typically corresponds to (at least) the number of /// patterns in a compiled regex object. /// /// # Panics /// /// This panics if the given capacity exceeds [`PatternID::LIMIT`]. This is /// impossible if you use the `pattern_len()` method as defined on any of /// the regex engines in this crate. Namely, a regex will fail to build by /// returning an error if the number of patterns given to it exceeds the /// limit. Therefore, the number of patterns in a valid regex is always /// a correct capacity to provide here. pub fn new(capacity: usize) -> PatternSet { assert!( capacity <= PatternID::LIMIT, "pattern set capacity exceeds limit of {}", PatternID::LIMIT, ); PatternSet { len: 0, which: alloc::vec![false; capacity].into_boxed_slice(), } } /// Clear this set such that it contains no pattern IDs. pub fn clear(&mut self) { self.len = 0; for matched in self.which.iter_mut() { *matched = false; } } /// Return true if and only if the given pattern identifier is in this set. pub fn contains(&self, pid: PatternID) -> bool { pid.as_usize() < self.capacity() && self.which[pid] } /// Insert the given pattern identifier into this set and return `true` if /// the given pattern ID was not previously in this set. /// /// If the pattern identifier is already in this set, then this is a no-op. /// /// Use [`PatternSet::try_insert`] for a fallible version of this routine. /// /// # Panics /// /// This panics if this pattern set has insufficient capacity to /// store the given pattern ID. 
pub fn insert(&mut self, pid: PatternID) -> bool { self.try_insert(pid) .expect("PatternSet should have sufficient capacity") } /// Insert the given pattern identifier into this set and return `true` if /// the given pattern ID was not previously in this set. /// /// If the pattern identifier is already in this set, then this is a no-op. /// /// # Errors /// /// This returns an error if this pattern set has insufficient capacity to /// store the given pattern ID. pub fn try_insert( &mut self, pid: PatternID, ) -> Result { if pid.as_usize() >= self.capacity() { return Err(PatternSetInsertError { attempted: pid, capacity: self.capacity(), }); } if self.which[pid] { return Ok(false); } self.len += 1; self.which[pid] = true; Ok(true) } /* // This is currently commented out because it is unused and it is unclear // whether it's useful or not. What's the harm in having it? When, if // we ever wanted to change our representation to a 'SparseSet', then // supporting this method would be a bit tricky. So in order to keep some // API evolution flexibility, we leave it out for now. /// Remove the given pattern identifier from this set. /// /// If the pattern identifier was not previously in this set, then this /// does not change the set and returns `false`. /// /// # Panics /// /// This panics if `pid` exceeds the capacity of this set. pub fn remove(&mut self, pid: PatternID) -> bool { if !self.which[pid] { return false; } self.len -= 1; self.which[pid] = false; true } */ /// Return true if and only if this set has no pattern identifiers in it. pub fn is_empty(&self) -> bool { self.len() == 0 } /// Return true if and only if this set has the maximum number of pattern /// identifiers in the set. This occurs precisely when `PatternSet::len() /// == PatternSet::capacity()`. /// /// This particular property is useful to test because it may allow one to /// stop a search earlier than you might otherwise. 
Namely, if a search is /// only reporting which patterns match a haystack and if you know all of /// the patterns match at a given point, then there's no new information /// that can be learned by continuing the search. (Because a pattern set /// does not keep track of offset information.) pub fn is_full(&self) -> bool { self.len() == self.capacity() } /// Returns the total number of pattern identifiers in this set. pub fn len(&self) -> usize { self.len } /// Returns the total number of pattern identifiers that may be stored /// in this set. /// /// This is guaranteed to be less than or equal to [`PatternID::LIMIT`]. /// /// Typically, the capacity of a pattern set matches the number of patterns /// in a regex object with which you are searching. pub fn capacity(&self) -> usize { self.which.len() } /// Returns an iterator over all pattern identifiers in this set. /// /// The iterator yields pattern identifiers in ascending order, starting /// at zero. pub fn iter(&self) -> PatternSetIter<'_> { PatternSetIter { it: self.which.iter().enumerate() } } } /// An error that occurs when a `PatternID` failed to insert into a /// `PatternSet`. /// /// An insert fails when the given `PatternID` exceeds the configured capacity /// of the `PatternSet`. /// /// This error is created by the [`PatternSet::try_insert`] routine. #[cfg(feature = "alloc")] #[derive(Clone, Debug)] pub struct PatternSetInsertError { attempted: PatternID, capacity: usize, } #[cfg(feature = "std")] impl std::error::Error for PatternSetInsertError {} #[cfg(feature = "alloc")] impl core::fmt::Display for PatternSetInsertError { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!( f, "failed to insert pattern ID {} into pattern set \ with insufficiet capacity of {}", self.attempted.as_usize(), self.capacity, ) } } /// An iterator over all pattern identifiers in a [`PatternSet`]. /// /// The lifetime parameter `'a` refers to the lifetime of the pattern set being /// iterated over. 
/// /// This iterator is created by the [`PatternSet::iter`] method. #[cfg(feature = "alloc")] #[derive(Clone, Debug)] pub struct PatternSetIter<'a> { it: core::iter::Enumerate>, } #[cfg(feature = "alloc")] impl<'a> Iterator for PatternSetIter<'a> { type Item = PatternID; fn next(&mut self) -> Option { while let Some((index, &yes)) = self.it.next() { if yes { // Only valid 'PatternID' values can be inserted into the set // and construction of the set panics if the capacity would // permit storing invalid pattern IDs. Thus, 'yes' is only true // precisely when 'index' corresponds to a valid 'PatternID'. return Some(PatternID::new_unchecked(index)); } } None } fn size_hint(&self) -> (usize, Option) { self.it.size_hint() } } #[cfg(feature = "alloc")] impl<'a> DoubleEndedIterator for PatternSetIter<'a> { fn next_back(&mut self) -> Option { while let Some((index, &yes)) = self.it.next_back() { if yes { // Only valid 'PatternID' values can be inserted into the set // and construction of the set panics if the capacity would // permit storing invalid pattern IDs. Thus, 'yes' is only true // precisely when 'index' corresponds to a valid 'PatternID'. return Some(PatternID::new_unchecked(index)); } } None } } /// The type of anchored search to perform. /// /// This is *almost* a boolean option. That is, you can either do an unanchored /// search for any pattern in a regex, or you can do an anchored search for any /// pattern in a regex. /// /// A third option exists that, assuming the regex engine supports it, permits /// you to do an anchored search for a specific pattern. /// /// Note that there is no way to run an unanchored search for a specific /// pattern. If you need that, you'll need to build separate regexes for each /// pattern. /// /// # Errors /// /// If a regex engine does not support the anchored mode selected, then the /// regex engine will return an error. 
While any non-trivial regex engine /// should support at least one of the available anchored modes, there is no /// singular mode that is guaranteed to be universally supported. Some regex /// engines might only support unanchored searches (DFAs compiled without /// anchored starting states) and some regex engines might only support /// anchored searches (like the one-pass DFA). /// /// The specific error returned is a [`MatchError`] with a /// [`MatchErrorKind::UnsupportedAnchored`] kind. The kind includes the /// `Anchored` value given that is unsupported. /// /// Note that regex engines should report "no match" if, for example, an /// `Anchored::Pattern` is provided with an invalid pattern ID _but_ where /// anchored searches for a specific pattern are supported. This smooths out /// behavior such that it's possible to guarantee that an error never occurs /// based on how the regex engine is configured. All regex engines in this /// crate report "no match" when searching for an invalid pattern ID, but where /// searching for a valid pattern ID is otherwise supported. /// /// # Example /// /// This example shows how to use the various `Anchored` modes to run a /// search. We use the [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM) /// because it supports all modes unconditionally. Some regex engines, like /// the [`onepass::DFA`](crate::dfa::onepass::DFA) cannot support unanchored /// searches. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Anchored, Input, Match, PatternID, /// }; /// /// let re = PikeVM::new_many(&[ /// r"Mrs. \w+", /// r"Miss \w+", /// r"Mr. \w+", /// r"Ms. \w+", /// ])?; /// let mut cache = re.create_cache(); /// let hay = "Hello Mr. Springsteen!"; /// /// // The default is to do an unanchored search. /// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, hay)); /// // Explicitly ask for an unanchored search. Same as above.
/// let input = Input::new(hay).anchored(Anchored::No);
/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
///
/// // Now try an anchored search. Since the match doesn't start at the
/// // beginning of the haystack, no match is found!
/// let input = Input::new(hay).anchored(Anchored::Yes);
/// assert_eq!(None, re.find(&mut cache, input));
///
/// // We can try an anchored search again, but move the location of where
/// // we start the search. Note that the offsets reported are still in
/// // terms of the overall haystack and not relative to where we started
/// // the search.
/// let input = Input::new(hay).anchored(Anchored::Yes).range(6..);
/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
///
/// // Now try an anchored search for a specific pattern. We specifically
/// // choose a pattern that we know doesn't match to prove that the search
/// // only looks for the pattern we provide.
/// let input = Input::new(hay)
///     .anchored(Anchored::Pattern(PatternID::must(1)))
///     .range(6..);
/// assert_eq!(None, re.find(&mut cache, input));
///
/// // But if we switch it to the pattern that we know matches, then we find
/// // the match.
/// let input = Input::new(hay)
///     .anchored(Anchored::Pattern(PatternID::must(2)))
///     .range(6..);
/// assert_eq!(Some(Match::must(2, 6..21)), re.find(&mut cache, input));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Anchored {
    /// Run an unanchored search. This means a match may occur anywhere at or
    /// after the start position of the search.
    ///
    /// This search can return a match for any pattern in the regex.
    No,
    /// Run an anchored search. This means that a match must begin at the
    /// start position of the search.
    ///
    /// This search can return a match for any pattern in the regex.
    Yes,
    /// Run an anchored search for a specific pattern.
This means that a match /// must be for the given pattern and must begin at the start position of /// the search. Pattern(PatternID), } impl Anchored { /// Returns true if and only if this anchor mode corresponds to any kind of /// anchored search. /// /// # Example /// /// This examples shows that both `Anchored::Yes` and `Anchored::Pattern` /// are considered anchored searches. /// /// ``` /// use regex_automata::{Anchored, PatternID}; /// /// assert!(!Anchored::No.is_anchored()); /// assert!(Anchored::Yes.is_anchored()); /// assert!(Anchored::Pattern(PatternID::ZERO).is_anchored()); /// ``` #[inline] pub fn is_anchored(&self) -> bool { matches!(*self, Anchored::Yes | Anchored::Pattern(_)) } /// Returns the pattern ID associated with this configuration if it is an /// anchored search for a specific pattern. Otherwise `None` is returned. /// /// # Example /// /// ``` /// use regex_automata::{Anchored, PatternID}; /// /// assert_eq!(None, Anchored::No.pattern()); /// assert_eq!(None, Anchored::Yes.pattern()); /// /// let pid = PatternID::must(5); /// assert_eq!(Some(pid), Anchored::Pattern(pid).pattern()); /// ``` #[inline] pub fn pattern(&self) -> Option { match *self { Anchored::Pattern(pid) => Some(pid), _ => None, } } } /// The kind of match semantics to use for a regex pattern. /// /// The default match kind is `LeftmostFirst`, and this corresponds to the /// match semantics used by most backtracking engines, such as Perl. /// /// # Leftmost first or "preference order" match semantics /// /// Leftmost-first semantics determine which match to report when there are /// multiple paths through a regex that match at the same position. The tie is /// essentially broken by how a backtracker would behave. For example, consider /// running the regex `foofoofoo|foofoo|foo` on the haystack `foofoo`. In this /// case, both the `foofoo` and `foo` branches match at position `0`. So should /// the end of the match be `3` or `6`? 
/// /// A backtracker will conceptually work by trying `foofoofoo` and failing. /// Then it will try `foofoo`, find the match and stop there. Thus, the /// leftmost-first match position is `6`. This is called "leftmost-first" or /// "preference order" because the order of the branches as written in the /// regex pattern is what determines how to break the tie. /// /// (Note that leftmost-longest match semantics, which break ties by always /// taking the longest matching string, are not currently supported by this /// crate. These match semantics tend to be found in POSIX regex engines.) /// /// This example shows how leftmost-first semantics work, and how it even /// applies to multi-pattern regexes: /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Match, /// }; /// /// let re = PikeVM::new_many(&[ /// r"foofoofoo", /// r"foofoo", /// r"foo", /// ])?; /// let mut cache = re.create_cache(); /// let got: Vec<Match> = re.find_iter(&mut cache, "foofoo").collect(); /// let expected = vec![Match::must(1, 0..6)]; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # All matches /// /// The `All` match semantics report any and all matches, and generally will /// attempt to match as much as possible. It doesn't respect any sort of match /// priority at all, so things like non-greedy matching don't work in this /// mode. /// /// The fact that non-greedy matching doesn't work generally makes most forms /// of unanchored non-overlapping searches have unintuitive behavior. Namely, /// unanchored searches behave as if there is a `(?s-u:.)*?` prefix at the /// beginning of the pattern, which is specifically non-greedy. Since it will /// be treated as greedy in `All` match semantics, this generally means that /// it will first attempt to consume all of the haystack and is likely to wind /// up skipping matches.
/// /// Generally speaking, `All` should only be used in two circumstances: /// /// * When running an anchored search and there is a desire to match as much as /// possible. For example, when building a reverse regex matcher to find the /// start of a match after finding the end. In this case, the reverse search /// is anchored to the end of the match found by the forward search. /// * When running overlapping searches. Since `All` encodes all possible /// matches, this is generally what you want for an overlapping search. If you /// try to use leftmost-first in an overlapping search, it is likely to produce /// counter-intuitive results since leftmost-first specifically excludes some /// matches from its underlying finite state machine. /// /// This example demonstrates the counter-intuitive behavior of `All` semantics /// when using a standard leftmost unanchored search: /// /// ``` /// use regex_automata::{ /// nfa::thompson::pikevm::PikeVM, /// Match, MatchKind, /// }; /// /// let re = PikeVM::builder() /// .configure(PikeVM::config().match_kind(MatchKind::All)) /// .build("foo")?; /// let hay = "first foo second foo wat"; /// let mut cache = re.create_cache(); /// let got: Vec<Match> = re.find_iter(&mut cache, hay).collect(); /// // Notice that it completely skips the first 'foo'! /// let expected = vec![Match::must(0, 17..20)]; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// This second example shows how `All` semantics are useful for an overlapping /// search. Note that we use lower level lazy DFA APIs here since the NFA /// engines only currently support a very limited form of overlapping search. /// /// ``` /// use regex_automata::{ /// hybrid::dfa::{DFA, OverlappingState}, /// HalfMatch, Input, MatchKind, /// }; /// /// let re = DFA::builder() /// // If we didn't set 'All' semantics here, then the regex would only /// // match 'foo' at offset 3 and nothing else. Why?
Because the state
///     // machine implements preference order and knows that the 'foofoo' and
///     // 'foofoofoo' branches can never match since 'foo' will always match
///     // when they match and take priority.
///     .configure(DFA::config().match_kind(MatchKind::All))
///     .build(r"foo|foofoo|foofoofoo")?;
/// let mut cache = re.create_cache();
/// let mut state = OverlappingState::start();
/// let input = Input::new("foofoofoo");
/// let mut got = vec![];
/// loop {
///     re.try_search_overlapping_fwd(&mut cache, &input, &mut state)?;
///     let m = match state.get_match() {
///         None => break,
///         Some(m) => m,
///     };
///     got.push(m);
/// }
/// let expected = vec![
///     HalfMatch::must(0, 3),
///     HalfMatch::must(0, 6),
///     HalfMatch::must(0, 9),
/// ];
/// assert_eq!(expected, got);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[non_exhaustive]
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum MatchKind {
    /// Report all possible matches.
    All,
    /// Report only the leftmost matches. When multiple leftmost matches exist,
    /// report the match corresponding to the part of the regex that appears
    /// first in the syntax.
    LeftmostFirst,
    // There is prior art in RE2 that shows that we should be able to add
    // LeftmostLongest too. The tricky part of it is supporting ungreedy
    // repetitions. Instead of treating all NFA states as having equivalent
    // priority (as in 'All') or treating all NFA states as having distinct
    // priority based on order (as in 'LeftmostFirst'), we instead group NFA
    // states into sets, and treat members of each set as having equivalent
    // priority, but having greater priority than all following members
    // of different sets.
    //
    // However, it's not clear whether it's really worth adding this. After
    // all, leftmost-longest can be emulated when using literals by using
    // leftmost-first and sorting the literals by length in descending order.
    // However, this won't work for arbitrary regexes.
e.g., `\w|\w\w` will // always match `a` in `ab` when using leftmost-first, but leftmost-longest // would match `ab`. } impl MatchKind { #[cfg(feature = "alloc")] pub(crate) fn continue_past_first_match(&self) -> bool { *self == MatchKind::All } } impl Default for MatchKind { fn default() -> MatchKind { MatchKind::LeftmostFirst } } /// An error indicating that a search stopped before reporting whether a /// match exists or not. /// /// To be very clear, this error type implies that one cannot assume that no /// matches occur, since the search stopped before completing. That is, if /// you're looking for information about where a search determined that no /// match can occur, then this error type does *not* give you that. (Indeed, at /// the time of writing, if you need such a thing, you have to write your own /// search routine.) /// /// Normally, when one searches for something, the response is either an /// affirmative "it was found at this location" or a negative "not found at /// all." However, in some cases, a regex engine can be configured to stop its /// search before concluding whether a match exists or not. When this happens, /// it may be important for the caller to know why the regex engine gave up and /// where in the input it gave up at. This error type exposes the 'why' and the /// 'where.' /// /// For example, the DFAs provided by this library generally cannot correctly /// implement Unicode word boundaries. Instead, they provide an option to /// eagerly support them on ASCII text (since Unicode word boundaries are /// equivalent to ASCII word boundaries when searching ASCII text), but will /// "give up" if a non-ASCII byte is seen. In such cases, one is usually /// required to either report the failure to the caller (unergonomic) or /// otherwise fall back to some other regex engine (ergonomic, but potentially /// costly). 
/// /// More generally, some regex engines offer the ability for callers to specify /// certain bytes that will trigger the regex engine to automatically quit if /// they are seen. /// /// Still yet, there may be other reasons for a failed match. For example, /// the hybrid DFA provided by this crate can be configured to give up if it /// believes that it is not efficient. This in turn permits callers to choose a /// different regex engine. /// /// (Note that DFAs are configured by default to never quit or give up in this /// fashion. For example, by default, a DFA will fail to build if the regex /// pattern contains a Unicode word boundary. One needs to opt into the "quit" /// behavior via options, like /// [`hybrid::dfa::Config::unicode_word_boundary`](crate::hybrid::dfa::Config::unicode_word_boundary).) /// /// There are a couple other ways a search /// can fail. For example, when using the /// [`BoundedBacktracker`](crate::nfa::thompson::backtrack::BoundedBacktracker) /// with a haystack that is too long, or trying to run an unanchored search /// with a [one-pass DFA](crate::dfa::onepass). #[derive(Clone, Debug, Eq, PartialEq)] pub struct MatchError( #[cfg(feature = "alloc")] alloc::boxed::Box, #[cfg(not(feature = "alloc"))] MatchErrorKind, ); impl MatchError { /// Create a new error value with the given kind. /// /// This is a more verbose version of the kind-specific constructors, /// e.g., `MatchError::quit`. pub fn new(kind: MatchErrorKind) -> MatchError { #[cfg(feature = "alloc")] { MatchError(alloc::boxed::Box::new(kind)) } #[cfg(not(feature = "alloc"))] { MatchError(kind) } } /// Returns a reference to the underlying error kind. pub fn kind(&self) -> &MatchErrorKind { &self.0 } /// Create a new "quit" error. The given `byte` corresponds to the value /// that tripped a search's quit condition, and `offset` corresponds to the /// location in the haystack at which the search quit. 
/// /// This is the same as calling `MatchError::new` with a /// [`MatchErrorKind::Quit`] kind. pub fn quit(byte: u8, offset: usize) -> MatchError { MatchError::new(MatchErrorKind::Quit { byte, offset }) } /// Create a new "gave up" error. The given `offset` corresponds to the /// location in the haystack at which the search gave up. /// /// This is the same as calling `MatchError::new` with a /// [`MatchErrorKind::GaveUp`] kind. pub fn gave_up(offset: usize) -> MatchError { MatchError::new(MatchErrorKind::GaveUp { offset }) } /// Create a new "haystack too long" error. The given `len` corresponds to /// the length of the haystack that was problematic. /// /// This is the same as calling `MatchError::new` with a /// [`MatchErrorKind::HaystackTooLong`] kind. pub fn haystack_too_long(len: usize) -> MatchError { MatchError::new(MatchErrorKind::HaystackTooLong { len }) } /// Create a new "unsupported anchored" error. This occurs when the caller /// requests a search with an anchor mode that is not supported by the /// regex engine. /// /// This is the same as calling `MatchError::new` with a /// [`MatchErrorKind::UnsupportedAnchored`] kind. pub fn unsupported_anchored(mode: Anchored) -> MatchError { MatchError::new(MatchErrorKind::UnsupportedAnchored { mode }) } } /// The underlying kind of a [`MatchError`]. /// /// This is a **non-exhaustive** enum. That means new variants may be added in /// a semver-compatible release. #[non_exhaustive] #[derive(Clone, Debug, Eq, PartialEq)] pub enum MatchErrorKind { /// The search saw a "quit" byte at which it was instructed to stop /// searching. Quit { /// The "quit" byte that was observed that caused the search to stop. byte: u8, /// The offset at which the quit byte was observed. offset: usize, }, /// The search, based on heuristics, determined that it would be better /// to stop, typically to provide the caller an opportunity to use an /// alternative regex engine. 
/// /// Currently, the only way for this to occur is via the lazy DFA and /// only when it is configured to do so (it will not return this error by /// default). GaveUp { /// The offset at which the search stopped. This corresponds to the /// position immediately following the last byte scanned. offset: usize, }, /// This error occurs if the haystack given to the regex engine was too /// long to be searched. This occurs, for example, with regex engines /// like the bounded backtracker that have a configurable fixed amount of /// capacity that is tied to the length of the haystack. Anything beyond /// that configured limit will result in an error at search time. HaystackTooLong { /// The length of the haystack that exceeded the limit. len: usize, }, /// An error indicating that a particular type of anchored search was /// requested, but that the regex engine does not support it. /// /// Note that this error should not be returned by a regex engine simply /// because the pattern ID is invalid (i.e., equal to or exceeds the number /// of patterns in the regex). In that case, the regex engine should report /// a non-match. UnsupportedAnchored { /// The anchored mode given that is unsupported. 
mode: Anchored,
    },
}

#[cfg(feature = "std")]
impl std::error::Error for MatchError {}

impl core::fmt::Display for MatchError {
    /// Writes a human readable message for each kind of match error.
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        match *self.kind() {
            MatchErrorKind::Quit { byte, offset } => write!(
                f,
                "quit search after observing byte {:?} at offset {}",
                DebugByte(byte),
                offset,
            ),
            MatchErrorKind::GaveUp { offset } => {
                write!(f, "gave up searching at offset {}", offset)
            }
            MatchErrorKind::HaystackTooLong { len } => {
                write!(f, "haystack of length {} is too long", len)
            }
            MatchErrorKind::UnsupportedAnchored { mode: Anchored::Yes } => {
                write!(f, "anchored searches are not supported or enabled")
            }
            MatchErrorKind::UnsupportedAnchored { mode: Anchored::No } => {
                write!(f, "unanchored searches are not supported or enabled")
            }
            MatchErrorKind::UnsupportedAnchored {
                mode: Anchored::Pattern(pid),
            } => {
                write!(
                    f,
                    "anchored searches for a specific pattern ({}) are \
                     not supported or enabled",
                    pid.as_usize(),
                )
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // We test that our 'MatchError' type is the size we expect. This isn't an
    // API guarantee, but if the size increases, we really want to make sure we
    // decide to do that intentionally. So this should be a speed bump. And in
    // general, we should not increase the size without a very good reason.
    //
    // Why? Because low level search APIs return Result<.., MatchError>. When
    // MatchError gets bigger, so too does the Result type.
    //
    // Now, when 'alloc' is enabled, we do box the error, which de-emphasizes
    // the importance of keeping a small error type. But without 'alloc', we
    // still want things to be small.
    #[test]
    fn match_error_size() {
        let expected_size = if cfg!(feature = "alloc") {
            core::mem::size_of::<usize>()
        } else {
            2 * core::mem::size_of::<usize>()
        };
        assert_eq!(expected_size, core::mem::size_of::<MatchError>());
    }

    // Same as above, but for the underlying match error kind.
#[cfg(target_pointer_width = "64")]
    #[test]
    fn match_error_kind_size() {
        let expected_size = 2 * core::mem::size_of::<usize>();
        assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
    }

    #[cfg(target_pointer_width = "32")]
    #[test]
    fn match_error_kind_size() {
        let expected_size = 3 * core::mem::size_of::<usize>();
        assert_eq!(expected_size, core::mem::size_of::<MatchErrorKind>());
    }

    // Checks that an 'Input' built from a haystack whose 'AsRef' impl returns
    // slices of different lengths on successive calls still has an end bound
    // within the haystack it reports.
    #[test]
    fn incorrect_asref_guard() {
        struct Bad(std::cell::Cell<bool>);

        impl AsRef<[u8]> for Bad {
            fn as_ref(&self) -> &[u8] {
                if self.0.replace(false) {
                    &[]
                } else {
                    &[0; 1000]
                }
            }
        }

        let bad = Bad(std::cell::Cell::new(true));
        let input = Input::new(&bad);
        assert!(input.end() <= input.haystack().len());
    }
}
regex-automata-0.4.9/src/util/sparse_set.rs000064400000000000000000000175231046102023000170530ustar 00000000000000/*!
This module defines a sparse set data structure. Its most interesting
properties are:

* They preserve insertion order.
* Set membership testing is done in constant time.
* Set insertion is done in constant time.
* Clearing the set is done in constant time.

The cost for doing this is that the capacity of the set needs to be known up
front, and the elements in the set are limited to state identifiers.

These sets are principally used when traversing an NFA state graph. This
happens at search time, for example, in the PikeVM. It also happens during DFA
determinization.
*/

use alloc::{vec, vec::Vec};

use crate::util::primitives::StateID;

/// A pair of sparse sets.
///
/// This is useful when one needs to compute NFA epsilon closures from a
/// previous set of states derived from an epsilon closure. One set can be the
/// starting states where as the other set can be the destination states after
/// following the transitions for a particular byte of input.
///
/// There is no significance to 'set1' or 'set2'. They are both sparse sets of
/// the same size.
/// /// The members of this struct are exposed so that callers may borrow 'set1' /// and 'set2' individually without being force to borrow both at the same /// time. #[derive(Clone, Debug)] pub(crate) struct SparseSets { pub(crate) set1: SparseSet, pub(crate) set2: SparseSet, } impl SparseSets { /// Create a new pair of sparse sets where each set has the given capacity. /// /// This panics if the capacity given is bigger than `StateID::LIMIT`. pub(crate) fn new(capacity: usize) -> SparseSets { SparseSets { set1: SparseSet::new(capacity), set2: SparseSet::new(capacity), } } /// Resizes these sparse sets to have the new capacity given. /// /// The sets are automatically cleared. /// /// This panics if the capacity given is bigger than `StateID::LIMIT`. #[inline] pub(crate) fn resize(&mut self, new_capacity: usize) { self.set1.resize(new_capacity); self.set2.resize(new_capacity); } /// Clear both sparse sets. pub(crate) fn clear(&mut self) { self.set1.clear(); self.set2.clear(); } /// Swap set1 with set2. pub(crate) fn swap(&mut self) { core::mem::swap(&mut self.set1, &mut self.set2); } /// Returns the memory usage, in bytes, used by this pair of sparse sets. pub(crate) fn memory_usage(&self) -> usize { self.set1.memory_usage() + self.set2.memory_usage() } } /// A sparse set used for representing ordered NFA states. /// /// This supports constant time addition and membership testing. Clearing an /// entire set can also be done in constant time. Iteration yields elements /// in the order in which they were inserted. /// /// The data structure is based on: https://research.swtch.com/sparse /// Note though that we don't actually use uninitialized memory. We generally /// reuse sparse sets, so the initial allocation cost is bareable. However, its /// other properties listed above are extremely useful. #[derive(Clone)] pub(crate) struct SparseSet { /// The number of elements currently in this set. 
len: usize, /// Dense contains the ids in the order in which they were inserted. dense: Vec, /// Sparse maps ids to their location in dense. /// /// A state ID is in the set if and only if /// sparse[id] < len && id == dense[sparse[id]]. /// /// Note that these are indices into 'dense'. It's a little weird to use /// StateID here, but we know our length can never exceed the bounds of /// StateID (enforced by 'resize') and StateID will be at most 4 bytes /// where as a usize is likely double that in most cases. sparse: Vec, } impl SparseSet { /// Create a new sparse set with the given capacity. /// /// Sparse sets have a fixed size and they cannot grow. Attempting to /// insert more distinct elements than the total capacity of the set will /// result in a panic. /// /// This panics if the capacity given is bigger than `StateID::LIMIT`. #[inline] pub(crate) fn new(capacity: usize) -> SparseSet { let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] }; set.resize(capacity); set } /// Resizes this sparse set to have the new capacity given. /// /// This set is automatically cleared. /// /// This panics if the capacity given is bigger than `StateID::LIMIT`. #[inline] pub(crate) fn resize(&mut self, new_capacity: usize) { assert!( new_capacity <= StateID::LIMIT, "sparse set capacity cannot excced {:?}", StateID::LIMIT ); self.clear(); self.dense.resize(new_capacity, StateID::ZERO); self.sparse.resize(new_capacity, StateID::ZERO); } /// Returns the capacity of this set. /// /// The capacity represents a fixed limit on the number of distinct /// elements that are allowed in this set. The capacity cannot be changed. #[inline] pub(crate) fn capacity(&self) -> usize { self.dense.len() } /// Returns the number of elements in this set. #[inline] pub(crate) fn len(&self) -> usize { self.len } /// Returns true if and only if this set is empty. 
#[inline] pub(crate) fn is_empty(&self) -> bool { self.len() == 0 } /// Insert the state ID value into this set and return true if the given /// state ID was not previously in this set. /// /// This operation is idempotent. If the given value is already in this /// set, then this is a no-op. /// /// If more than `capacity` ids are inserted, then this panics. /// /// This is marked as inline(always) since the compiler won't inline it /// otherwise, and it's a fairly hot piece of code in DFA determinization. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn insert(&mut self, id: StateID) -> bool { if self.contains(id) { return false; } let i = self.len(); assert!( i < self.capacity(), "{:?} exceeds capacity of {:?} when inserting {:?}", i, self.capacity(), id, ); // OK since i < self.capacity() and self.capacity() is guaranteed to // be <= StateID::LIMIT. let index = StateID::new_unchecked(i); self.dense[index] = id; self.sparse[id] = index; self.len += 1; true } /// Returns true if and only if this set contains the given value. #[inline] pub(crate) fn contains(&self, id: StateID) -> bool { let index = self.sparse[id]; index.as_usize() < self.len() && self.dense[index] == id } /// Clear this set such that it has no members. #[inline] pub(crate) fn clear(&mut self) { self.len = 0; } #[inline] pub(crate) fn iter(&self) -> SparseSetIter<'_> { SparseSetIter(self.dense[..self.len()].iter()) } /// Returns the heap memory usage, in bytes, used by this sparse set. #[inline] pub(crate) fn memory_usage(&self) -> usize { self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE } } impl core::fmt::Debug for SparseSet { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { let elements: Vec = self.iter().collect(); f.debug_tuple("SparseSet").field(&elements).finish() } } /// An iterator over all elements in a sparse set. /// /// The lifetime `'a` refers to the lifetime of the set being iterated over. 
#[derive(Debug)]
pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);

impl<'a> Iterator for SparseSetIter<'a> {
    type Item = StateID;

    /// Yields the next state ID by value, in insertion order.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn next(&mut self) -> Option<StateID> {
        self.0.next().copied()
    }
}
regex-automata-0.4.9/src/util/start.rs000064400000000000000000000427731046102023000160450ustar 00000000000000/*!
Provides helpers for dealing with start state configurations in DFAs.
*/

use crate::util::{
    look::LookMatcher,
    search::{Anchored, Input},
    wire::{self, DeserializeError, SerializeError},
};

/// The configuration used to determine a DFA's start state for a search.
///
/// A DFA has a single starting state in the typical textbook description. That
/// is, it corresponds to the set of all starting states for the NFA that built
/// it, along with their epsilon closures. In this crate, however, DFAs have
/// many possible start states due to a few factors:
///
/// * DFAs support the ability to run either anchored or unanchored searches.
/// Each type of search needs its own start state. For example, an unanchored
/// search requires starting at a state corresponding to a regex with a
/// `(?s-u:.)*?` prefix, which will match through anything.
/// * DFAs also optionally support starting an anchored search for any one
/// specific pattern. Each such pattern requires its own start state.
/// * If a look-behind assertion like `^` or `\b` is used in the regex, then
/// the DFA will need to inspect a single byte immediately before the start of
/// the search to choose the correct start state.
///
/// Indeed, this configuration precisely encapsulates all of the above factors.
/// The [`Config::anchored`] method sets which kind of anchored search to
/// perform while the [`Config::look_behind`] method provides a way to set
/// the byte that occurs immediately before the start of the search.
///
/// Generally speaking, this type is only useful when you want to run searches
/// without using an [`Input`].
In particular, an `Input` wants a haystack /// slice, but callers may not have a contiguous sequence of bytes as a /// haystack in all cases. This type provides a lower level of control such /// that callers can provide their own anchored configuration and look-behind /// byte explicitly. /// /// # Example /// /// This shows basic usage that permits running a search with a DFA without /// using the `Input` abstraction. /// /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense}, /// util::start, /// Anchored, /// }; /// /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; /// let haystack = "quartz"; /// /// let config = start::Config::new().anchored(Anchored::Yes); /// let mut state = dfa.start_state(&config)?; /// for &b in haystack.as_bytes().iter() { /// state = dfa.next_state(state, b); /// } /// state = dfa.next_eoi_state(state); /// assert!(dfa.is_match_state(state)); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// This example shows how to correctly run a search that doesn't begin at /// the start of a haystack. Notice how we set the look-behind byte, and as /// a result, the `\b` assertion does not match. /// /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense}, /// util::start, /// Anchored, /// }; /// /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; /// let haystack = "quartz"; /// /// let config = start::Config::new() /// .anchored(Anchored::Yes) /// .look_behind(Some(b'q')); /// let mut state = dfa.start_state(&config)?; /// for &b in haystack.as_bytes().iter().skip(1) { /// state = dfa.next_state(state, b); /// } /// state = dfa.next_eoi_state(state); /// // No match! /// assert!(!dfa.is_match_state(state)); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// If we had instead not set a look-behind byte, then the DFA would assume /// that it was starting at the beginning of the haystack, and thus `\b` should /// match.
This in turn would result in erroneously reporting a match: /// /// ``` /// use regex_automata::{ /// dfa::{Automaton, dense}, /// util::start, /// Anchored, /// }; /// /// let dfa = dense::DFA::new(r"(?-u)\b\w+\b")?; /// let haystack = "quartz"; /// /// // Whoops, forgot the look-behind byte... /// let config = start::Config::new().anchored(Anchored::Yes); /// let mut state = dfa.start_state(&config)?; /// for &b in haystack.as_bytes().iter().skip(1) { /// state = dfa.next_state(state, b); /// } /// state = dfa.next_eoi_state(state); /// // And now we get a match unexpectedly. /// assert!(dfa.is_match_state(state)); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Clone, Debug)] pub struct Config { look_behind: Option, anchored: Anchored, } impl Config { /// Create a new default start configuration. /// /// The default is an unanchored search that starts at the beginning of the /// haystack. pub fn new() -> Config { Config { anchored: Anchored::No, look_behind: None } } /// A convenience routine for building a start configuration from an /// [`Input`] for a forward search. /// /// This automatically sets the look-behind byte to the byte immediately /// preceding the start of the search. If the start of the search is at /// offset `0`, then no look-behind byte is set. pub fn from_input_forward(input: &Input<'_>) -> Config { let look_behind = input .start() .checked_sub(1) .and_then(|i| input.haystack().get(i).copied()); Config { look_behind, anchored: input.get_anchored() } } /// A convenience routine for building a start configuration from an /// [`Input`] for a reverse search. /// /// This automatically sets the look-behind byte to the byte immediately /// following the end of the search. If the end of the search is at /// offset `haystack.len()`, then no look-behind byte is set. 
pub fn from_input_reverse(input: &Input<'_>) -> Config { let look_behind = input.haystack().get(input.end()).copied(); Config { look_behind, anchored: input.get_anchored() } } /// Set the look-behind byte at the start of a search. /// /// Unless the search is intended to logically start at the beginning of a /// haystack, this should _always_ be set to the byte immediately preceding /// the start of the search. If no look-behind byte is set, then the start /// configuration will assume it is at the beginning of the haystack. For /// example, the anchor `^` will match. /// /// The default is that no look-behind byte is set. pub fn look_behind(mut self, byte: Option) -> Config { self.look_behind = byte; self } /// Set the anchored mode of a search. /// /// The default is an unanchored search. pub fn anchored(mut self, mode: Anchored) -> Config { self.anchored = mode; self } /// Return the look-behind byte in this configuration, if one exists. pub fn get_look_behind(&self) -> Option { self.look_behind } /// Return the anchored mode in this configuration. pub fn get_anchored(&self) -> Anchored { self.anchored } } /// A map from every possible byte value to its corresponding starting /// configuration. /// /// This map is used in order to lookup the start configuration for a particular /// position in a haystack. This start configuration is then used in /// combination with things like the anchored mode and pattern ID to fully /// determine the start state. /// /// Generally speaking, this map is only used for fully compiled DFAs and lazy /// DFAs. For NFAs (including the one-pass DFA), the start state is generally /// selected by virtue of traversing the NFA state graph. DFAs do the same /// thing, but at build time and not search time. (Well, technically the lazy /// DFA does it at search time, but it does enough work to cache the full /// result of the epsilon closure that the NFA engines tend to need to do.) 
#[derive(Clone)] pub(crate) struct StartByteMap { map: [Start; 256], } impl StartByteMap { /// Create a new map from byte values to their corresponding starting /// configurations. The map is determined, in part, by how look-around /// assertions are matched via the matcher given. pub(crate) fn new(lookm: &LookMatcher) -> StartByteMap { let mut map = [Start::NonWordByte; 256]; map[usize::from(b'\n')] = Start::LineLF; map[usize::from(b'\r')] = Start::LineCR; map[usize::from(b'_')] = Start::WordByte; let mut byte = b'0'; while byte <= b'9' { map[usize::from(byte)] = Start::WordByte; byte += 1; } byte = b'A'; while byte <= b'Z' { map[usize::from(byte)] = Start::WordByte; byte += 1; } byte = b'a'; while byte <= b'z' { map[usize::from(byte)] = Start::WordByte; byte += 1; } let lineterm = lookm.get_line_terminator(); // If our line terminator is normal, then it is already handled by // the LineLF and LineCR configurations. But if it's weird, then we // overwrite whatever was there before for that terminator with a // special configuration. The trick here is that if the terminator // is, say, a word byte like `a`, then callers seeing this start // configuration need to account for that and build their DFA state as // if it *also* came from a word byte. if lineterm != b'\r' && lineterm != b'\n' { map[usize::from(lineterm)] = Start::CustomLineTerminator; } StartByteMap { map } } /// Return the starting configuration for the given look-behind byte. /// /// If no look-behind exists, callers should use `Start::Text`. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn get(&self, byte: u8) -> Start { self.map[usize::from(byte)] } /// Deserializes a byte class map from the given slice. If the slice is of /// insufficient length or otherwise contains an impossible mapping, then /// an error is returned. Upon success, the number of bytes read along with /// the map are returned. The number of bytes read is always a multiple of /// 8. 
pub(crate) fn from_bytes( slice: &[u8], ) -> Result<(StartByteMap, usize), DeserializeError> { wire::check_slice_len(slice, 256, "start byte map")?; let mut map = [Start::NonWordByte; 256]; for (i, &repr) in slice[..256].iter().enumerate() { map[i] = match Start::from_usize(usize::from(repr)) { Some(start) => start, None => { return Err(DeserializeError::generic( "found invalid starting configuration", )) } }; } Ok((StartByteMap { map }, 256)) } /// Writes this map to the given byte buffer. if the given buffer is too /// small, then an error is returned. Upon success, the total number of /// bytes written is returned. The number of bytes written is guaranteed to /// be a multiple of 8. pub(crate) fn write_to( &self, dst: &mut [u8], ) -> Result { let nwrite = self.write_to_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small("start byte map")); } for (i, &start) in self.map.iter().enumerate() { dst[i] = start.as_u8(); } Ok(nwrite) } /// Returns the total number of bytes written by `write_to`. pub(crate) fn write_to_len(&self) -> usize { 256 } } impl core::fmt::Debug for StartByteMap { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { use crate::util::escape::DebugByte; write!(f, "StartByteMap{{")?; for byte in 0..=255 { if byte > 0 { write!(f, ", ")?; } let start = self.map[usize::from(byte)]; write!(f, "{:?} => {:?}", DebugByte(byte), start)?; } write!(f, "}}")?; Ok(()) } } /// Represents the six possible starting configurations of a DFA search. /// /// The starting configuration is determined by inspecting the beginning /// of the haystack (up to 1 byte). Ultimately, this along with a pattern ID /// (if specified) and the type of search (anchored or not) is what selects the /// start state to use in a DFA. /// /// As one example, if a DFA only supports unanchored searches and does not /// support anchored searches for each pattern, then it will have at most 6 /// distinct start states. 
(Some start states may be reused if determinization /// can determine that they will be equivalent.) If the DFA supports both /// anchored and unanchored searches, then it will have a maximum of 12 /// distinct start states. Finally, if the DFA also supports anchored searches /// for each pattern, then it can have up to `12 + (N * 6)` start states, where /// `N` is the number of patterns. /// /// Handling each of these starting configurations in the context of DFA /// determinization can be *quite* tricky and subtle. But the code is small /// and can be found at `crate::util::determinize::set_lookbehind_from_start`. #[derive(Clone, Copy, Debug, Eq, PartialEq)] pub(crate) enum Start { /// This occurs when the starting position is not any of the ones below. NonWordByte = 0, /// This occurs when the byte immediately preceding the start of the search /// is an ASCII word byte. WordByte = 1, /// This occurs when the starting position of the search corresponds to the /// beginning of the haystack. Text = 2, /// This occurs when the byte immediately preceding the start of the search /// is a line terminator. Specifically, `\n`. LineLF = 3, /// This occurs when the byte immediately preceding the start of the search /// is a line terminator. Specifically, `\r`. LineCR = 4, /// This occurs when a custom line terminator has been set via a /// `LookMatcher`, and when that line terminator is neither a `\r` or a /// `\n`. /// /// If the custom line terminator is a word byte, then this start /// configuration is still selected. DFAs that implement word boundary /// assertions will likely need to check whether the custom line terminator /// is a word byte, in which case, it should behave as if the byte /// satisfies `\b` in addition to multi-line anchors. CustomLineTerminator = 5, } impl Start { /// Return the starting state corresponding to the given integer. If no /// starting state exists for the given integer, then None is returned. 
pub(crate) fn from_usize(n: usize) -> Option { match n { 0 => Some(Start::NonWordByte), 1 => Some(Start::WordByte), 2 => Some(Start::Text), 3 => Some(Start::LineLF), 4 => Some(Start::LineCR), 5 => Some(Start::CustomLineTerminator), _ => None, } } /// Returns the total number of starting state configurations. pub(crate) fn len() -> usize { 6 } /// Return this starting configuration as `u8` integer. It is guaranteed to /// be less than `Start::len()`. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn as_u8(&self) -> u8 { // AFAIK, 'as' is the only way to zero-cost convert an int enum to an // actual int. *self as u8 } /// Return this starting configuration as a `usize` integer. It is /// guaranteed to be less than `Start::len()`. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn as_usize(&self) -> usize { usize::from(self.as_u8()) } } #[cfg(test)] mod tests { use super::*; #[test] fn start_fwd_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); let input = Input::new("").range(1..0); let config = Config::from_input_forward(&input); let start = config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); assert_eq!(Start::Text, start); } #[test] fn start_rev_done_range() { let smap = StartByteMap::new(&LookMatcher::default()); let input = Input::new("").range(1..0); let config = Config::from_input_reverse(&input); let start = config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); assert_eq!(Start::Text, start); } #[test] fn start_fwd() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); let input = Input::new(haystack).range(start..end); let config = Config::from_input_forward(&input); let start = config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); start }; assert_eq!(Start::Text, f("", 0, 0)); assert_eq!(Start::Text, f("abc", 0, 3)); assert_eq!(Start::Text, f("\nabc", 0, 3)); assert_eq!(Start::LineLF, f("\nabc", 1, 3)); assert_eq!(Start::LineCR, f("\rabc", 
1, 3)); assert_eq!(Start::WordByte, f("abc", 1, 3)); assert_eq!(Start::NonWordByte, f(" abc", 1, 3)); } #[test] fn start_rev() { let f = |haystack, start, end| { let smap = StartByteMap::new(&LookMatcher::default()); let input = Input::new(haystack).range(start..end); let config = Config::from_input_reverse(&input); let start = config.get_look_behind().map_or(Start::Text, |b| smap.get(b)); start }; assert_eq!(Start::Text, f("", 0, 0)); assert_eq!(Start::Text, f("abc", 0, 3)); assert_eq!(Start::Text, f("abc\n", 0, 4)); assert_eq!(Start::LineLF, f("abc\nz", 0, 3)); assert_eq!(Start::LineCR, f("abc\rz", 0, 3)); assert_eq!(Start::WordByte, f("abc", 0, 2)); assert_eq!(Start::NonWordByte, f("abc ", 0, 3)); } } regex-automata-0.4.9/src/util/syntax.rs000064400000000000000000000424521046102023000162300ustar 00000000000000/*! Utilities for dealing with the syntax of a regular expression. This module currently only exposes a [`Config`] type that itself represents a wrapper around the configuration for a [`regex-syntax::ParserBuilder`](regex_syntax::ParserBuilder). The purpose of this wrapper is to make configuring syntax options very similar to how other configuration is done throughout this crate. Namely, instead of duplicating syntax options across every builder (of which there are many), we instead create small config objects like this one that can be passed around and composed. */ use alloc::{vec, vec::Vec}; use regex_syntax::{ ast, hir::{self, Hir}, Error, ParserBuilder, }; /// A convenience routine for parsing a pattern into an HIR value with the /// default configuration. 
///
/// # Example
///
/// This shows how to parse a pattern into an HIR value:
///
/// ```
/// use regex_automata::util::syntax;
///
/// let hir = syntax::parse(r"([a-z]+)|([0-9]+)")?;
/// assert_eq!(Some(1), hir.properties().static_explicit_captures_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn parse(pattern: &str) -> Result<Hir, Error> {
    parse_with(pattern, &Config::default())
}

/// A convenience routine for parsing many patterns into HIR values with the
/// default configuration.
///
/// # Example
///
/// This shows how to parse many patterns into their corresponding HIR values:
///
/// ```
/// use {
///     regex_automata::util::syntax,
///     regex_syntax::hir::Properties,
/// };
///
/// let hirs = syntax::parse_many(&[
///     r"([a-z]+)|([0-9]+)",
///     r"foo(A-Z]+)bar",
/// ])?;
/// let props = Properties::union(hirs.iter().map(|h| h.properties()));
/// assert_eq!(Some(1), props.static_explicit_captures_len());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn parse_many<P: AsRef<str>>(patterns: &[P]) -> Result<Vec<Hir>, Error> {
    parse_many_with(patterns, &Config::default())
}

/// A convenience routine for parsing a pattern into an HIR value using a
/// `Config`.
///
/// # Example
///
/// This shows how to parse a pattern into an HIR value with a non-default
/// configuration:
///
/// ```
/// use regex_automata::util::syntax;
///
/// let hir = syntax::parse_with(
///     r"^[a-z]+$",
///     &syntax::Config::new().multi_line(true).crlf(true),
/// )?;
/// assert!(hir.properties().look_set().contains_anchor_crlf());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn parse_with(pattern: &str, config: &Config) -> Result<Hir, Error> {
    // Forward every option in `config` to a fresh regex-syntax parser
    // builder, then parse with the resulting parser.
    let mut builder = ParserBuilder::new();
    config.apply(&mut builder);
    builder.build().parse(pattern)
}

/// A convenience routine for parsing many patterns into HIR values using a
/// `Config`.
/// /// # Example /// /// This shows how to parse many patterns into an corresponding HIR values /// with a non-default configuration: /// /// ``` /// use { /// regex_automata::util::syntax, /// regex_syntax::hir::Properties, /// }; /// /// let patterns = &[ /// r"([a-z]+)|([0-9]+)", /// r"\W", /// r"foo(A-Z]+)bar", /// ]; /// let config = syntax::Config::new().unicode(false).utf8(false); /// let hirs = syntax::parse_many_with(patterns, &config)?; /// let props = Properties::union(hirs.iter().map(|h| h.properties())); /// assert!(!props.is_utf8()); /// /// # Ok::<(), Box>(()) /// ``` pub fn parse_many_with>( patterns: &[P], config: &Config, ) -> Result, Error> { let mut builder = ParserBuilder::new(); config.apply(&mut builder); let mut hirs = vec![]; for p in patterns.iter() { hirs.push(builder.build().parse(p.as_ref())?); } Ok(hirs) } /// A common set of configuration options that apply to the syntax of a regex. /// /// This represents a group of configuration options that specifically apply /// to how the concrete syntax of a regular expression is interpreted. In /// particular, they are generally forwarded to the /// [`ParserBuilder`](https://docs.rs/regex-syntax/*/regex_syntax/struct.ParserBuilder.html) /// in the /// [`regex-syntax`](https://docs.rs/regex-syntax) /// crate when building a regex from its concrete syntax directly. /// /// These options are defined as a group since they apply to every regex engine /// in this crate. Instead of re-defining them on every engine's builder, they /// are instead provided here as one cohesive unit. #[derive(Clone, Copy, Debug)] pub struct Config { case_insensitive: bool, multi_line: bool, dot_matches_new_line: bool, crlf: bool, line_terminator: u8, swap_greed: bool, ignore_whitespace: bool, unicode: bool, utf8: bool, nest_limit: u32, octal: bool, } impl Config { /// Return a new default syntax configuration. pub fn new() -> Config { // These defaults match the ones used in regex-syntax. 
Config { case_insensitive: false, multi_line: false, dot_matches_new_line: false, crlf: false, line_terminator: b'\n', swap_greed: false, ignore_whitespace: false, unicode: true, utf8: true, nest_limit: 250, octal: false, } } /// Enable or disable the case insensitive flag by default. /// /// When Unicode mode is enabled, case insensitivity is Unicode-aware. /// Specifically, it will apply the "simple" case folding rules as /// specified by Unicode. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `i` flag. pub fn case_insensitive(mut self, yes: bool) -> Config { self.case_insensitive = yes; self } /// Enable or disable the multi-line matching flag by default. /// /// When this is enabled, the `^` and `$` look-around assertions will /// match immediately after and immediately before a new line character, /// respectively. Note that the `\A` and `\z` look-around assertions are /// unaffected by this setting and always correspond to matching at the /// beginning and end of the input. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `m` flag. pub fn multi_line(mut self, yes: bool) -> Config { self.multi_line = yes; self } /// Enable or disable the "dot matches any character" flag by default. /// /// When this is enabled, `.` will match any character. When it's disabled, /// then `.` will match any character except for a new line character. /// /// Note that `.` is impacted by whether the "unicode" setting is enabled /// or not. When Unicode is enabled (the default), `.` will match any UTF-8 /// encoding of any Unicode scalar value (sans a new line, depending on /// whether this "dot matches new line" option is enabled). When Unicode /// mode is disabled, `.` will match any byte instead. 
Because of this, /// when Unicode mode is disabled, `.` can only be used when the "allow /// invalid UTF-8" option is enabled, since `.` could otherwise match /// invalid UTF-8. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `s` flag. pub fn dot_matches_new_line(mut self, yes: bool) -> Config { self.dot_matches_new_line = yes; self } /// Enable or disable the "CRLF mode" flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `R` flag. /// /// When CRLF mode is enabled, the following happens: /// /// * Unless `dot_matches_new_line` is enabled, `.` will match any character /// except for `\r` and `\n`. /// * When `multi_line` mode is enabled, `^` and `$` will treat `\r\n`, /// `\r` and `\n` as line terminators. And in particular, neither will /// match between a `\r` and a `\n`. pub fn crlf(mut self, yes: bool) -> Config { self.crlf = yes; self } /// Sets the line terminator for use with `(?u-s:.)` and `(?-us:.)`. /// /// Namely, instead of `.` (by default) matching everything except for `\n`, /// this will cause `.` to match everything except for the byte given. /// /// If `.` is used in a context where Unicode mode is enabled and this byte /// isn't ASCII, then an error will be returned. When Unicode mode is /// disabled, then any byte is permitted, but will return an error if UTF-8 /// mode is enabled and it is a non-ASCII byte. /// /// In short, any ASCII value for a line terminator is always okay. But a /// non-ASCII byte might result in an error depending on whether Unicode /// mode or UTF-8 mode are enabled. /// /// Note that if `R` mode is enabled then it always takes precedence and /// the line terminator will be treated as `\r` and `\n` simultaneously. /// /// Note also that this *doesn't* impact the look-around assertions /// `(?m:^)` and `(?m:$)`. 
That's usually controlled by additional /// configuration in the regex engine itself. pub fn line_terminator(mut self, byte: u8) -> Config { self.line_terminator = byte; self } /// Enable or disable the "swap greed" flag by default. /// /// When this is enabled, `.*` (for example) will become ungreedy and `.*?` /// will become greedy. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `U` flag. pub fn swap_greed(mut self, yes: bool) -> Config { self.swap_greed = yes; self } /// Enable verbose mode in the regular expression. /// /// When enabled, verbose mode permits insigificant whitespace in many /// places in the regular expression, as well as comments. Comments are /// started using `#` and continue until the end of the line. /// /// By default, this is disabled. It may be selectively enabled in the /// regular expression by using the `x` flag regardless of this setting. pub fn ignore_whitespace(mut self, yes: bool) -> Config { self.ignore_whitespace = yes; self } /// Enable or disable the Unicode flag (`u`) by default. /// /// By default this is **enabled**. It may alternatively be selectively /// disabled in the regular expression itself via the `u` flag. /// /// Note that unless "allow invalid UTF-8" is enabled (it's disabled by /// default), a regular expression will fail to parse if Unicode mode is /// disabled and a sub-expression could possibly match invalid UTF-8. /// /// **WARNING**: Unicode mode can greatly increase the size of the compiled /// DFA, which can noticeably impact both memory usage and compilation /// time. This is especially noticeable if your regex contains character /// classes like `\w` that are impacted by whether Unicode is enabled or /// not. If Unicode is not necessary, you are encouraged to disable it. 
pub fn unicode(mut self, yes: bool) -> Config { self.unicode = yes; self } /// When disabled, the builder will permit the construction of a regular /// expression that may match invalid UTF-8. /// /// For example, when [`Config::unicode`] is disabled, then /// expressions like `[^a]` may match invalid UTF-8 since they can match /// any single byte that is not `a`. By default, these sub-expressions /// are disallowed to avoid returning offsets that split a UTF-8 /// encoded codepoint. However, in cases where matching at arbitrary /// locations is desired, this option can be disabled to permit all such /// sub-expressions. /// /// When enabled (the default), the builder is guaranteed to produce a /// regex that will only ever match valid UTF-8 (otherwise, the builder /// will return an error). pub fn utf8(mut self, yes: bool) -> Config { self.utf8 = yes; self } /// Set the nesting limit used for the regular expression parser. /// /// The nesting limit controls how deep the abstract syntax tree is allowed /// to be. If the AST exceeds the given limit (e.g., with too many nested /// groups), then an error is returned by the parser. /// /// The purpose of this limit is to act as a heuristic to prevent stack /// overflow when building a finite automaton from a regular expression's /// abstract syntax tree. In particular, construction currently uses /// recursion. In the future, the implementation may stop using recursion /// and this option will no longer be necessary. /// /// This limit is not checked until the entire AST is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since the parser will /// limit itself to heap space proportional to the length of the pattern /// string. /// /// Note that a nest limit of `0` will return a nest limit error for most /// patterns but not all. 
For example, a nest limit of `0` permits `a` but /// not `ab`, since `ab` requires a concatenation AST item, which results /// in a nest depth of `1`. In general, a nest limit is not something that /// manifests in an obvious way in the concrete syntax, therefore, it /// should not be used in a granular way. pub fn nest_limit(mut self, limit: u32) -> Config { self.nest_limit = limit; self } /// Whether to support octal syntax or not. /// /// Octal syntax is a little-known way of uttering Unicode codepoints in /// a regular expression. For example, `a`, `\x61`, `\u0061` and /// `\141` are all equivalent regular expressions, where the last example /// shows octal syntax. /// /// While supporting octal syntax isn't in and of itself a problem, it does /// make good error messages harder. That is, in PCRE based regex engines, /// syntax like `\1` invokes a backreference, which is explicitly /// unsupported in Rust's regex engine. However, many users expect it to /// be supported. Therefore, when octal support is disabled, the error /// message will explicitly mention that backreferences aren't supported. /// /// Octal syntax is disabled by default. pub fn octal(mut self, yes: bool) -> Config { self.octal = yes; self } /// Returns whether "unicode" mode is enabled. pub fn get_unicode(&self) -> bool { self.unicode } /// Returns whether "case insensitive" mode is enabled. pub fn get_case_insensitive(&self) -> bool { self.case_insensitive } /// Returns whether "multi line" mode is enabled. pub fn get_multi_line(&self) -> bool { self.multi_line } /// Returns whether "dot matches new line" mode is enabled. pub fn get_dot_matches_new_line(&self) -> bool { self.dot_matches_new_line } /// Returns whether "CRLF" mode is enabled. pub fn get_crlf(&self) -> bool { self.crlf } /// Returns the line terminator in this syntax configuration. pub fn get_line_terminator(&self) -> u8 { self.line_terminator } /// Returns whether "swap greed" mode is enabled. 
pub fn get_swap_greed(&self) -> bool { self.swap_greed } /// Returns whether "ignore whitespace" mode is enabled. pub fn get_ignore_whitespace(&self) -> bool { self.ignore_whitespace } /// Returns whether UTF-8 mode is enabled. pub fn get_utf8(&self) -> bool { self.utf8 } /// Returns the "nest limit" setting. pub fn get_nest_limit(&self) -> u32 { self.nest_limit } /// Returns whether "octal" mode is enabled. pub fn get_octal(&self) -> bool { self.octal } /// Applies this configuration to the given parser. pub(crate) fn apply(&self, builder: &mut ParserBuilder) { builder .unicode(self.unicode) .case_insensitive(self.case_insensitive) .multi_line(self.multi_line) .dot_matches_new_line(self.dot_matches_new_line) .crlf(self.crlf) .line_terminator(self.line_terminator) .swap_greed(self.swap_greed) .ignore_whitespace(self.ignore_whitespace) .utf8(self.utf8) .nest_limit(self.nest_limit) .octal(self.octal); } /// Applies this configuration to the given AST parser. pub(crate) fn apply_ast(&self, builder: &mut ast::parse::ParserBuilder) { builder .ignore_whitespace(self.ignore_whitespace) .nest_limit(self.nest_limit) .octal(self.octal); } /// Applies this configuration to the given AST-to-HIR translator. pub(crate) fn apply_hir( &self, builder: &mut hir::translate::TranslatorBuilder, ) { builder .unicode(self.unicode) .case_insensitive(self.case_insensitive) .multi_line(self.multi_line) .crlf(self.crlf) .dot_matches_new_line(self.dot_matches_new_line) .line_terminator(self.line_terminator) .swap_greed(self.swap_greed) .utf8(self.utf8); } } impl Default for Config { fn default() -> Config { Config::new() } } regex-automata-0.4.9/src/util/unicode_data/mod.rs000064400000000000000000000013501046102023000200700ustar 00000000000000// This cfg should match the one in src/util/look.rs that uses perl_word. #[cfg(all( // We have to explicitly want to support Unicode word boundaries. 
feature = "unicode-word-boundary", not(all( // If we don't have regex-syntax at all, then we definitely need to // bring our own \w data table. feature = "syntax", // If unicode-perl is enabled, then regex-syntax/unicode-perl is // also enabled, which in turn means we can use regex-syntax's // is_word_character routine (and thus use its data tables). But if // unicode-perl is not enabled, even if syntax is, then we need to // bring our own. feature = "unicode-perl", )), ))] pub(crate) mod perl_word; regex-automata-0.4.9/src/util/unicode_data/perl_word.rs000064400000000000000000000436011046102023000213130ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // ucd-generate perl-word ucd-16.0.0 --chars // // Unicode version: 16.0.0. // // ucd-generate 0.3.1 is available on crates.io. pub const PERL_WORD: &'static [(char, char)] = &[ ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ª', 'ª'), ('µ', 'µ'), ('º', 'º'), ('À', 'Ö'), ('Ø', 'ö'), ('ø', 'ˁ'), ('ˆ', 'ˑ'), ('ˠ', 'ˤ'), ('ˬ', 'ˬ'), ('ˮ', 'ˮ'), ('\u{300}', 'ʹ'), ('Ͷ', 'ͷ'), ('ͺ', 'ͽ'), ('Ϳ', 'Ϳ'), ('Ά', 'Ά'), ('Έ', 'Ί'), ('Ό', 'Ό'), ('Ύ', 'Ρ'), ('Σ', 'ϵ'), ('Ϸ', 'ҁ'), ('\u{483}', 'ԯ'), ('Ա', 'Ֆ'), ('ՙ', 'ՙ'), ('ՠ', 'ֈ'), ('\u{591}', '\u{5bd}'), ('\u{5bf}', '\u{5bf}'), ('\u{5c1}', '\u{5c2}'), ('\u{5c4}', '\u{5c5}'), ('\u{5c7}', '\u{5c7}'), ('א', 'ת'), ('ׯ', 'ײ'), ('\u{610}', '\u{61a}'), ('ؠ', '٩'), ('ٮ', 'ۓ'), ('ە', '\u{6dc}'), ('\u{6df}', '\u{6e8}'), ('\u{6ea}', 'ۼ'), ('ۿ', 'ۿ'), ('ܐ', '\u{74a}'), ('ݍ', 'ޱ'), ('߀', 'ߵ'), ('ߺ', 'ߺ'), ('\u{7fd}', '\u{7fd}'), ('ࠀ', '\u{82d}'), ('ࡀ', '\u{85b}'), ('ࡠ', 'ࡪ'), ('ࡰ', 'ࢇ'), ('ࢉ', 'ࢎ'), ('\u{897}', '\u{8e1}'), ('\u{8e3}', '\u{963}'), ('०', '९'), ('ॱ', 'ঃ'), ('অ', 'ঌ'), ('এ', 'ঐ'), ('ও', 'ন'), ('প', 'র'), ('ল', 'ল'), ('শ', 'হ'), ('\u{9bc}', '\u{9c4}'), ('ে', 'ৈ'), ('ো', 'ৎ'), ('\u{9d7}', '\u{9d7}'), ('ড়', 'ঢ়'), ('য়', '\u{9e3}'), ('০', 'ৱ'), ('ৼ', 'ৼ'), ('\u{9fe}', '\u{9fe}'), ('\u{a01}', 'ਃ'), ('ਅ', 'ਊ'), ('ਏ', 'ਐ'), ('ਓ', 'ਨ'), ('ਪ', 'ਰ'), 
('ਲ', 'ਲ਼'), ('ਵ', 'ਸ਼'), ('ਸ', 'ਹ'), ('\u{a3c}', '\u{a3c}'), ('ਾ', '\u{a42}'), ('\u{a47}', '\u{a48}'), ('\u{a4b}', '\u{a4d}'), ('\u{a51}', '\u{a51}'), ('ਖ਼', 'ੜ'), ('ਫ਼', 'ਫ਼'), ('੦', '\u{a75}'), ('\u{a81}', 'ઃ'), ('અ', 'ઍ'), ('એ', 'ઑ'), ('ઓ', 'ન'), ('પ', 'ર'), ('લ', 'ળ'), ('વ', 'હ'), ('\u{abc}', '\u{ac5}'), ('\u{ac7}', 'ૉ'), ('ો', '\u{acd}'), ('ૐ', 'ૐ'), ('ૠ', '\u{ae3}'), ('૦', '૯'), ('ૹ', '\u{aff}'), ('\u{b01}', 'ଃ'), ('ଅ', 'ଌ'), ('ଏ', 'ଐ'), ('ଓ', 'ନ'), ('ପ', 'ର'), ('ଲ', 'ଳ'), ('ଵ', 'ହ'), ('\u{b3c}', '\u{b44}'), ('େ', 'ୈ'), ('ୋ', '\u{b4d}'), ('\u{b55}', '\u{b57}'), ('ଡ଼', 'ଢ଼'), ('ୟ', '\u{b63}'), ('୦', '୯'), ('ୱ', 'ୱ'), ('\u{b82}', 'ஃ'), ('அ', 'ஊ'), ('எ', 'ஐ'), ('ஒ', 'க'), ('ங', 'ச'), ('ஜ', 'ஜ'), ('ஞ', 'ட'), ('ண', 'த'), ('ந', 'ப'), ('ம', 'ஹ'), ('\u{bbe}', 'ூ'), ('ெ', 'ை'), ('ொ', '\u{bcd}'), ('ௐ', 'ௐ'), ('\u{bd7}', '\u{bd7}'), ('௦', '௯'), ('\u{c00}', 'ఌ'), ('ఎ', 'ఐ'), ('ఒ', 'న'), ('ప', 'హ'), ('\u{c3c}', 'ౄ'), ('\u{c46}', '\u{c48}'), ('\u{c4a}', '\u{c4d}'), ('\u{c55}', '\u{c56}'), ('ౘ', 'ౚ'), ('ౝ', 'ౝ'), ('ౠ', '\u{c63}'), ('౦', '౯'), ('ಀ', 'ಃ'), ('ಅ', 'ಌ'), ('ಎ', 'ಐ'), ('ಒ', 'ನ'), ('ಪ', 'ಳ'), ('ವ', 'ಹ'), ('\u{cbc}', 'ೄ'), ('\u{cc6}', '\u{cc8}'), ('\u{cca}', '\u{ccd}'), ('\u{cd5}', '\u{cd6}'), ('ೝ', 'ೞ'), ('ೠ', '\u{ce3}'), ('೦', '೯'), ('ೱ', 'ೳ'), ('\u{d00}', 'ഌ'), ('എ', 'ഐ'), ('ഒ', '\u{d44}'), ('െ', 'ൈ'), ('ൊ', 'ൎ'), ('ൔ', '\u{d57}'), ('ൟ', '\u{d63}'), ('൦', '൯'), ('ൺ', 'ൿ'), ('\u{d81}', 'ඃ'), ('අ', 'ඖ'), ('ක', 'න'), ('ඳ', 'ර'), ('ල', 'ල'), ('ව', 'ෆ'), ('\u{dca}', '\u{dca}'), ('\u{dcf}', '\u{dd4}'), ('\u{dd6}', '\u{dd6}'), ('ෘ', '\u{ddf}'), ('෦', '෯'), ('ෲ', 'ෳ'), ('ก', '\u{e3a}'), ('เ', '\u{e4e}'), ('๐', '๙'), ('ກ', 'ຂ'), ('ຄ', 'ຄ'), ('ຆ', 'ຊ'), ('ຌ', 'ຣ'), ('ລ', 'ລ'), ('ວ', 'ຽ'), ('ເ', 'ໄ'), ('ໆ', 'ໆ'), ('\u{ec8}', '\u{ece}'), ('໐', '໙'), ('ໜ', 'ໟ'), ('ༀ', 'ༀ'), ('\u{f18}', '\u{f19}'), ('༠', '༩'), ('\u{f35}', '\u{f35}'), ('\u{f37}', '\u{f37}'), ('\u{f39}', '\u{f39}'), ('༾', 'ཇ'), ('ཉ', 'ཬ'), ('\u{f71}', '\u{f84}'), ('\u{f86}', '\u{f97}'), ('\u{f99}', '\u{fbc}'), 
('\u{fc6}', '\u{fc6}'), ('က', '၉'), ('ၐ', '\u{109d}'), ('Ⴀ', 'Ⴥ'), ('Ⴧ', 'Ⴧ'), ('Ⴭ', 'Ⴭ'), ('ა', 'ჺ'), ('ჼ', 'ቈ'), ('ቊ', 'ቍ'), ('ቐ', 'ቖ'), ('ቘ', 'ቘ'), ('ቚ', 'ቝ'), ('በ', 'ኈ'), ('ኊ', 'ኍ'), ('ነ', 'ኰ'), ('ኲ', 'ኵ'), ('ኸ', 'ኾ'), ('ዀ', 'ዀ'), ('ዂ', 'ዅ'), ('ወ', 'ዖ'), ('ዘ', 'ጐ'), ('ጒ', 'ጕ'), ('ጘ', 'ፚ'), ('\u{135d}', '\u{135f}'), ('ᎀ', 'ᎏ'), ('Ꭰ', 'Ᏽ'), ('ᏸ', 'ᏽ'), ('ᐁ', 'ᙬ'), ('ᙯ', 'ᙿ'), ('ᚁ', 'ᚚ'), ('ᚠ', 'ᛪ'), ('ᛮ', 'ᛸ'), ('ᜀ', '\u{1715}'), ('ᜟ', '\u{1734}'), ('ᝀ', '\u{1753}'), ('ᝠ', 'ᝬ'), ('ᝮ', 'ᝰ'), ('\u{1772}', '\u{1773}'), ('ក', '\u{17d3}'), ('ៗ', 'ៗ'), ('ៜ', '\u{17dd}'), ('០', '៩'), ('\u{180b}', '\u{180d}'), ('\u{180f}', '᠙'), ('ᠠ', 'ᡸ'), ('ᢀ', 'ᢪ'), ('ᢰ', 'ᣵ'), ('ᤀ', 'ᤞ'), ('\u{1920}', 'ᤫ'), ('ᤰ', '\u{193b}'), ('᥆', 'ᥭ'), ('ᥰ', 'ᥴ'), ('ᦀ', 'ᦫ'), ('ᦰ', 'ᧉ'), ('᧐', '᧙'), ('ᨀ', '\u{1a1b}'), ('ᨠ', '\u{1a5e}'), ('\u{1a60}', '\u{1a7c}'), ('\u{1a7f}', '᪉'), ('᪐', '᪙'), ('ᪧ', 'ᪧ'), ('\u{1ab0}', '\u{1ace}'), ('\u{1b00}', 'ᭌ'), ('᭐', '᭙'), ('\u{1b6b}', '\u{1b73}'), ('\u{1b80}', '\u{1bf3}'), ('ᰀ', '\u{1c37}'), ('᱀', '᱉'), ('ᱍ', 'ᱽ'), ('ᲀ', 'ᲊ'), ('Ა', 'Ჺ'), ('Ჽ', 'Ჿ'), ('\u{1cd0}', '\u{1cd2}'), ('\u{1cd4}', 'ᳺ'), ('ᴀ', 'ἕ'), ('Ἐ', 'Ἕ'), ('ἠ', 'ὅ'), ('Ὀ', 'Ὅ'), ('ὐ', 'ὗ'), ('Ὑ', 'Ὑ'), ('Ὓ', 'Ὓ'), ('Ὕ', 'Ὕ'), ('Ὗ', 'ώ'), ('ᾀ', 'ᾴ'), ('ᾶ', 'ᾼ'), ('ι', 'ι'), ('ῂ', 'ῄ'), ('ῆ', 'ῌ'), ('ῐ', 'ΐ'), ('ῖ', 'Ί'), ('ῠ', 'Ῥ'), ('ῲ', 'ῴ'), ('ῶ', 'ῼ'), ('\u{200c}', '\u{200d}'), ('‿', '⁀'), ('⁔', '⁔'), ('ⁱ', 'ⁱ'), ('ⁿ', 'ⁿ'), ('ₐ', 'ₜ'), ('\u{20d0}', '\u{20f0}'), ('ℂ', 'ℂ'), ('ℇ', 'ℇ'), ('ℊ', 'ℓ'), ('ℕ', 'ℕ'), ('ℙ', 'ℝ'), ('ℤ', 'ℤ'), ('Ω', 'Ω'), ('ℨ', 'ℨ'), ('K', 'ℭ'), ('ℯ', 'ℹ'), ('ℼ', 'ℿ'), ('ⅅ', 'ⅉ'), ('ⅎ', 'ⅎ'), ('Ⅰ', 'ↈ'), ('Ⓐ', 'ⓩ'), ('Ⰰ', 'ⳤ'), ('Ⳬ', 'ⳳ'), ('ⴀ', 'ⴥ'), ('ⴧ', 'ⴧ'), ('ⴭ', 'ⴭ'), ('ⴰ', 'ⵧ'), ('ⵯ', 'ⵯ'), ('\u{2d7f}', 'ⶖ'), ('ⶠ', 'ⶦ'), ('ⶨ', 'ⶮ'), ('ⶰ', 'ⶶ'), ('ⶸ', 'ⶾ'), ('ⷀ', 'ⷆ'), ('ⷈ', 'ⷎ'), ('ⷐ', 'ⷖ'), ('ⷘ', 'ⷞ'), ('\u{2de0}', '\u{2dff}'), ('ⸯ', 'ⸯ'), ('々', '〇'), ('〡', '\u{302f}'), ('〱', '〵'), ('〸', '〼'), ('ぁ', 'ゖ'), ('\u{3099}', '\u{309a}'), ('ゝ', 'ゟ'), ('ァ', 'ヺ'), ('ー', 
'ヿ'), ('ㄅ', 'ㄯ'), ('ㄱ', 'ㆎ'), ('ㆠ', 'ㆿ'), ('ㇰ', 'ㇿ'), ('㐀', '䶿'), ('一', 'ꒌ'), ('ꓐ', 'ꓽ'), ('ꔀ', 'ꘌ'), ('ꘐ', 'ꘫ'), ('Ꙁ', '\u{a672}'), ('\u{a674}', '\u{a67d}'), ('ꙿ', '\u{a6f1}'), ('ꜗ', 'ꜟ'), ('Ꜣ', 'ꞈ'), ('Ꞌ', 'ꟍ'), ('Ꟑ', 'ꟑ'), ('ꟓ', 'ꟓ'), ('ꟕ', 'Ƛ'), ('ꟲ', 'ꠧ'), ('\u{a82c}', '\u{a82c}'), ('ꡀ', 'ꡳ'), ('ꢀ', '\u{a8c5}'), ('꣐', '꣙'), ('\u{a8e0}', 'ꣷ'), ('ꣻ', 'ꣻ'), ('ꣽ', '\u{a92d}'), ('ꤰ', '\u{a953}'), ('ꥠ', 'ꥼ'), ('\u{a980}', '\u{a9c0}'), ('ꧏ', '꧙'), ('ꧠ', 'ꧾ'), ('ꨀ', '\u{aa36}'), ('ꩀ', 'ꩍ'), ('꩐', '꩙'), ('ꩠ', 'ꩶ'), ('ꩺ', 'ꫂ'), ('ꫛ', 'ꫝ'), ('ꫠ', 'ꫯ'), ('ꫲ', '\u{aaf6}'), ('ꬁ', 'ꬆ'), ('ꬉ', 'ꬎ'), ('ꬑ', 'ꬖ'), ('ꬠ', 'ꬦ'), ('ꬨ', 'ꬮ'), ('ꬰ', 'ꭚ'), ('ꭜ', 'ꭩ'), ('ꭰ', 'ꯪ'), ('꯬', '\u{abed}'), ('꯰', '꯹'), ('가', '힣'), ('ힰ', 'ퟆ'), ('ퟋ', 'ퟻ'), ('豈', '舘'), ('並', '龎'), ('ff', 'st'), ('ﬓ', 'ﬗ'), ('יִ', 'ﬨ'), ('שׁ', 'זּ'), ('טּ', 'לּ'), ('מּ', 'מּ'), ('נּ', 'סּ'), ('ףּ', 'פּ'), ('צּ', 'ﮱ'), ('ﯓ', 'ﴽ'), ('ﵐ', 'ﶏ'), ('ﶒ', 'ﷇ'), ('ﷰ', 'ﷻ'), ('\u{fe00}', '\u{fe0f}'), ('\u{fe20}', '\u{fe2f}'), ('︳', '︴'), ('﹍', '﹏'), ('ﹰ', 'ﹴ'), ('ﹶ', 'ﻼ'), ('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z'), ('ヲ', 'ᄒ'), ('ᅡ', 'ᅦ'), ('ᅧ', 'ᅬ'), ('ᅭ', 'ᅲ'), ('ᅳ', 'ᅵ'), ('𐀀', '𐀋'), ('𐀍', '𐀦'), ('𐀨', '𐀺'), ('𐀼', '𐀽'), ('𐀿', '𐁍'), ('𐁐', '𐁝'), ('𐂀', '𐃺'), ('𐅀', '𐅴'), ('\u{101fd}', '\u{101fd}'), ('𐊀', '𐊜'), ('𐊠', '𐋐'), ('\u{102e0}', '\u{102e0}'), ('𐌀', '𐌟'), ('𐌭', '𐍊'), ('𐍐', '\u{1037a}'), ('𐎀', '𐎝'), ('𐎠', '𐏃'), ('𐏈', '𐏏'), ('𐏑', '𐏕'), ('𐐀', '𐒝'), ('𐒠', '𐒩'), ('𐒰', '𐓓'), ('𐓘', '𐓻'), ('𐔀', '𐔧'), ('𐔰', '𐕣'), ('𐕰', '𐕺'), ('𐕼', '𐖊'), ('𐖌', '𐖒'), ('𐖔', '𐖕'), ('𐖗', '𐖡'), ('𐖣', '𐖱'), ('𐖳', '𐖹'), ('𐖻', '𐖼'), ('𐗀', '𐗳'), ('𐘀', '𐜶'), ('𐝀', '𐝕'), ('𐝠', '𐝧'), ('𐞀', '𐞅'), ('𐞇', '𐞰'), ('𐞲', '𐞺'), ('𐠀', '𐠅'), ('𐠈', '𐠈'), ('𐠊', '𐠵'), ('𐠷', '𐠸'), ('𐠼', '𐠼'), ('𐠿', '𐡕'), ('𐡠', '𐡶'), ('𐢀', '𐢞'), ('𐣠', '𐣲'), ('𐣴', '𐣵'), ('𐤀', '𐤕'), ('𐤠', '𐤹'), ('𐦀', '𐦷'), ('𐦾', '𐦿'), ('𐨀', '\u{10a03}'), ('\u{10a05}', '\u{10a06}'), ('\u{10a0c}', '𐨓'), ('𐨕', '𐨗'), ('𐨙', '𐨵'), ('\u{10a38}', '\u{10a3a}'), ('\u{10a3f}', '\u{10a3f}'), ('𐩠', '𐩼'), ('𐪀', '𐪜'), 
('𐫀', '𐫇'), ('𐫉', '\u{10ae6}'), ('𐬀', '𐬵'), ('𐭀', '𐭕'), ('𐭠', '𐭲'), ('𐮀', '𐮑'), ('𐰀', '𐱈'), ('𐲀', '𐲲'), ('𐳀', '𐳲'), ('𐴀', '\u{10d27}'), ('𐴰', '𐴹'), ('𐵀', '𐵥'), ('\u{10d69}', '\u{10d6d}'), ('𐵯', '𐶅'), ('𐺀', '𐺩'), ('\u{10eab}', '\u{10eac}'), ('𐺰', '𐺱'), ('𐻂', '𐻄'), ('\u{10efc}', '𐼜'), ('𐼧', '𐼧'), ('𐼰', '\u{10f50}'), ('𐽰', '\u{10f85}'), ('𐾰', '𐿄'), ('𐿠', '𐿶'), ('𑀀', '\u{11046}'), ('𑁦', '𑁵'), ('\u{1107f}', '\u{110ba}'), ('\u{110c2}', '\u{110c2}'), ('𑃐', '𑃨'), ('𑃰', '𑃹'), ('\u{11100}', '\u{11134}'), ('𑄶', '𑄿'), ('𑅄', '𑅇'), ('𑅐', '\u{11173}'), ('𑅶', '𑅶'), ('\u{11180}', '𑇄'), ('\u{111c9}', '\u{111cc}'), ('𑇎', '𑇚'), ('𑇜', '𑇜'), ('𑈀', '𑈑'), ('𑈓', '\u{11237}'), ('\u{1123e}', '\u{11241}'), ('𑊀', '𑊆'), ('𑊈', '𑊈'), ('𑊊', '𑊍'), ('𑊏', '𑊝'), ('𑊟', '𑊨'), ('𑊰', '\u{112ea}'), ('𑋰', '𑋹'), ('\u{11300}', '𑌃'), ('𑌅', '𑌌'), ('𑌏', '𑌐'), ('𑌓', '𑌨'), ('𑌪', '𑌰'), ('𑌲', '𑌳'), ('𑌵', '𑌹'), ('\u{1133b}', '𑍄'), ('𑍇', '𑍈'), ('𑍋', '\u{1134d}'), ('𑍐', '𑍐'), ('\u{11357}', '\u{11357}'), ('𑍝', '𑍣'), ('\u{11366}', '\u{1136c}'), ('\u{11370}', '\u{11374}'), ('𑎀', '𑎉'), ('𑎋', '𑎋'), ('𑎎', '𑎎'), ('𑎐', '𑎵'), ('𑎷', '\u{113c0}'), ('\u{113c2}', '\u{113c2}'), ('\u{113c5}', '\u{113c5}'), ('\u{113c7}', '𑏊'), ('𑏌', '𑏓'), ('\u{113e1}', '\u{113e2}'), ('𑐀', '𑑊'), ('𑑐', '𑑙'), ('\u{1145e}', '𑑡'), ('𑒀', '𑓅'), ('𑓇', '𑓇'), ('𑓐', '𑓙'), ('𑖀', '\u{115b5}'), ('𑖸', '\u{115c0}'), ('𑗘', '\u{115dd}'), ('𑘀', '\u{11640}'), ('𑙄', '𑙄'), ('𑙐', '𑙙'), ('𑚀', '𑚸'), ('𑛀', '𑛉'), ('𑛐', '𑛣'), ('𑜀', '𑜚'), ('\u{1171d}', '\u{1172b}'), ('𑜰', '𑜹'), ('𑝀', '𑝆'), ('𑠀', '\u{1183a}'), ('𑢠', '𑣩'), ('𑣿', '𑤆'), ('𑤉', '𑤉'), ('𑤌', '𑤓'), ('𑤕', '𑤖'), ('𑤘', '𑤵'), ('𑤷', '𑤸'), ('\u{1193b}', '\u{11943}'), ('𑥐', '𑥙'), ('𑦠', '𑦧'), ('𑦪', '\u{119d7}'), ('\u{119da}', '𑧡'), ('𑧣', '𑧤'), ('𑨀', '\u{11a3e}'), ('\u{11a47}', '\u{11a47}'), ('𑩐', '\u{11a99}'), ('𑪝', '𑪝'), ('𑪰', '𑫸'), ('𑯀', '𑯠'), ('𑯰', '𑯹'), ('𑰀', '𑰈'), ('𑰊', '\u{11c36}'), ('\u{11c38}', '𑱀'), ('𑱐', '𑱙'), ('𑱲', '𑲏'), ('\u{11c92}', '\u{11ca7}'), ('𑲩', '\u{11cb6}'), ('𑴀', '𑴆'), ('𑴈', '𑴉'), ('𑴋', '\u{11d36}'), 
('\u{11d3a}', '\u{11d3a}'), ('\u{11d3c}', '\u{11d3d}'), ('\u{11d3f}', '\u{11d47}'), ('𑵐', '𑵙'), ('𑵠', '𑵥'), ('𑵧', '𑵨'), ('𑵪', '𑶎'), ('\u{11d90}', '\u{11d91}'), ('𑶓', '𑶘'), ('𑶠', '𑶩'), ('𑻠', '𑻶'), ('\u{11f00}', '𑼐'), ('𑼒', '\u{11f3a}'), ('𑼾', '\u{11f42}'), ('𑽐', '\u{11f5a}'), ('𑾰', '𑾰'), ('𒀀', '𒎙'), ('𒐀', '𒑮'), ('𒒀', '𒕃'), ('𒾐', '𒿰'), ('𓀀', '𓐯'), ('\u{13440}', '\u{13455}'), ('𓑠', '𔏺'), ('𔐀', '𔙆'), ('𖄀', '𖄹'), ('𖠀', '𖨸'), ('𖩀', '𖩞'), ('𖩠', '𖩩'), ('𖩰', '𖪾'), ('𖫀', '𖫉'), ('𖫐', '𖫭'), ('\u{16af0}', '\u{16af4}'), ('𖬀', '\u{16b36}'), ('𖭀', '𖭃'), ('𖭐', '𖭙'), ('𖭣', '𖭷'), ('𖭽', '𖮏'), ('𖵀', '𖵬'), ('𖵰', '𖵹'), ('𖹀', '𖹿'), ('𖼀', '𖽊'), ('\u{16f4f}', '𖾇'), ('\u{16f8f}', '𖾟'), ('𖿠', '𖿡'), ('𖿣', '\u{16fe4}'), ('\u{16ff0}', '\u{16ff1}'), ('𗀀', '𘟷'), ('𘠀', '𘳕'), ('𘳿', '𘴈'), ('𚿰', '𚿳'), ('𚿵', '𚿻'), ('𚿽', '𚿾'), ('𛀀', '𛄢'), ('𛄲', '𛄲'), ('𛅐', '𛅒'), ('𛅕', '𛅕'), ('𛅤', '𛅧'), ('𛅰', '𛋻'), ('𛰀', '𛱪'), ('𛱰', '𛱼'), ('𛲀', '𛲈'), ('𛲐', '𛲙'), ('\u{1bc9d}', '\u{1bc9e}'), ('𜳰', '𜳹'), ('\u{1cf00}', '\u{1cf2d}'), ('\u{1cf30}', '\u{1cf46}'), ('\u{1d165}', '\u{1d169}'), ('\u{1d16d}', '\u{1d172}'), ('\u{1d17b}', '\u{1d182}'), ('\u{1d185}', '\u{1d18b}'), ('\u{1d1aa}', '\u{1d1ad}'), ('\u{1d242}', '\u{1d244}'), ('𝐀', '𝑔'), ('𝑖', '𝒜'), ('𝒞', '𝒟'), ('𝒢', '𝒢'), ('𝒥', '𝒦'), ('𝒩', '𝒬'), ('𝒮', '𝒹'), ('𝒻', '𝒻'), ('𝒽', '𝓃'), ('𝓅', '𝔅'), ('𝔇', '𝔊'), ('𝔍', '𝔔'), ('𝔖', '𝔜'), ('𝔞', '𝔹'), ('𝔻', '𝔾'), ('𝕀', '𝕄'), ('𝕆', '𝕆'), ('𝕊', '𝕐'), ('𝕒', '𝚥'), ('𝚨', '𝛀'), ('𝛂', '𝛚'), ('𝛜', '𝛺'), ('𝛼', '𝜔'), ('𝜖', '𝜴'), ('𝜶', '𝝎'), ('𝝐', '𝝮'), ('𝝰', '𝞈'), ('𝞊', '𝞨'), ('𝞪', '𝟂'), ('𝟄', '𝟋'), ('𝟎', '𝟿'), ('\u{1da00}', '\u{1da36}'), ('\u{1da3b}', '\u{1da6c}'), ('\u{1da75}', '\u{1da75}'), ('\u{1da84}', '\u{1da84}'), ('\u{1da9b}', '\u{1da9f}'), ('\u{1daa1}', '\u{1daaf}'), ('𝼀', '𝼞'), ('𝼥', '𝼪'), ('\u{1e000}', '\u{1e006}'), ('\u{1e008}', '\u{1e018}'), ('\u{1e01b}', '\u{1e021}'), ('\u{1e023}', '\u{1e024}'), ('\u{1e026}', '\u{1e02a}'), ('𞀰', '𞁭'), ('\u{1e08f}', '\u{1e08f}'), ('𞄀', '𞄬'), ('\u{1e130}', '𞄽'), ('𞅀', '𞅉'), ('𞅎', '𞅎'), ('𞊐', 
'\u{1e2ae}'), ('𞋀', '𞋹'), ('𞓐', '𞓹'), ('𞗐', '𞗺'), ('𞟠', '𞟦'), ('𞟨', '𞟫'), ('𞟭', '𞟮'), ('𞟰', '𞟾'), ('𞠀', '𞣄'), ('\u{1e8d0}', '\u{1e8d6}'), ('𞤀', '𞥋'), ('𞥐', '𞥙'), ('𞸀', '𞸃'), ('𞸅', '𞸟'), ('𞸡', '𞸢'), ('𞸤', '𞸤'), ('𞸧', '𞸧'), ('𞸩', '𞸲'), ('𞸴', '𞸷'), ('𞸹', '𞸹'), ('𞸻', '𞸻'), ('𞹂', '𞹂'), ('𞹇', '𞹇'), ('𞹉', '𞹉'), ('𞹋', '𞹋'), ('𞹍', '𞹏'), ('𞹑', '𞹒'), ('𞹔', '𞹔'), ('𞹗', '𞹗'), ('𞹙', '𞹙'), ('𞹛', '𞹛'), ('𞹝', '𞹝'), ('𞹟', '𞹟'), ('𞹡', '𞹢'), ('𞹤', '𞹤'), ('𞹧', '𞹪'), ('𞹬', '𞹲'), ('𞹴', '𞹷'), ('𞹹', '𞹼'), ('𞹾', '𞹾'), ('𞺀', '𞺉'), ('𞺋', '𞺛'), ('𞺡', '𞺣'), ('𞺥', '𞺩'), ('𞺫', '𞺻'), ('🄰', '🅉'), ('🅐', '🅩'), ('🅰', '🆉'), ('🯰', '🯹'), ('𠀀', '𪛟'), ('𪜀', '𫜹'), ('𫝀', '𫠝'), ('𫠠', '𬺡'), ('𬺰', '𮯠'), ('𮯰', '𮹝'), ('丽', '𪘀'), ('𰀀', '𱍊'), ('𱍐', '𲎯'), ('\u{e0100}', '\u{e01ef}'), ]; regex-automata-0.4.9/src/util/utf8.rs000064400000000000000000000161721046102023000155700ustar 00000000000000/*! Utilities for dealing with UTF-8. This module provides some UTF-8 related helper routines, including an incremental decoder. */ /// Returns true if and only if the given byte is considered a word character. /// This only applies to ASCII. /// /// This was copied from regex-syntax so that we can use it to determine the /// starting DFA state while searching without depending on regex-syntax. The /// definition is never going to change, so there's no maintenance/bit-rot /// hazard here. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_word_byte(b: u8) -> bool { const fn mkwordset() -> [bool; 256] { // FIXME: Use as_usize() once const functions in traits are stable. let mut set = [false; 256]; set[b'_' as usize] = true; let mut byte = b'0'; while byte <= b'9' { set[byte as usize] = true; byte += 1; } byte = b'A'; while byte <= b'Z' { set[byte as usize] = true; byte += 1; } byte = b'a'; while byte <= b'z' { set[byte as usize] = true; byte += 1; } set } const WORD: [bool; 256] = mkwordset(); WORD[b as usize] } /// Decodes the next UTF-8 encoded codepoint from the given byte slice. 
/// /// If no valid encoding of a codepoint exists at the beginning of the given /// byte slice, then the first byte is returned instead. /// /// This returns `None` if and only if `bytes` is empty. /// /// This never panics. /// /// *WARNING*: This is not designed for performance. If you're looking for a /// fast UTF-8 decoder, this is not it. If you feel like you need one in this /// crate, then please file an issue and discuss your use case. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn decode(bytes: &[u8]) -> Option> { if bytes.is_empty() { return None; } let len = match len(bytes[0]) { None => return Some(Err(bytes[0])), Some(len) if len > bytes.len() => return Some(Err(bytes[0])), Some(1) => return Some(Ok(char::from(bytes[0]))), Some(len) => len, }; match core::str::from_utf8(&bytes[..len]) { Ok(s) => Some(Ok(s.chars().next().unwrap())), Err(_) => Some(Err(bytes[0])), } } /// Decodes the last UTF-8 encoded codepoint from the given byte slice. /// /// If no valid encoding of a codepoint exists at the end of the given byte /// slice, then the last byte is returned instead. /// /// This returns `None` if and only if `bytes` is empty. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn decode_last(bytes: &[u8]) -> Option> { if bytes.is_empty() { return None; } let mut start = bytes.len() - 1; let limit = bytes.len().saturating_sub(4); while start > limit && !is_leading_or_invalid_byte(bytes[start]) { start -= 1; } match decode(&bytes[start..]) { None => None, Some(Ok(ch)) => Some(Ok(ch)), Some(Err(_)) => Some(Err(bytes[bytes.len() - 1])), } } /// Given a UTF-8 leading byte, this returns the total number of code units /// in the following encoded codepoint. /// /// If the given byte is not a valid UTF-8 leading byte, then this returns /// `None`. 
#[cfg_attr(feature = "perf-inline", inline(always))] fn len(byte: u8) -> Option { if byte <= 0x7F { return Some(1); } else if byte & 0b1100_0000 == 0b1000_0000 { return None; } else if byte <= 0b1101_1111 { Some(2) } else if byte <= 0b1110_1111 { Some(3) } else if byte <= 0b1111_0111 { Some(4) } else { None } } /// Returns true if and only if the given offset in the given bytes falls on a /// valid UTF-8 encoded codepoint boundary. /// /// If `bytes` is not valid UTF-8, then the behavior of this routine is /// unspecified. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool { match bytes.get(i) { // The position at the end of the bytes always represents an empty // string, which is a valid boundary. But anything after that doesn't // make much sense to call valid a boundary. None => i == bytes.len(), // Other than ASCII (where the most significant bit is never set), // valid starting bytes always have their most significant two bits // set, where as continuation bytes never have their second most // significant bit set. Therefore, this only returns true when bytes[i] // corresponds to a byte that begins a valid UTF-8 encoding of a // Unicode scalar value. Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000, } } /// Returns true if and only if the given byte is either a valid leading UTF-8 /// byte, or is otherwise an invalid byte that can never appear anywhere in a /// valid UTF-8 sequence. #[cfg_attr(feature = "perf-inline", inline(always))] fn is_leading_or_invalid_byte(b: u8) -> bool { // In the ASCII case, the most significant bit is never set. The leading // byte of a 2/3/4-byte sequence always has the top two most significant // bits set. 
For bytes that can never appear anywhere in valid UTF-8, this // also returns true, since every such byte has its two most significant // bits set: // // \xC0 :: 11000000 // \xC1 :: 11000001 // \xF5 :: 11110101 // \xF6 :: 11110110 // \xF7 :: 11110111 // \xF8 :: 11111000 // \xF9 :: 11111001 // \xFA :: 11111010 // \xFB :: 11111011 // \xFC :: 11111100 // \xFD :: 11111101 // \xFE :: 11111110 // \xFF :: 11111111 (b & 0b1100_0000) != 0b1000_0000 } /* /// Returns the smallest possible index of the next valid UTF-8 sequence /// starting after `i`. /// /// For all inputs, including invalid UTF-8 and any value of `i`, the return /// value is guaranteed to be greater than `i`. (If there is no value greater /// than `i` that fits in `usize`, then this panics.) /// /// Generally speaking, this should only be called on `text` when it is /// permitted to assume that it is valid UTF-8 and where either `i >= /// text.len()` or where `text[i]` is a leading byte of a UTF-8 sequence. /// /// NOTE: This method was used in a previous conception of iterators where we /// specifically tried to skip over empty matches that split a codepoint by /// simply requiring that our next search begin at the beginning of codepoint. /// But we ended up changing that technique to always advance by 1 byte and /// then filter out matches that split a codepoint after-the-fact. Thus, we no /// longer use this method. But I've kept it around in case we want to switch /// back to this approach. Its guarantees are a little subtle, so I'd prefer /// not to rebuild it from whole cloth. pub(crate) fn next(text: &[u8], i: usize) -> usize { let b = match text.get(i) { None => return i.checked_add(1).unwrap(), Some(&b) => b, }; // For cases where we see an invalid UTF-8 byte, there isn't much we can do // other than just start at the next byte. 
let inc = len(b).unwrap_or(1); i.checked_add(inc).unwrap() } */ regex-automata-0.4.9/src/util/wire.rs000064400000000000000000001056501046102023000156500ustar 00000000000000/*! Types and routines that support the wire format of finite automata. Currently, this module just exports a few error types and some small helpers for deserializing [dense DFAs](crate::dfa::dense::DFA) using correct alignment. */ /* A collection of helper functions, types and traits for serializing automata. This crate defines its own bespoke serialization mechanism for some structures provided in the public API, namely, DFAs. A bespoke mechanism was developed primarily because structures like automata demand a specific binary format. Attempting to encode their rich structure in an existing serialization format is just not feasible. Moreover, the format for each structure is generally designed such that deserialization is cheap. More specifically, that deserialization can be done in constant time. (The idea being that you can embed it into your binary or mmap it, and then use it immediately.) In order to achieve this, the dense and sparse DFAs in this crate use an in-memory representation that very closely corresponds to its binary serialized form. This pervades and complicates everything, and in some cases, requires dealing with alignment and reasoning about safety. This technique does have major advantages. In particular, it permits doing the potentially costly work of compiling a finite state machine in an offline manner, and then loading it at runtime not only without having to re-compile the regex, but even without the code required to do the compilation. This, for example, permits one to use a pre-compiled DFA not only in environments without Rust's standard library, but also in environments without a heap. In the code below, whenever we insert some kind of padding, it's to enforce a 4-byte alignment, unless otherwise noted. Namely, u32 is the only state ID type supported. 
(In a previous version of this library, DFAs were generic over the state ID representation.) Also, serialization generally requires the caller to specify endianness, where as deserialization always assumes native endianness (otherwise cheap deserialization would be impossible). This implies that serializing a structure generally requires serializing both its big-endian and little-endian variants, and then loading the correct one based on the target's endianness. */ use core::{cmp, mem::size_of}; #[cfg(feature = "alloc")] use alloc::{vec, vec::Vec}; use crate::util::{ int::Pointer, primitives::{PatternID, PatternIDError, StateID, StateIDError}, }; /// A hack to align a smaller type `B` with a bigger type `T`. /// /// The usual use of this is with `B = [u8]` and `T = u32`. That is, /// it permits aligning a sequence of bytes on a 4-byte boundary. This /// is useful in contexts where one wants to embed a serialized [dense /// DFA](crate::dfa::dense::DFA) into a Rust a program while guaranteeing the /// alignment required for the DFA. /// /// See [`dense::DFA::from_bytes`](crate::dfa::dense::DFA::from_bytes) for an /// example of how to use this type. #[repr(C)] #[derive(Debug)] pub struct AlignAs { /// A zero-sized field indicating the alignment we want. pub _align: [T; 0], /// A possibly non-sized field containing a sequence of bytes. pub bytes: B, } /// An error that occurs when serializing an object from this crate. /// /// Serialization, as used in this crate, universally refers to the process /// of transforming a structure (like a DFA) into a custom binary format /// represented by `&[u8]`. To this end, serialization is generally infallible. /// However, it can fail when caller provided buffer sizes are too small. When /// that occurs, a serialization error is reported. /// /// A `SerializeError` provides no introspection capabilities. Its only /// supported operation is conversion to a human readable error message. 
/// /// This error type implements the `std::error::Error` trait only when the /// `std` feature is enabled. Otherwise, this type is defined in all /// configurations. #[derive(Debug)] pub struct SerializeError { /// The name of the thing that a buffer is too small for. /// /// Currently, the only kind of serialization error is one that is /// committed by a caller: providing a destination buffer that is too /// small to fit the serialized object. This makes sense conceptually, /// since every valid inhabitant of a type should be serializable. /// /// This is somewhat exposed in the public API of this crate. For example, /// the `to_bytes_{big,little}_endian` APIs return a `Vec` and are /// guaranteed to never panic or error. This is only possible because the /// implementation guarantees that it will allocate a `Vec` that is /// big enough. /// /// In summary, if a new serialization error kind needs to be added, then /// it will need careful consideration. what: &'static str, } impl SerializeError { pub(crate) fn buffer_too_small(what: &'static str) -> SerializeError { SerializeError { what } } } impl core::fmt::Display for SerializeError { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { write!(f, "destination buffer is too small to write {}", self.what) } } #[cfg(feature = "std")] impl std::error::Error for SerializeError {} /// An error that occurs when deserializing an object defined in this crate. /// /// Serialization, as used in this crate, universally refers to the process /// of transforming a structure (like a DFA) into a custom binary format /// represented by `&[u8]`. Deserialization, then, refers to the process of /// cheaply converting this binary format back to the object's in-memory /// representation as defined in this crate. To the extent possible, /// deserialization will report this error whenever this process fails. /// /// A `DeserializeError` provides no introspection capabilities. 
Its only /// supported operation is conversion to a human readable error message. /// /// This error type implements the `std::error::Error` trait only when the /// `std` feature is enabled. Otherwise, this type is defined in all /// configurations. #[derive(Debug)] pub struct DeserializeError(DeserializeErrorKind); #[derive(Debug)] enum DeserializeErrorKind { Generic { msg: &'static str }, BufferTooSmall { what: &'static str }, InvalidUsize { what: &'static str }, VersionMismatch { expected: u32, found: u32 }, EndianMismatch { expected: u32, found: u32 }, AlignmentMismatch { alignment: usize, address: usize }, LabelMismatch { expected: &'static str }, ArithmeticOverflow { what: &'static str }, PatternID { err: PatternIDError, what: &'static str }, StateID { err: StateIDError, what: &'static str }, } impl DeserializeError { pub(crate) fn generic(msg: &'static str) -> DeserializeError { DeserializeError(DeserializeErrorKind::Generic { msg }) } pub(crate) fn buffer_too_small(what: &'static str) -> DeserializeError { DeserializeError(DeserializeErrorKind::BufferTooSmall { what }) } fn invalid_usize(what: &'static str) -> DeserializeError { DeserializeError(DeserializeErrorKind::InvalidUsize { what }) } fn version_mismatch(expected: u32, found: u32) -> DeserializeError { DeserializeError(DeserializeErrorKind::VersionMismatch { expected, found, }) } fn endian_mismatch(expected: u32, found: u32) -> DeserializeError { DeserializeError(DeserializeErrorKind::EndianMismatch { expected, found, }) } fn alignment_mismatch( alignment: usize, address: usize, ) -> DeserializeError { DeserializeError(DeserializeErrorKind::AlignmentMismatch { alignment, address, }) } fn label_mismatch(expected: &'static str) -> DeserializeError { DeserializeError(DeserializeErrorKind::LabelMismatch { expected }) } fn arithmetic_overflow(what: &'static str) -> DeserializeError { DeserializeError(DeserializeErrorKind::ArithmeticOverflow { what }) } fn pattern_id_error( err: PatternIDError, what: 
&'static str, ) -> DeserializeError { DeserializeError(DeserializeErrorKind::PatternID { err, what }) } pub(crate) fn state_id_error( err: StateIDError, what: &'static str, ) -> DeserializeError { DeserializeError(DeserializeErrorKind::StateID { err, what }) } } #[cfg(feature = "std")] impl std::error::Error for DeserializeError {} impl core::fmt::Display for DeserializeError { fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result { use self::DeserializeErrorKind::*; match self.0 { Generic { msg } => write!(f, "{}", msg), BufferTooSmall { what } => { write!(f, "buffer is too small to read {}", what) } InvalidUsize { what } => { write!(f, "{} is too big to fit in a usize", what) } VersionMismatch { expected, found } => write!( f, "unsupported version: \ expected version {} but found version {}", expected, found, ), EndianMismatch { expected, found } => write!( f, "endianness mismatch: expected 0x{:X} but got 0x{:X}. \ (Are you trying to load an object serialized with a \ different endianness?)", expected, found, ), AlignmentMismatch { alignment, address } => write!( f, "alignment mismatch: slice starts at address \ 0x{:X}, which is not aligned to a {} byte boundary", address, alignment, ), LabelMismatch { expected } => write!( f, "label mismatch: start of serialized object should \ contain a NUL terminated {:?} label, but a different \ label was found", expected, ), ArithmeticOverflow { what } => { write!(f, "arithmetic overflow for {}", what) } PatternID { ref err, what } => { write!(f, "failed to read pattern ID for {}: {}", what, err) } StateID { ref err, what } => { write!(f, "failed to read state ID for {}: {}", what, err) } } } } /// Safely converts a `&[u32]` to `&[StateID]` with zero cost. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn u32s_to_state_ids(slice: &[u32]) -> &[StateID] { // SAFETY: This is safe because StateID is defined to have the same memory // representation as a u32 (it is repr(transparent)). 
While not every u32 // is a "valid" StateID, callers are not permitted to rely on the validity // of StateIDs for memory safety. It can only lead to logical errors. (This // is why StateID::new_unchecked is safe.) unsafe { core::slice::from_raw_parts( slice.as_ptr().cast::(), slice.len(), ) } } /// Safely converts a `&mut [u32]` to `&mut [StateID]` with zero cost. pub(crate) fn u32s_to_state_ids_mut(slice: &mut [u32]) -> &mut [StateID] { // SAFETY: This is safe because StateID is defined to have the same memory // representation as a u32 (it is repr(transparent)). While not every u32 // is a "valid" StateID, callers are not permitted to rely on the validity // of StateIDs for memory safety. It can only lead to logical errors. (This // is why StateID::new_unchecked is safe.) unsafe { core::slice::from_raw_parts_mut( slice.as_mut_ptr().cast::(), slice.len(), ) } } /// Safely converts a `&[u32]` to `&[PatternID]` with zero cost. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn u32s_to_pattern_ids(slice: &[u32]) -> &[PatternID] { // SAFETY: This is safe because PatternID is defined to have the same // memory representation as a u32 (it is repr(transparent)). While not // every u32 is a "valid" PatternID, callers are not permitted to rely // on the validity of PatternIDs for memory safety. It can only lead to // logical errors. (This is why PatternID::new_unchecked is safe.) unsafe { core::slice::from_raw_parts( slice.as_ptr().cast::(), slice.len(), ) } } /// Checks that the given slice has an alignment that matches `T`. /// /// This is useful for checking that a slice has an appropriate alignment /// before casting it to a &[T]. Note though that alignment is not itself /// sufficient to perform the cast for any `T`. 
pub(crate) fn check_alignment( slice: &[u8], ) -> Result<(), DeserializeError> { let alignment = core::mem::align_of::(); let address = slice.as_ptr().as_usize(); if address % alignment == 0 { return Ok(()); } Err(DeserializeError::alignment_mismatch(alignment, address)) } /// Reads a possibly empty amount of padding, up to 7 bytes, from the beginning /// of the given slice. All padding bytes must be NUL bytes. /// /// This is useful because it can be theoretically necessary to pad the /// beginning of a serialized object with NUL bytes to ensure that it starts /// at a correctly aligned address. These padding bytes should come immediately /// before the label. /// /// This returns the number of bytes read from the given slice. pub(crate) fn skip_initial_padding(slice: &[u8]) -> usize { let mut nread = 0; while nread < 7 && nread < slice.len() && slice[nread] == 0 { nread += 1; } nread } /// Allocate a byte buffer of the given size, along with some initial padding /// such that `buf[padding..]` has the same alignment as `T`, where the /// alignment of `T` must be at most `8`. In particular, callers should treat /// the first N bytes (second return value) as padding bytes that must not be /// overwritten. In all cases, the following identity holds: /// /// ```ignore /// let (buf, padding) = alloc_aligned_buffer::(SIZE); /// assert_eq!(SIZE, buf[padding..].len()); /// ``` /// /// In practice, padding is often zero. /// /// The requirement for `8` as a maximum here is somewhat arbitrary. In /// practice, we never need anything bigger in this crate, and so this function /// does some sanity asserts under the assumption of a max alignment of `8`. #[cfg(feature = "alloc")] pub(crate) fn alloc_aligned_buffer(size: usize) -> (Vec, usize) { // NOTE: This is a kludge because there's no easy way to allocate a Vec // with an alignment guaranteed to be greater than 1. 
We could create a // Vec, but this cannot be safely transmuted to a Vec without // concern, since reallocing or dropping the Vec is UB (different // alignment than the initial allocation). We could define a wrapper type // to manage this for us, but it seems like more machinery than it's worth. let buf = vec![0; size]; let align = core::mem::align_of::(); let address = buf.as_ptr().as_usize(); if address % align == 0 { return (buf, 0); } // Let's try this again. We have to create a totally new alloc with // the maximum amount of bytes we might need. We can't just extend our // pre-existing 'buf' because that might create a new alloc with a // different alignment. let extra = align - 1; let mut buf = vec![0; size + extra]; let address = buf.as_ptr().as_usize(); // The code below handles the case where 'address' is aligned to T, so if // we got lucky and 'address' is now aligned to T (when it previously // wasn't), then we're done. if address % align == 0 { buf.truncate(size); return (buf, 0); } let padding = ((address & !(align - 1)).checked_add(align).unwrap()) .checked_sub(address) .unwrap(); assert!(padding <= 7, "padding of {} is bigger than 7", padding); assert!( padding <= extra, "padding of {} is bigger than extra {} bytes", padding, extra ); buf.truncate(size + padding); assert_eq!(size + padding, buf.len()); assert_eq!( 0, buf[padding..].as_ptr().as_usize() % align, "expected end of initial padding to be aligned to {}", align, ); (buf, padding) } /// Reads a NUL terminated label starting at the beginning of the given slice. /// /// If a NUL terminated label could not be found, then an error is returned. /// Similarly, if a label is found but doesn't match the expected label, then /// an error is returned. /// /// Upon success, the total number of bytes read (including padding bytes) is /// returned. pub(crate) fn read_label( slice: &[u8], expected_label: &'static str, ) -> Result { // Set an upper bound on how many bytes we scan for a NUL. 
Since no label // in this crate is longer than 256 bytes, if we can't find one within that // range, then we have corrupted data. let first_nul = slice[..cmp::min(slice.len(), 256)].iter().position(|&b| b == 0); let first_nul = match first_nul { Some(first_nul) => first_nul, None => { return Err(DeserializeError::generic( "could not find NUL terminated label \ at start of serialized object", )); } }; let len = first_nul + padding_len(first_nul); if slice.len() < len { return Err(DeserializeError::generic( "could not find properly sized label at start of serialized object" )); } if expected_label.as_bytes() != &slice[..first_nul] { return Err(DeserializeError::label_mismatch(expected_label)); } Ok(len) } /// Writes the given label to the buffer as a NUL terminated string. The label /// given must not contain NUL, otherwise this will panic. Similarly, the label /// must not be longer than 255 bytes, otherwise this will panic. /// /// Additional NUL bytes are written as necessary to ensure that the number of /// bytes written is always a multiple of 4. /// /// Upon success, the total number of bytes written (including padding) is /// returned. pub(crate) fn write_label( label: &str, dst: &mut [u8], ) -> Result { let nwrite = write_label_len(label); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small("label")); } dst[..label.len()].copy_from_slice(label.as_bytes()); for i in 0..(nwrite - label.len()) { dst[label.len() + i] = 0; } assert_eq!(nwrite % 4, 0); Ok(nwrite) } /// Returns the total number of bytes (including padding) that would be written /// for the given label. This panics if the given label contains a NUL byte or /// is longer than 255 bytes. (The size restriction exists so that searching /// for a label during deserialization can be done in small bounded space.) 
pub(crate) fn write_label_len(label: &str) -> usize { if label.len() > 255 { panic!("label must not be longer than 255 bytes"); } if label.as_bytes().iter().position(|&b| b == 0).is_some() { panic!("label must not contain NUL bytes"); } let label_len = label.len() + 1; // +1 for the NUL terminator label_len + padding_len(label_len) } /// Reads the endianness check from the beginning of the given slice and /// confirms that the endianness of the serialized object matches the expected /// endianness. If the slice is too small or if the endianness check fails, /// this returns an error. /// /// Upon success, the total number of bytes read is returned. pub(crate) fn read_endianness_check( slice: &[u8], ) -> Result { let (n, nr) = try_read_u32(slice, "endianness check")?; assert_eq!(nr, write_endianness_check_len()); if n != 0xFEFF { return Err(DeserializeError::endian_mismatch(0xFEFF, n)); } Ok(nr) } /// Writes 0xFEFF as an integer using the given endianness. /// /// This is useful for writing into the header of a serialized object. It can /// be read during deserialization as a sanity check to ensure the proper /// endianness is used. /// /// Upon success, the total number of bytes written is returned. pub(crate) fn write_endianness_check( dst: &mut [u8], ) -> Result { let nwrite = write_endianness_check_len(); if dst.len() < nwrite { return Err(SerializeError::buffer_too_small("endianness check")); } E::write_u32(0xFEFF, dst); Ok(nwrite) } /// Returns the number of bytes written by the endianness check. pub(crate) fn write_endianness_check_len() -> usize { size_of::() } /// Reads a version number from the beginning of the given slice and confirms /// that is matches the expected version number given. If the slice is too /// small or if the version numbers aren't equivalent, this returns an error. /// /// Upon success, the total number of bytes read is returned. /// /// N.B. Currently, we require that the version number is exactly equivalent. 
/// In the future, if we bump the version number without a semver bump, then
/// we'll need to relax this a bit and support older versions.
pub(crate) fn read_version(
    slice: &[u8],
    expected_version: u32,
) -> Result<usize, DeserializeError> {
    let (n, nr) = try_read_u32(slice, "version")?;
    // The reader and the writer must agree on the width of the version.
    assert_eq!(nr, write_version_len());
    if n != expected_version {
        return Err(DeserializeError::version_mismatch(expected_version, n));
    }
    Ok(nr)
}

/// Writes the given version number to the beginning of the given slice.
///
/// This is useful for writing into the header of a serialized object. It can
/// be read during deserialization as a sanity check to ensure that the library
/// code supports the format of the serialized object.
///
/// If `dst` is too small to hold the version number, a "buffer too small"
/// error is returned and nothing is written.
///
/// Upon success, the total number of bytes written is returned.
pub(crate) fn write_version<E: Endian>(
    version: u32,
    dst: &mut [u8],
) -> Result<usize, SerializeError> {
    let nwrite = write_version_len();
    if dst.len() < nwrite {
        return Err(SerializeError::buffer_too_small("version number"));
    }
    E::write_u32(version, dst);
    Ok(nwrite)
}

/// Returns the number of bytes written by writing the version number.
pub(crate) fn write_version_len() -> usize {
    size_of::<u32>()
}

/// Reads a pattern ID from the given slice. If the slice has insufficient
/// length, then this panics. If the deserialized integer exceeds the pattern
/// ID limit for the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn read_pattern_id(
    slice: &[u8],
    what: &'static str,
) -> Result<(PatternID, usize), DeserializeError> {
    // Slicing panics when `slice` is too short; after that the conversion to
    // a fixed-size array cannot fail because the lengths match exactly.
    let bytes: [u8; PatternID::SIZE] =
        slice[..PatternID::SIZE].try_into().unwrap();
    let pid = PatternID::from_ne_bytes(bytes)
        .map_err(|err| DeserializeError::pattern_id_error(err, what))?;
    Ok((pid, PatternID::SIZE))
}

/// Reads a pattern ID from the given slice. If the slice has insufficient
/// length, then this panics. Otherwise, the deserialized integer is assumed
/// to be a valid pattern ID.
///
/// This also returns the number of bytes read.
pub(crate) fn read_pattern_id_unchecked(slice: &[u8]) -> (PatternID, usize) {
    // Slicing panics when `slice` is too short; the array conversion itself
    // cannot fail because the lengths match exactly.
    let pid = PatternID::from_ne_bytes_unchecked(
        slice[..PatternID::SIZE].try_into().unwrap(),
    );
    (pid, PatternID::SIZE)
}

/// Write the given pattern ID to the beginning of the given slice of bytes
/// using the specified endianness. The given slice must have length at least
/// `PatternID::SIZE`, or else this panics. Upon success, the total number of
/// bytes written is returned.
pub(crate) fn write_pattern_id<E: Endian>(
    pid: PatternID,
    dst: &mut [u8],
) -> usize {
    E::write_u32(pid.as_u32(), dst);
    PatternID::SIZE
}

/// Attempts to read a state ID from the given slice. If the slice has an
/// insufficient number of bytes or if the state ID exceeds the limit for
/// the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn try_read_state_id(
    slice: &[u8],
    what: &'static str,
) -> Result<(StateID, usize), DeserializeError> {
    // Check the length up front so that `read_state_id` cannot panic.
    if slice.len() < StateID::SIZE {
        return Err(DeserializeError::buffer_too_small(what));
    }
    read_state_id(slice, what)
}

/// Reads a state ID from the given slice. If the slice has insufficient
/// length, then this panics. If the deserialized integer exceeds the state ID
/// limit for the current target, then this returns an error.
///
/// Upon success, this also returns the number of bytes read.
pub(crate) fn read_state_id(
    slice: &[u8],
    what: &'static str,
) -> Result<(StateID, usize), DeserializeError> {
    // Slicing panics when `slice` is too short; the array conversion itself
    // cannot fail because the lengths match exactly.
    let bytes: [u8; StateID::SIZE] =
        slice[..StateID::SIZE].try_into().unwrap();
    let sid = StateID::from_ne_bytes(bytes)
        .map_err(|err| DeserializeError::state_id_error(err, what))?;
    Ok((sid, StateID::SIZE))
}

/// Reads a state ID from the given slice. If the slice has insufficient
/// length, then this panics. Otherwise, the deserialized integer is assumed
/// to be a valid state ID.
///
/// This also returns the number of bytes read.
pub(crate) fn read_state_id_unchecked(slice: &[u8]) -> (StateID, usize) { let sid = StateID::from_ne_bytes_unchecked( slice[..StateID::SIZE].try_into().unwrap(), ); (sid, StateID::SIZE) } /// Write the given state ID to the beginning of the given slice of bytes /// using the specified endianness. The given slice must have length at least /// `StateID::SIZE`, or else this panics. Upon success, the total number of /// bytes written is returned. pub(crate) fn write_state_id( sid: StateID, dst: &mut [u8], ) -> usize { E::write_u32(sid.as_u32(), dst); StateID::SIZE } /// Try to read a u16 as a usize from the beginning of the given slice in /// native endian format. If the slice has fewer than 2 bytes or if the /// deserialized number cannot be represented by usize, then this returns an /// error. The error message will include the `what` description of what is /// being deserialized, for better error messages. `what` should be a noun in /// singular form. /// /// Upon success, this also returns the number of bytes read. pub(crate) fn try_read_u16_as_usize( slice: &[u8], what: &'static str, ) -> Result<(usize, usize), DeserializeError> { try_read_u16(slice, what).and_then(|(n, nr)| { usize::try_from(n) .map(|n| (n, nr)) .map_err(|_| DeserializeError::invalid_usize(what)) }) } /// Try to read a u32 as a usize from the beginning of the given slice in /// native endian format. If the slice has fewer than 4 bytes or if the /// deserialized number cannot be represented by usize, then this returns an /// error. The error message will include the `what` description of what is /// being deserialized, for better error messages. `what` should be a noun in /// singular form. /// /// Upon success, this also returns the number of bytes read. 
pub(crate) fn try_read_u32_as_usize( slice: &[u8], what: &'static str, ) -> Result<(usize, usize), DeserializeError> { try_read_u32(slice, what).and_then(|(n, nr)| { usize::try_from(n) .map(|n| (n, nr)) .map_err(|_| DeserializeError::invalid_usize(what)) }) } /// Try to read a u16 from the beginning of the given slice in native endian /// format. If the slice has fewer than 2 bytes, then this returns an error. /// The error message will include the `what` description of what is being /// deserialized, for better error messages. `what` should be a noun in /// singular form. /// /// Upon success, this also returns the number of bytes read. pub(crate) fn try_read_u16( slice: &[u8], what: &'static str, ) -> Result<(u16, usize), DeserializeError> { check_slice_len(slice, size_of::(), what)?; Ok((read_u16(slice), size_of::())) } /// Try to read a u32 from the beginning of the given slice in native endian /// format. If the slice has fewer than 4 bytes, then this returns an error. /// The error message will include the `what` description of what is being /// deserialized, for better error messages. `what` should be a noun in /// singular form. /// /// Upon success, this also returns the number of bytes read. pub(crate) fn try_read_u32( slice: &[u8], what: &'static str, ) -> Result<(u32, usize), DeserializeError> { check_slice_len(slice, size_of::(), what)?; Ok((read_u32(slice), size_of::())) } /// Try to read a u128 from the beginning of the given slice in native endian /// format. If the slice has fewer than 16 bytes, then this returns an error. /// The error message will include the `what` description of what is being /// deserialized, for better error messages. `what` should be a noun in /// singular form. /// /// Upon success, this also returns the number of bytes read. 
pub(crate) fn try_read_u128( slice: &[u8], what: &'static str, ) -> Result<(u128, usize), DeserializeError> { check_slice_len(slice, size_of::(), what)?; Ok((read_u128(slice), size_of::())) } /// Read a u16 from the beginning of the given slice in native endian format. /// If the slice has fewer than 2 bytes, then this panics. /// /// Marked as inline to speed up sparse searching which decodes integers from /// its automaton at search time. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn read_u16(slice: &[u8]) -> u16 { let bytes: [u8; 2] = slice[..size_of::()].try_into().unwrap(); u16::from_ne_bytes(bytes) } /// Read a u32 from the beginning of the given slice in native endian format. /// If the slice has fewer than 4 bytes, then this panics. /// /// Marked as inline to speed up sparse searching which decodes integers from /// its automaton at search time. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn read_u32(slice: &[u8]) -> u32 { let bytes: [u8; 4] = slice[..size_of::()].try_into().unwrap(); u32::from_ne_bytes(bytes) } /// Read a u128 from the beginning of the given slice in native endian format. /// If the slice has fewer than 16 bytes, then this panics. pub(crate) fn read_u128(slice: &[u8]) -> u128 { let bytes: [u8; 16] = slice[..size_of::()].try_into().unwrap(); u128::from_ne_bytes(bytes) } /// Checks that the given slice has some minimal length. If it's smaller than /// the bound given, then a "buffer too small" error is returned with `what` /// describing what the buffer represents. pub(crate) fn check_slice_len( slice: &[T], at_least_len: usize, what: &'static str, ) -> Result<(), DeserializeError> { if slice.len() < at_least_len { return Err(DeserializeError::buffer_too_small(what)); } Ok(()) } /// Multiply the given numbers, and on overflow, return an error that includes /// 'what' in the error message. /// /// This is useful when doing arithmetic with untrusted data. 
pub(crate) fn mul( a: usize, b: usize, what: &'static str, ) -> Result { match a.checked_mul(b) { Some(c) => Ok(c), None => Err(DeserializeError::arithmetic_overflow(what)), } } /// Add the given numbers, and on overflow, return an error that includes /// 'what' in the error message. /// /// This is useful when doing arithmetic with untrusted data. pub(crate) fn add( a: usize, b: usize, what: &'static str, ) -> Result { match a.checked_add(b) { Some(c) => Ok(c), None => Err(DeserializeError::arithmetic_overflow(what)), } } /// Shift `a` left by `b`, and on overflow, return an error that includes /// 'what' in the error message. /// /// This is useful when doing arithmetic with untrusted data. pub(crate) fn shl( a: usize, b: usize, what: &'static str, ) -> Result { let amount = u32::try_from(b) .map_err(|_| DeserializeError::arithmetic_overflow(what))?; match a.checked_shl(amount) { Some(c) => Ok(c), None => Err(DeserializeError::arithmetic_overflow(what)), } } /// Returns the number of additional bytes required to add to the given length /// in order to make the total length a multiple of 4. The return value is /// always less than 4. pub(crate) fn padding_len(non_padding_len: usize) -> usize { (4 - (non_padding_len & 0b11)) & 0b11 } /// A simple trait for writing code generic over endianness. /// /// This is similar to what byteorder provides, but we only need a very small /// subset. pub(crate) trait Endian { /// Writes a u16 to the given destination buffer in a particular /// endianness. If the destination buffer has a length smaller than 2, then /// this panics. fn write_u16(n: u16, dst: &mut [u8]); /// Writes a u32 to the given destination buffer in a particular /// endianness. If the destination buffer has a length smaller than 4, then /// this panics. fn write_u32(n: u32, dst: &mut [u8]); /// Writes a u128 to the given destination buffer in a particular /// endianness. If the destination buffer has a length smaller than 16, /// then this panics. 
fn write_u128(n: u128, dst: &mut [u8]); } /// Little endian writing. pub(crate) enum LE {} /// Big endian writing. pub(crate) enum BE {} #[cfg(target_endian = "little")] pub(crate) type NE = LE; #[cfg(target_endian = "big")] pub(crate) type NE = BE; impl Endian for LE { fn write_u16(n: u16, dst: &mut [u8]) { dst[..2].copy_from_slice(&n.to_le_bytes()); } fn write_u32(n: u32, dst: &mut [u8]) { dst[..4].copy_from_slice(&n.to_le_bytes()); } fn write_u128(n: u128, dst: &mut [u8]) { dst[..16].copy_from_slice(&n.to_le_bytes()); } } impl Endian for BE { fn write_u16(n: u16, dst: &mut [u8]) { dst[..2].copy_from_slice(&n.to_be_bytes()); } fn write_u32(n: u32, dst: &mut [u8]) { dst[..4].copy_from_slice(&n.to_be_bytes()); } fn write_u128(n: u128, dst: &mut [u8]) { dst[..16].copy_from_slice(&n.to_be_bytes()); } } #[cfg(all(test, feature = "alloc"))] mod tests { use super::*; #[test] fn labels() { let mut buf = [0; 1024]; let nwrite = write_label("fooba", &mut buf).unwrap(); assert_eq!(nwrite, 8); assert_eq!(&buf[..nwrite], b"fooba\x00\x00\x00"); let nread = read_label(&buf, "fooba").unwrap(); assert_eq!(nread, 8); } #[test] #[should_panic] fn bad_label_interior_nul() { // interior NULs are not allowed write_label("foo\x00bar", &mut [0; 1024]).unwrap(); } #[test] fn bad_label_almost_too_long() { // ok write_label(&"z".repeat(255), &mut [0; 1024]).unwrap(); } #[test] #[should_panic] fn bad_label_too_long() { // labels longer than 255 bytes are banned write_label(&"z".repeat(256), &mut [0; 1024]).unwrap(); } #[test] fn padding() { assert_eq!(0, padding_len(8)); assert_eq!(3, padding_len(9)); assert_eq!(2, padding_len(10)); assert_eq!(1, padding_len(11)); assert_eq!(0, padding_len(12)); assert_eq!(3, padding_len(13)); assert_eq!(2, padding_len(14)); assert_eq!(1, padding_len(15)); assert_eq!(0, padding_len(16)); } } regex-automata-0.4.9/test000075500000000000000000000071011046102023000134650ustar 00000000000000#!/bin/bash # This is a script that attempts to *approximately* 
exhaustively run the test # suite for regex-automata. The main reason for why 'cargo test' isn't enough # is because of crate features. regex-automata has a ton of them. This script # tests many of those feature combinations (although not all) to try to get # decent coverage in a finite amount of time. set -e # cd to the directory containing this crate's Cargo.toml so that we don't need # to pass --manifest-path to every `cargo` command. cd "$(dirname "$0")" echo "===== ALL FEATURES TEST ===" cargo test --all-features # Man I don't *want* to have this many crate features, but... I really want # folks to be able to slim the crate down to just the things they want. But # the main downside is that I just can't feasibly test every combination of # features because there are too many of them. Sad, but I'm not sure if there # is a better alternative. features=( "" "unicode-word-boundary" "unicode-word-boundary,syntax,unicode-perl" "unicode-word-boundary,syntax,dfa-build" "nfa" "dfa" "hybrid" "nfa,dfa" "nfa,hybrid" "dfa,hybrid" "dfa-onepass" "nfa-pikevm" "nfa-backtrack" "std" "alloc" "syntax" "syntax,nfa-pikevm" "syntax,hybrid" "perf-literal-substring" "perf-literal-multisubstring" "meta" "meta,nfa-backtrack" "meta,hybrid" "meta,dfa-build" "meta,dfa-onepass" "meta,nfa,dfa,hybrid,nfa-backtrack" "meta,nfa,dfa,hybrid,nfa-backtrack,perf-literal-substring" "meta,nfa,dfa,hybrid,nfa-backtrack,perf-literal-multisubstring" ) for f in "${features[@]}"; do echo "===== LIB FEATURES: $f ===" # It's actually important to do a standard 'cargo build' in addition to a # 'cargo test'. In particular, in the latter case, the dev-dependencies may # wind up enabling features in dependencies (like memchr) that make it look # like everything is well, but actually isn't. For example, the 'regex-test' # dev-dependency uses 'bstr' and enables its 'std' feature, which in turn # unconditionally enables 'memchr's 'std' feature. 
Since we're specifically # looking to test that certain feature combinations work as expected, this # can lead to things testing okay, but would actually fail to build. Yikes. cargo build --no-default-features --lib --features "$f" cargo test --no-default-features --lib --features "$f" done # We can also run the integration test suite on stripped down features too. # But the test suite doesn't do well with things like 'std' and 'unicode' # disabled, so we always enable them. features=( "std,unicode,syntax,nfa-pikevm" "std,unicode,syntax,nfa-backtrack" "std,unicode,syntax,hybrid" "std,unicode,syntax,dfa-onepass" "std,unicode,syntax,dfa-search" "std,unicode,syntax,dfa-build" "std,unicode,meta" # This one is a little tricky because it causes the backtracker to get used # in more instances and results in failing tests for the 'earliest' tests. # The actual results are semantically consistent with the API guarantee # (the backtracker tends to report greater offsets because it isn't an FSM), # but our tests are less flexible than the API guarantee and demand offsets # reported by FSM regex engines. (Which is... all of them except for the # backtracker.) # "std,unicode,meta,nfa-backtrack" "std,unicode,meta,hybrid" "std,unicode,meta,dfa-onepass" "std,unicode,meta,dfa-build" "std,unicode,meta,nfa,dfa-onepass,hybrid" ) for f in "${features[@]}"; do echo "===== INTEGRATION FEATURES: $f ===" cargo build --no-default-features --lib --features "$f" cargo test --no-default-features --test integration --features "$f" done regex-automata-0.4.9/tests/dfa/api.rs000064400000000000000000000037511046102023000156020ustar 00000000000000use std::error::Error; use regex_automata::{ dfa::{dense, Automaton, OverlappingState}, nfa::thompson, HalfMatch, Input, MatchError, }; // Tests that quit bytes in the forward direction work correctly. 
#[test]
fn quit_fwd() -> Result<(), Box<dyn Error>> {
    // 'x' is configured as a quit byte, so the search must stop with an
    // error at offset 3 (the position of 'x') instead of reporting a match.
    let dfa = dense::Builder::new()
        .configure(dense::Config::new().quit(b'x', true))
        .build("[[:word:]]+$")?;

    assert_eq!(
        Err(MatchError::quit(b'x', 3)),
        dfa.try_search_fwd(&Input::new(b"abcxyz"))
    );
    assert_eq!(
        dfa.try_search_overlapping_fwd(
            &Input::new(b"abcxyz"),
            &mut OverlappingState::start()
        ),
        Err(MatchError::quit(b'x', 3)),
    );

    Ok(())
}

// Tests that quit bytes in the reverse direction work correctly.
#[test]
fn quit_rev() -> Result<(), Box<dyn Error>> {
    // The NFA is compiled in reverse, so the search walks the haystack
    // backwards and reports the quit byte 'x' at offset 3.
    let dfa = dense::Builder::new()
        .configure(dense::Config::new().quit(b'x', true))
        .thompson(thompson::Config::new().reverse(true))
        .build("^[[:word:]]+")?;

    assert_eq!(
        Err(MatchError::quit(b'x', 3)),
        dfa.try_search_rev(&Input::new(b"abcxyz"))
    );

    Ok(())
}

// Tests that if we heuristically enable Unicode word boundaries but then
// instruct that a non-ASCII byte should NOT be a quit byte, then the builder
// will panic.
#[test]
#[should_panic]
fn quit_panics() {
    dense::Config::new().unicode_word_boundary(true).quit(b'\xFF', false);
}

// This tests an interesting case where even if the Unicode word boundary option
// is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode
// word boundaries to be enabled.
#[test]
fn unicode_word_implicitly_works() -> Result<(), Box<dyn Error>> {
    // Mark every non-ASCII byte (0x80..=0xFF) as a quit byte.
    let mut config = dense::Config::new();
    for b in 0x80..=0xFF {
        config = config.quit(b, true);
    }
    let dfa = dense::Builder::new().configure(config).build(r"\b")?;
    let expected = HalfMatch::must(0, 1);
    assert_eq!(Ok(Some(expected)), dfa.try_search_fwd(&Input::new(b" a")));
    Ok(())
}
regex-automata-0.4.9/tests/dfa/mod.rs000064400000000000000000000004151046102023000156020ustar 00000000000000
#[cfg(all(feature = "dfa-build", feature = "dfa-search"))]
mod api;
#[cfg(feature = "dfa-onepass")]
mod onepass;
#[cfg(all(feature = "dfa-build", feature = "dfa-search"))]
mod regression;
#[cfg(all(not(miri), feature = "dfa-build", feature = "dfa-search"))]
mod suite;
regex-automata-0.4.9/tests/dfa/onepass/mod.rs000064400000000000000000000000351046102023000172500ustar 00000000000000
#[cfg(not(miri))]
mod suite;
regex-automata-0.4.9/tests/dfa/onepass/suite.rs000064400000000000000000000162171046102023000176330ustar 00000000000000
use {
    anyhow::Result,
    regex_automata::{
        dfa::onepass::{self, DFA},
        nfa::thompson,
        util::{iter, syntax},
    },
    regex_test::{
        CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult,
        TestRunner,
    },
};

use crate::{create_input, suite, testify_captures, untestify_kind};

const EXPANSIONS: &[&str] = &["is_match", "find", "captures"];

/// Tests the default configuration of the one-pass DFA.
#[test]
fn default() -> Result<()> {
    let builder = DFA::builder();
    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .test_iter(suite()?.iter(), compiler(builder))
        .assert();
    Ok(())
}

/// Tests the one-pass DFA when 'starts_for_each_pattern' is enabled for all
/// tests.
#[test]
fn starts_for_each_pattern() -> Result<()> {
    let mut builder = DFA::builder();
    builder.configure(DFA::config().starts_for_each_pattern(true));
    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .test_iter(suite()?.iter(), compiler(builder))
        .assert();
    Ok(())
}

/// Tests the one-pass DFA when byte classes are disabled.
/// /// N.B. Disabling byte classes doesn't avoid any indirection at search time. /// All it does is cause every byte value to be its own distinct equivalence /// class. #[test] fn no_byte_classes() -> Result<()> { let mut builder = DFA::builder(); builder.configure(DFA::config().byte_classes(false)); TestRunner::new()? .expand(EXPANSIONS, |t| t.compiles()) .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } fn compiler( mut builder: onepass::Builder, ) -> impl FnMut(&RegexTest, &[String]) -> Result { move |test, regexes| { // Check if our regex contains things that aren't supported by DFAs. // That is, Unicode word boundaries when searching non-ASCII text. if !configure_onepass_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } let re = match builder.build_many(®exes) { Ok(re) => re, Err(err) => { let msg = err.to_string(); // This is pretty gross, but when a regex fails to compile as // a one-pass regex, then we want to be OK with that and just // skip the test. But we have to be careful to only skip it // when the expected result is that the regex compiles. If // the test is specifically checking that the regex does not // compile, then we should bubble up that error and allow the // test to pass. // // Since our error types are all generally opaque, we just // look for an error string. Not great, but not the end of the // world. 
if test.compiles() && msg.contains("not one-pass") { return Ok(CompiledRegex::skip()); } return Err(err.into()); } }; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) })) } } fn run_test( re: &DFA, cache: &mut onepass::Cache, test: &RegexTest, ) -> TestResult { let input = create_input(test); match test.additional_name() { "is_match" => { TestResult::matched(re.is_match(cache, input.earliest(true))) } "find" => match test.search_kind() { SearchKind::Earliest | SearchKind::Leftmost => { let input = input.earliest(test.search_kind() == SearchKind::Earliest); let mut caps = re.create_captures(); let it = iter::Searcher::new(input) .into_matches_iter(|input| { re.try_search(cache, input, &mut caps)?; Ok(caps.get_match()) }) .infallible() .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), span: Span { start: m.start(), end: m.end() }, }); TestResult::matches(it) } SearchKind::Overlapping => { // The one-pass DFA does not support any kind of overlapping // search. This is not just a matter of not having the API. // It's fundamentally incompatible with the one-pass concept. // If overlapping matches were possible, then the one-pass DFA // would fail to build. TestResult::skip() } }, "captures" => match test.search_kind() { SearchKind::Earliest | SearchKind::Leftmost => { let input = input.earliest(test.search_kind() == SearchKind::Earliest); let it = iter::Searcher::new(input) .into_captures_iter(re.create_captures(), |input, caps| { re.try_search(cache, input, caps) }) .infallible() .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|caps| testify_captures(&caps)); TestResult::captures(it) } SearchKind::Overlapping => { // The one-pass DFA does not support any kind of overlapping // search. This is not just a matter of not having the API. // It's fundamentally incompatible with the one-pass concept. 
// If overlapping matches were possible, then the one-pass DFA // would fail to build. TestResult::skip() } }, name => TestResult::fail(&format!("unrecognized test name: {}", name)), } } /// Configures the given regex builder with all relevant settings on the given /// regex test. /// /// If the regex test has a setting that is unsupported, then this returns /// false (implying the test should be skipped). fn configure_onepass_builder( test: &RegexTest, builder: &mut onepass::Builder, ) -> bool { if !test.anchored() { return false; } let match_kind = match untestify_kind(test.match_kind()) { None => return false, Some(k) => k, }; let config = DFA::config().match_kind(match_kind); builder .configure(config) .syntax(config_syntax(test)) .thompson(config_thompson(test)); true } /// Configuration of a Thompson NFA compiler from a regex test. fn config_thompson(test: &RegexTest) -> thompson::Config { let mut lookm = regex_automata::util::look::LookMatcher::new(); lookm.set_line_terminator(test.line_terminator()); thompson::Config::new().utf8(test.utf8()).look_matcher(lookm) } /// Configuration of the regex parser from a regex test. fn config_syntax(test: &RegexTest) -> syntax::Config { syntax::Config::new() .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) .line_terminator(test.line_terminator()) } regex-automata-0.4.9/tests/dfa/regression.rs000064400000000000000000000031331046102023000172030ustar 00000000000000// A regression test for checking that minimization correctly translates // whether a state is a match state or not. Previously, it was possible for // minimization to mark a non-matching state as matching. #[test] #[cfg(not(miri))] fn minimize_sets_correct_match_states() { use regex_automata::{ dfa::{dense::DFA, Automaton, StartKind}, Anchored, Input, }; let pattern = // This is a subset of the grapheme matching regex. I couldn't seem // to get a repro any smaller than this unfortunately. 
r"(?x) (?: \p{gcb=Prepend}* (?: (?: (?: \p{gcb=L}* (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT}) \p{gcb=T}* ) | \p{gcb=L}+ | \p{gcb=T}+ ) | \p{Extended_Pictographic} (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})* | [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}] ) [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]* ) "; let dfa = DFA::builder() .configure( DFA::config().start_kind(StartKind::Anchored).minimize(true), ) .build(pattern) .unwrap(); let input = Input::new(b"\xE2").anchored(Anchored::Yes); assert_eq!(Ok(None), dfa.try_search_fwd(&input)); } regex-automata-0.4.9/tests/dfa/suite.rs000064400000000000000000000374641046102023000161720ustar 00000000000000use { anyhow::Result, regex_automata::{ dfa::{ self, dense, regex::Regex, sparse, Automaton, OverlappingState, StartKind, }, nfa::thompson, util::{prefilter::Prefilter, syntax}, Anchored, Input, PatternSet, }, regex_test::{ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner, }, }; use crate::{create_input, suite, untestify_kind}; const EXPANSIONS: &[&str] = &["is_match", "find", "which"]; /// Runs the test suite with the default configuration. #[test] fn unminimized_default() -> Result<()> { let builder = Regex::builder(); TestRunner::new()? .expand(EXPANSIONS, |t| t.compiles()) .blacklist("expensive") .test_iter(suite()?.iter(), dense_compiler(builder)) .assert(); Ok(()) } /// Runs the test suite with the default configuration and a prefilter enabled, /// if one can be built. #[test] fn unminimized_prefilter() -> Result<()> { let my_compiler = |test: &RegexTest, regexes: &[String]| { // Parse regexes as HIRs so we can get literals to build a prefilter. 
let mut hirs = vec![];
        for pattern in regexes.iter() {
            hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
        }
        let kind = match untestify_kind(test.match_kind()) {
            None => return Ok(CompiledRegex::skip()),
            Some(kind) => kind,
        };
        let pre = Prefilter::from_hirs_prefix(kind, &hirs);
        let mut builder = Regex::builder();
        builder.dense(dense::DFA::config().prefilter(pre));
        compiler(builder, |_, _, re| {
            Ok(CompiledRegex::compiled(move |test| -> TestResult {
                run_test(&re, test)
            }))
        })(test, regexes)
    };
    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), my_compiler)
        .assert();
    Ok(())
}

/// Runs the test suite with start states specialized.
///
/// The dense DFA is configured with `specialize_start_states(true)`; every
/// other setting is whatever `dense_compiler` applies per test.
#[test]
fn unminimized_specialized_start_states() -> Result<()> {
    let mut builder = Regex::builder();
    builder.dense(dense::Config::new().specialize_start_states(true));

    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), dense_compiler(builder))
        .assert();
    Ok(())
}

/// Runs the test suite with byte classes disabled.
///
/// The dense DFA is configured with `byte_classes(false)`.
#[test]
fn unminimized_no_byte_class() -> Result<()> {
    let mut builder = Regex::builder();
    builder.dense(dense::Config::new().byte_classes(false));

    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), dense_compiler(builder))
        .assert();
    Ok(())
}

/// Runs the test suite with NFA shrinking enabled.
///
/// Unlike the tests above, this one changes the Thompson NFA configuration
/// (`shrink(true)`) rather than the dense DFA configuration.
#[test]
fn unminimized_nfa_shrink() -> Result<()> {
    let mut builder = Regex::builder();
    builder.thompson(thompson::Config::new().shrink(true));

    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), dense_compiler(builder))
        .assert();
    Ok(())
}

/// Runs the test suite on a minimized DFA with an otherwise default
/// configuration.
#[test]
fn minimized_default() -> Result<()> {
    let mut builder = Regex::builder();
    builder.dense(dense::Config::new().minimize(true));
    TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), dense_compiler(builder))
        .assert();
    Ok(())
}

/// Runs the test suite on a minimized DFA with byte classes disabled.
///
/// The dense DFA is configured with both `minimize(true)` and
/// `byte_classes(false)`.
#[test]
fn minimized_no_byte_class() -> Result<()> {
    let mut builder = Regex::builder();
    builder.dense(dense::Config::new().minimize(true).byte_classes(false));

    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), dense_compiler(builder))
        .assert();
    Ok(())
}

/// Runs the test suite on a sparse unminimized DFA.
///
/// The builder is left untouched here; `sparse_compiler` converts the built
/// forward and reverse DFAs to sparse DFAs before searching.
#[test]
fn sparse_unminimized_default() -> Result<()> {
    let builder = Regex::builder();
    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), sparse_compiler(builder))
        .assert();
    Ok(())
}

/// Runs the test suite on a sparse unminimized DFA with prefilters enabled.
#[test]
fn sparse_unminimized_prefilter() -> Result<()> {
    let my_compiler = |test: &RegexTest, regexes: &[String]| {
        // Parse regexes as HIRs so we can get literals to build a prefilter.
        let mut hirs = vec![];
        for pattern in regexes.iter() {
            hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
        }
        let kind = match untestify_kind(test.match_kind()) {
            None => return Ok(CompiledRegex::skip()),
            Some(kind) => kind,
        };
        let pre = Prefilter::from_hirs_prefix(kind, &hirs);
        let mut builder = Regex::builder();
        builder.dense(dense::DFA::config().prefilter(pre));
        compiler(builder, |builder, _, re| {
            let fwd = re.forward().to_sparse()?;
            let rev = re.reverse().to_sparse()?;
            let re = builder.build_from_dfas(fwd, rev);
            Ok(CompiledRegex::compiled(move |test| -> TestResult {
                run_test(&re, test)
            }))
        })(test, regexes)
    };
    TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), my_compiler)
        .assert();
    Ok(())
}

/// Another basic sanity test that checks we can serialize and then deserialize
/// a regex, and that the resulting regex can be used for searching correctly.
///
/// The forward and reverse dense DFAs are serialized once per compiled regex
/// (native endian), and then deserialized again inside every test run.
#[test]
fn serialization_unminimized_default() -> Result<()> {
    let builder = Regex::builder();
    let my_compiler = |builder| {
        compiler(builder, |builder, _, re| {
            let builder = builder.clone();
            let (fwd_bytes, _) = re.forward().to_bytes_native_endian();
            let (rev_bytes, _) = re.reverse().to_bytes_native_endian();
            Ok(CompiledRegex::compiled(move |test| -> TestResult {
                let fwd: dense::DFA<&[u32]> =
                    dense::DFA::from_bytes(&fwd_bytes).unwrap().0;
                let rev: dense::DFA<&[u32]> =
                    dense::DFA::from_bytes(&rev_bytes).unwrap().0;
                let re = builder.build_from_dfas(fwd, rev);

                run_test(&re, test)
            }))
        })
    };
    TestRunner::new()?
        .expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), my_compiler(builder))
        .assert();
    Ok(())
}

/// A basic sanity test that checks we can serialize and then deserialize a
/// regex using sparse DFAs, and that the resulting regex can be used for
/// searching correctly.
///
/// Both DFAs are first converted with `to_sparse()` and then serialized
/// (native endian); deserialization happens inside every test run.
#[test]
fn sparse_serialization_unminimized_default() -> Result<()> {
    let builder = Regex::builder();
    let my_compiler = |builder| {
        compiler(builder, |builder, _, re| {
            let builder = builder.clone();
            let fwd_bytes = re.forward().to_sparse()?.to_bytes_native_endian();
            let rev_bytes = re.reverse().to_sparse()?.to_bytes_native_endian();
            Ok(CompiledRegex::compiled(move |test| -> TestResult {
                let fwd: sparse::DFA<&[u8]> =
                    sparse::DFA::from_bytes(&fwd_bytes).unwrap().0;
                let rev: sparse::DFA<&[u8]> =
                    sparse::DFA::from_bytes(&rev_bytes).unwrap().0;
                let re = builder.build_from_dfas(fwd, rev);
                run_test(&re, test)
            }))
        })
    };
    TestRunner::new()?
.expand(EXPANSIONS, |t| t.compiles())
        .blacklist("expensive")
        .test_iter(suite()?.iter(), my_compiler(builder))
        .assert();
    Ok(())
}

/// Returns a suite compiler that runs each test directly against the dense
/// regex built from `builder`.
fn dense_compiler(
    builder: dfa::regex::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
    compiler(builder, |_, _, re| {
        Ok(CompiledRegex::compiled(move |test| -> TestResult {
            run_test(&re, test)
        }))
    })
}

/// Returns a suite compiler that converts the forward and reverse DFAs of the
/// regex built from `builder` into sparse DFAs before running each test.
fn sparse_compiler(
    builder: dfa::regex::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
    compiler(builder, |builder, _, re| {
        let fwd = re.forward().to_sparse()?;
        let rev = re.reverse().to_sparse()?;
        let re = builder.build_from_dfas(fwd, rev);
        Ok(CompiledRegex::compiled(move |test| -> TestResult {
            run_test(&re, test)
        }))
    })
}

/// Returns a suite compiler closure built around `builder`.
///
/// For every test, the returned closure parses the patterns into HIRs and
/// skips the test when any of the following holds:
///
/// * `untestify_kind` does not support the test's match kind,
/// * the haystack is not ASCII and a pattern uses a Unicode word boundary,
/// * `configure_regex_builder` rejects the test.
///
/// Otherwise the regex is built with `build_many` and handed to
/// `create_matcher` together with the (re)configured builder and an optional
/// prefix prefilter extracted from the HIRs.
fn compiler(
    mut builder: dfa::regex::Builder,
    mut create_matcher: impl FnMut(
        &dfa::regex::Builder,
        Option<Prefilter>,
        Regex,
    ) -> Result<CompiledRegex>,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
    move |test, regexes| {
        // Parse regexes as HIRs for some analysis below.
        let mut hirs = vec![];
        for pattern in regexes.iter() {
            hirs.push(syntax::parse_with(pattern, &config_syntax(test))?);
        }

        // Get a prefilter in case the test wants it.
        let kind = match untestify_kind(test.match_kind()) {
            None => return Ok(CompiledRegex::skip()),
            Some(kind) => kind,
        };
        let pre = Prefilter::from_hirs_prefix(kind, &hirs);

        // Check if our regex contains things that aren't supported by DFAs.
        // That is, Unicode word boundaries when searching non-ASCII text.
        if !test.haystack().is_ascii() {
            for hir in hirs.iter() {
                if hir.properties().look_set().contains_word_unicode() {
                    return Ok(CompiledRegex::skip());
                }
            }
        }
        if !configure_regex_builder(test, &mut builder) {
            return Ok(CompiledRegex::skip());
        }
        create_matcher(&builder, pre, builder.build_many(&regexes)?)
}
}

/// Runs a single expanded test against `re`, dispatching on the test's
/// additional name (`is_match`, `find` or `which`).
///
/// Overlapping `find` searches go through `try_search_overlapping`; `which`
/// is only supported for overlapping searches and is skipped otherwise. Any
/// other additional name produces a failing result.
fn run_test<A: Automaton>(re: &Regex<A>, test: &RegexTest) -> TestResult {
    let input = create_input(test);
    match test.additional_name() {
        "is_match" => TestResult::matched(re.is_match(input.earliest(true))),
        "find" => match test.search_kind() {
            SearchKind::Earliest | SearchKind::Leftmost => {
                let input =
                    input.earliest(test.search_kind() == SearchKind::Earliest);
                TestResult::matches(
                    re.find_iter(input)
                        // No limit configured means "take every match".
                        .take(test.match_limit().unwrap_or(usize::MAX))
                        .map(|m| Match {
                            id: m.pattern().as_usize(),
                            span: Span { start: m.start(), end: m.end() },
                        }),
                )
            }
            SearchKind::Overlapping => {
                try_search_overlapping(re, &input).unwrap()
            }
        },
        "which" => match test.search_kind() {
            SearchKind::Earliest | SearchKind::Leftmost => {
                // There are no "which" APIs for standard searches.
                TestResult::skip()
            }
            SearchKind::Overlapping => {
                let dfa = re.forward();
                let mut patset = PatternSet::new(dfa.pattern_len());
                dfa.try_which_overlapping_matches(&input, &mut patset)
                    .unwrap();
                TestResult::which(patset.iter().map(|p| p.as_usize()))
            }
        },
        name => TestResult::fail(&format!("unrecognized test name: {}", name)),
    }
}

/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_regex_builder(
    test: &RegexTest,
    builder: &mut dfa::regex::Builder,
) -> bool {
    let match_kind = match untestify_kind(test.match_kind()) {
        None => return false,
        Some(k) => k,
    };

    let starts = if test.anchored() {
        StartKind::Anchored
    } else {
        StartKind::Unanchored
    };
    let mut dense_config = dense::Config::new()
        .start_kind(starts)
        .match_kind(match_kind)
        .unicode_word_boundary(true);
    // When doing an overlapping search, we might try to find the start of each
    // match with a custom search routine. In that case, we need to tell the
    // reverse search (for the start offset) which pattern to look for.
The // only way that API works is when anchored starting states are compiled // for each pattern. This does technically also enable it for the forward // DFA, but we're okay with that. if test.search_kind() == SearchKind::Overlapping { dense_config = dense_config.starts_for_each_pattern(true); } builder .syntax(config_syntax(test)) .thompson(config_thompson(test)) .dense(dense_config); true } /// Configuration of a Thompson NFA compiler from a regex test. fn config_thompson(test: &RegexTest) -> thompson::Config { let mut lookm = regex_automata::util::look::LookMatcher::new(); lookm.set_line_terminator(test.line_terminator()); thompson::Config::new().utf8(test.utf8()).look_matcher(lookm) } /// Configuration of the regex syntax from a regex test. fn config_syntax(test: &RegexTest) -> syntax::Config { syntax::Config::new() .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) .line_terminator(test.line_terminator()) } /// Execute an overlapping search, and for each match found, also find its /// overlapping starting positions. /// /// N.B. This routine used to be part of the crate API, but 1) it wasn't clear /// to me how useful it was and 2) it wasn't clear to me what its semantics /// should be. In particular, a potentially surprising footgun of this routine /// that it is worst case *quadratic* in the size of the haystack. Namely, it's /// possible to report a match at every position, and for every such position, /// scan all the way to the beginning of the haystack to find the starting /// position. Typical leftmost non-overlapping searches don't suffer from this /// because, well, matches can't overlap. So subsequent searches after a match /// is found don't revisit previously scanned parts of the haystack. /// /// Its semantics can be strange for other reasons too. For example, given /// the regex '.*' and the haystack 'zz', the full set of overlapping matches /// is: [0, 0], [1, 1], [0, 1], [2, 2], [1, 2], [0, 2]. 
The ordering of /// those matches is quite strange, but makes sense when you think about the /// implementation: an end offset is found left-to-right, and then one or more /// starting offsets are found right-to-left. /// /// Nevertheless, we provide this routine in our test suite because it's /// useful to test the low level DFA overlapping search and our test suite /// is written in a way that requires starting offsets. fn try_search_overlapping( re: &Regex, input: &Input<'_>, ) -> Result { let mut matches = vec![]; let mut fwd_state = OverlappingState::start(); let (fwd_dfa, rev_dfa) = (re.forward(), re.reverse()); while let Some(end) = { fwd_dfa.try_search_overlapping_fwd(input, &mut fwd_state)?; fwd_state.get_match() } { let revsearch = input .clone() .range(input.start()..end.offset()) .anchored(Anchored::Pattern(end.pattern())) .earliest(false); let mut rev_state = OverlappingState::start(); while let Some(start) = { rev_dfa.try_search_overlapping_rev(&revsearch, &mut rev_state)?; rev_state.get_match() } { let span = Span { start: start.offset(), end: end.offset() }; let mat = Match { id: end.pattern().as_usize(), span }; matches.push(mat); } } Ok(TestResult::matches(matches)) } regex-automata-0.4.9/tests/fuzz/dense.rs000064400000000000000000000041241046102023000163660ustar 00000000000000// This test was found by a fuzzer input that crafted a way to provide // an invalid serialization of ByteClasses that passed our verification. // Specifically, the verification step in the deserialization of ByteClasses // used an iterator that depends on part of the serialized bytes being correct. // (Specifically, the encoding of the number of classes.) 
#[test] fn invalid_byte_classes() { let data = include_bytes!( "testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9", ); let _ = fuzz_run(data); } #[test] fn invalid_byte_classes_min() { let data = include_bytes!( "testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9", ); let _ = fuzz_run(data); } // This is the code from the fuzz target. Kind of sucks to duplicate it here, // but this is fundamentally how we interpret the date. fn fuzz_run(given_data: &[u8]) -> Option<()> { use regex_automata::dfa::Automaton; if given_data.len() < 2 { return None; } let haystack_len = usize::from(given_data[0]); let haystack = given_data.get(1..1 + haystack_len)?; let given_dfa_bytes = given_data.get(1 + haystack_len..)?; // We help the fuzzer along by adding a preamble to the bytes that should // at least make these first parts valid. The preamble expects a very // specific sequence of bytes, so it makes sense to just force this. let label = "rust-regex-automata-dfa-dense\x00\x00\x00"; assert_eq!(0, label.len() % 4); let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec(); let version_check = 2u32.to_ne_bytes().to_vec(); let mut dfa_bytes: Vec = vec![]; dfa_bytes.extend(label.as_bytes()); dfa_bytes.extend(&endianness_check); dfa_bytes.extend(&version_check); dfa_bytes.extend(given_dfa_bytes); // This is the real test: checking that any input we give to // DFA::from_bytes will never result in a panic. let (dfa, _) = regex_automata::dfa::dense::DFA::from_bytes(&dfa_bytes).ok()?; let _ = dfa.try_search_fwd(®ex_automata::Input::new(haystack)); Some(()) } regex-automata-0.4.9/tests/fuzz/mod.rs000064400000000000000000000000271046102023000160450ustar 00000000000000mod dense; mod sparse; regex-automata-0.4.9/tests/fuzz/sparse.rs000064400000000000000000000132311046102023000165640ustar 00000000000000// This is a regression test for a bug in how special states are handled. 
// The fuzzer found a case where a state returned true for 'is_special_state'
// but *didn't* return true for 'is_dead_state', 'is_quit_state',
// 'is_match_state', 'is_start_state' or 'is_accel_state'. This in turn tripped
// a debug assertion in the core matching loop that requires
// 'is_special_state' being true to imply that one of the other routines
// returns true.
//
// We fixed this by adding some validation to both dense and sparse DFAs that
// checks that this property is true for every state ID in the DFA.
#[test]
fn invalid_special_state() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838",
    );
    // Only the absence of a panic matters; the search result is ignored.
    let _ = fuzz_run(data);
}

// This is an interesting case where a fuzzer generated a DFA with
// a transition to a state ID that decoded as a valid state, but
// where the ID itself did not point to one of the two existing
// states for this particular DFA. This combined with marking this
// transition's state ID as special but without actually making one of the
// 'is_{dead,quit,match,start,accel}_state' predicates return true ended up
// tripping the 'debug_assert(dfa.is_quit_state(sid))' code in the search
// routine.
//
// We fixed this in alloc mode by checking that every transition points to a
// valid state ID. Technically this bug still exists in core-only mode, but
// it's not clear how to fix it. And it's worth pointing out that the search
// routine won't panic in production. It will just provide invalid results. And
// that's acceptable within the contract of DFA::from_bytes.
#[test]
fn transition_to_invalid_but_valid_state() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9",
    );
    // Only the absence of a panic matters; the search result is ignored.
    let _ = fuzz_run(data);
}

// Another one caught by the fuzzer where it generated a DFA that reported a
// start state as a match state. Since matches are always delayed by one byte,
// start states specifically cannot be match states.
// And indeed, the search code relies on this.
#[test]
fn start_state_is_not_match_state() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000",
    );
    // Only the absence of a panic matters; the search result is ignored.
    let _ = fuzz_run(data);
}

// This is a variation on 'transition_to_invalid_but_valid_state', but happens
// to a start state. Namely, the fuzz data here builds a DFA with a start
// state ID that is incorrect but points to a sequence of bytes that satisfies
// state decoding validation. This errant state in turn has a non-zero number
// of transitions, and it's those transitions that point to a state that does
// *not* satisfy state decoding validation. But we never checked those. So the
// fix here was to add validation of the transitions off of the start state.
#[test]
fn start_state_has_valid_transitions() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98",
    );
    // Only the absence of a panic matters; the search result is ignored.
    let _ = fuzz_run(data);
}

// This fuzz input generated a DFA with a state whose ID was in the match state
// ID range, but where the state itself was encoded with zero pattern IDs. We
// added validation code to check this case.
#[test]
fn match_state_inconsistency() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570",
    );
    // Only the absence of a panic matters; the search result is ignored.
    let _ = fuzz_run(data);
}

// This fuzz input generated a DFA with a state whose ID was in the accelerator
// range, but which didn't have any accelerators. This violated an invariant
// that assumes that if 'dfa.is_accel_state(sid)' returns true, then the state
// must have some accelerators.
#[test]
fn invalid_accelerators() {
    let data = include_bytes!(
        "testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2b",
    );
    // Only the absence of a panic matters; the search result is ignored.
    let _ = fuzz_run(data);
}

// This fuzz input generated a DFA with a state whose EOI transition led to
// a quit state, which is generally considered illegal. Why?
Because the EOI // transition is defined over a special sentinel alphabet element and one // cannot configure a DFA to "quit" on that sentinel. #[test] fn eoi_transition_to_quit_state() { let data = include_bytes!( "testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9", ); let _ = fuzz_run(data); } // This is the code from the fuzz target. Kind of sucks to duplicate it here, // but this is fundamentally how we interpret the date. fn fuzz_run(given_data: &[u8]) -> Option<()> { use regex_automata::dfa::Automaton; if given_data.len() < 2 { return None; } let haystack_len = usize::from(given_data[0]); let haystack = given_data.get(1..1 + haystack_len)?; let given_dfa_bytes = given_data.get(1 + haystack_len..)?; // We help the fuzzer along by adding a preamble to the bytes that should // at least make these first parts valid. The preamble expects a very // specific sequence of bytes, so it makes sense to just force this. let label = "rust-regex-automata-dfa-sparse\x00\x00"; assert_eq!(0, label.len() % 4); let endianness_check = 0xFEFFu32.to_ne_bytes().to_vec(); let version_check = 2u32.to_ne_bytes().to_vec(); let mut dfa_bytes: Vec = vec![]; dfa_bytes.extend(label.as_bytes()); dfa_bytes.extend(&endianness_check); dfa_bytes.extend(&version_check); dfa_bytes.extend(given_dfa_bytes); // This is the real test: checking that any input we give to // DFA::from_bytes will never result in a panic. 
let (dfa, _) = regex_automata::dfa::sparse::DFA::from_bytes(&dfa_bytes).ok()?; let _ = dfa.try_search_fwd(®ex_automata::Input::new(haystack)); Some(()) } ././@LongLink00006440000000000000000000000152000000000000007771Lustar regex-automata-0.4.9/tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0208a9regex-automata-0.4.9/tests/fuzz/testdata/deserialize_dense_crash-9486fb7c8a93b12c12a62166b43d31640c0000064400000000000000000000035461046102023000277610ustar 000000000000007|)))]B Y#A@0@  Y 0 A1uuuu@@@@././@LongLink00006440000000000000000000000163000000000000007773Lustar regex-automata-0.4.9/tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b43d31640c0208a9regex-automata-0.4.9/tests/fuzz/testdata/deserialize_dense_minimized-from-9486fb7c8a93b12c12a62166b4000064400000000000000000000035321046102023000305320ustar 000000000000007|)))]B Y#A@0@  Y 0 A1uuuu@@@@././@LongLink00006440000000000000000000000153000000000000007772Lustar regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c72b000regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-0da59c0434eaf35e5a6b470fa9244bb79c000064400000000000000000000016551046102023000304020ustar 00000000000000  x1010~@-h`$5[)k././@LongLink00006440000000000000000000000153000000000000007772Lustar regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf65efa9regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-18cfc246f2ddfc3dfc92b0c7893178c7cf000064400000000000000000000016341046102023000305630ustar 00000000000000h_hhhh11111111111111110%11111111111hhhhhhhhhkhhhhhhhhhyxyyyyyyyyyy;yy=yyyyyyyyyyyyySyyyyyyyyyyyyyyyyyyyyy/yyyyyyy~yyyy|yyyyyyyyyy001@jcc@c 0j@././@LongLink00006440000000000000000000000153000000000000007772Lustar 
regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd234b98regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-61fd8e3003bf9d99f6c1e5a8488727eefd000064400000000000000000000016451046102023000304430ustar 00000000000000 ~1010~@-1h`$0[1k././@LongLink00006440000000000000000000000153000000000000007772Lustar regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421505838regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-a1b839d899ced76d5d7d0f78f9edb7a421000064400000000000000000000014421046102023000305150ustar 00000000000000yyy9@01hhhhhhhhhhhhhhhyyyyyyyyy$|yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy0@]2@1)-+././@LongLink00006440000000000000000000000153000000000000007772Lustar regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011816570regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-c383ae07ec5e191422eadc492117439011000064400000000000000000000016341046102023000300650ustar 000000000000009@10111111111111hhhhhhhhhh$hhhhhhh#yyyyyyyyyyyyyyyyyyyyyyyyyyy@yyyyy`y %yy~yyyyyyyyyyyyyyyy11h?yyyy9././@LongLink00006440000000000000000000000153000000000000007772Lustar regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420449e2bregex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-d07703ceb94b10dcd9e4acb809f2051420000064400000000000000000000016321046102023000303000ustar 000000000000009@g111h_hhhh111111111111111#10111111111111yyyy101[@01[z@,cyyyS@././@LongLink00006440000000000000000000000153000000000000007772Lustar regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd1460eff9regex-automata-0.4.9/tests/fuzz/testdata/deserialize_sparse_crash-dbb8172d3984e7e7d03f4b5f8bb86ecd14000064400000000000000000000013301046102023000304730ustar 
00000000000000yyy9@01hhhhhhhhhhhhhhhyyyyyyyyy$|yyyyyyyy$yyyyyyyyyyyyyyyyyyyyyyyyyyyyy`yyyyyyyyyyyyyyy0@]2@1)!)!regex-automata-0.4.9/tests/gen/README.md000064400000000000000000000045461046102023000157640ustar 00000000000000This directory contains tests for serialized objects from the regex-automata crate. Currently, there are only two supported such objects: dense and sparse DFAs. The idea behind these tests is to commit some serialized objects and run some basic tests by deserializing them and running searches and ensuring they are correct. We also make sure these are run under Miri, since deserialization is one of the biggest places where undefined behavior might occur in this crate (at the time of writing). The main thing we're testing is that the *current* code can still deserialize *old* objects correctly. Generally speaking, compatibility extends to semver compatible releases of this crate. Beyond that, no promises are made, although in practice callers can at least depend on errors occurring. (The serialized format always includes a version number, and incompatible changes increment that version number such that an error will occur if an unsupported version is detected.) To generate the dense DFAs, I used this command: ``` $ regex-cli generate serialize dense regex \ MULTI_PATTERN_V2 \ tests/gen/dense/ \ --rustfmt \ --safe \ --starts-for-each-pattern \ --specialize-start-states \ --start-kind both \ --unicode-word-boundary \ --minimize \ '\b[a-zA-Z]+\b' \ '(?m)^\S+$' \ '(?Rm)^\S+$' ``` And to generate the sparse DFAs, I used this command, which is the same as above, but with `s/dense/sparse/g`. 
``` $ regex-cli generate serialize sparse regex \ MULTI_PATTERN_V2 \ tests/gen/sparse/ \ --rustfmt \ --safe \ --starts-for-each-pattern \ --specialize-start-states \ --start-kind both \ --unicode-word-boundary \ --minimize \ '\b[a-zA-Z]+\b' \ '(?m)^\S+$' \ '(?Rm)^\S+$' ``` The idea is to try to enable as many of the DFA's options as possible in order to test that serialization works for all of them. Arguably we should increase test coverage here, but this is a start. Note that in particular, this does not need to test that serialization and deserialization correctly roundtrips on its own. Indeed, the normal regex test suite has a test that does a serialization round trip for every test supported by DFAs. So that has very good coverage. What we're interested in testing here is our compatibility promise: do DFAs generated with an older revision of the code still deserialize correctly? regex-automata-0.4.9/tests/gen/dense/mod.rs000064400000000000000000000016061046102023000167220ustar 00000000000000use regex_automata::{Input, Match}; mod multi_pattern_v2; #[test] fn multi_pattern_v2() { use multi_pattern_v2::MULTI_PATTERN_V2 as RE; assert_eq!(Some(Match::must(0, 0..4)), RE.find("abcd")); assert_eq!(Some(Match::must(0, 2..6)), RE.find("@ abcd @")); assert_eq!(Some(Match::must(1, 0..6)), RE.find("@abcd@")); assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd\n")); assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd wxyz\n")); assert_eq!(Some(Match::must(1, 1..7)), RE.find("\n@abcd@\n")); assert_eq!(Some(Match::must(2, 0..6)), RE.find("@abcd@\r\n")); assert_eq!(Some(Match::must(1, 2..8)), RE.find("\r\n@abcd@")); assert_eq!(Some(Match::must(2, 2..8)), RE.find("\r\n@abcd@\r\n")); // Fails because we have heuristic support for Unicode word boundaries // enabled. 
assert!(RE.try_search(&Input::new(b"\xFF@abcd@\xFF")).is_err()); } regex-automata-0.4.9/tests/gen/dense/multi_pattern_v2.rs000064400000000000000000000032711046102023000214410ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // regex-cli generate serialize dense regex MULTI_PATTERN_V2 tests/gen/dense/ --rustfmt --safe --starts-for-each-pattern --specialize-start-states --start-kind both --unicode-word-boundary --minimize \b[a-zA-Z]+\b (?m)^\S+$ (?Rm)^\S+$ // // regex-cli 0.0.1 is available on crates.io. use regex_automata::{ dfa::{dense::DFA, regex::Regex}, util::{lazy::Lazy, wire::AlignAs}, }; pub static MULTI_PATTERN_V2: Lazy>> = Lazy::new(|| { let dfafwd = { static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { _align: [], #[cfg(target_endian = "big")] bytes: *include_bytes!("multi_pattern_v2_fwd.bigendian.dfa"), #[cfg(target_endian = "little")] bytes: *include_bytes!( "multi_pattern_v2_fwd.littleendian.dfa" ), }; DFA::from_bytes(&ALIGNED.bytes) .expect("serialized forward DFA should be valid") .0 }; let dfarev = { static ALIGNED: &AlignAs<[u8], u32> = &AlignAs { _align: [], #[cfg(target_endian = "big")] bytes: *include_bytes!("multi_pattern_v2_rev.bigendian.dfa"), #[cfg(target_endian = "little")] bytes: *include_bytes!( "multi_pattern_v2_rev.littleendian.dfa" ), }; DFA::from_bytes(&ALIGNED.bytes) .expect("serialized reverse DFA should be valid") .0 }; Regex::builder().build_from_dfas(dfafwd, dfarev) }); regex-automata-0.4.9/tests/gen/dense/multi_pattern_v2_fwd.bigendian.dfa000064400000000000000000000255341046102023000243340ustar 00000000000000rust-regex-automata-dfa-dense(  
!"""""""""""""""""""""""""""""#$%&'''''''''())*+++,-----------@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  @  @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  @  @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@regex-automata-0.4.9/tests/gen/dense/multi_pattern_v2_fwd.littleendian.dfa000064400000000000000000000255341046102023000250700ustar 00000000000000rust-regex-automata-dfa-dense(  !"""""""""""""""""""""""""""""#$%&'''''''''())*+++,-----------@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  
@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @  @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ @  @ @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@  @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@regex-automata-0.4.9/tests/gen/dense/multi_pattern_v2_rev.bigendian.dfa000064400000000000000000000166401046102023000243460ustar 00000000000000rust-regex-automata-dfa-dense  !"""""""""""""""""""""""""""""#$%&'''''''''())*+++,-----------@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @regex-automata-0.4.9/tests/gen/dense/multi_pattern_v2_rev.littleendian.dfa000064400000000000000000000166401046102023000251020ustar 
00000000000000rust-regex-automata-dfa-dense  !"""""""""""""""""""""""""""""#$%&'''''''''())*+++,-----------@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@    @regex-automata-0.4.9/tests/gen/mod.rs000064400000000000000000000000271046102023000156200ustar 00000000000000mod dense; mod sparse; regex-automata-0.4.9/tests/gen/sparse/mod.rs000064400000000000000000000016061046102023000171210ustar 00000000000000use regex_automata::{Input, Match}; mod multi_pattern_v2; #[test] fn multi_pattern_v2() { use multi_pattern_v2::MULTI_PATTERN_V2 as RE; assert_eq!(Some(Match::must(0, 0..4)), RE.find("abcd")); assert_eq!(Some(Match::must(0, 2..6)), RE.find("@ abcd @")); assert_eq!(Some(Match::must(1, 0..6)), RE.find("@abcd@")); assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd\n")); assert_eq!(Some(Match::must(0, 1..5)), RE.find("\nabcd wxyz\n")); assert_eq!(Some(Match::must(1, 1..7)), RE.find("\n@abcd@\n")); assert_eq!(Some(Match::must(2, 0..6)), RE.find("@abcd@\r\n")); assert_eq!(Some(Match::must(1, 2..8)), RE.find("\r\n@abcd@")); assert_eq!(Some(Match::must(2, 2..8)), RE.find("\r\n@abcd@\r\n")); // Fails because we have heuristic support for Unicode word boundaries // enabled. 
assert!(RE.try_search(&Input::new(b"\xFF@abcd@\xFF")).is_err()); } regex-automata-0.4.9/tests/gen/sparse/multi_pattern_v2.rs000064400000000000000000000030031046102023000216310ustar 00000000000000// DO NOT EDIT THIS FILE. IT WAS AUTOMATICALLY GENERATED BY: // // regex-cli generate serialize sparse regex MULTI_PATTERN_V2 regex-automata/tests/gen/sparse/ --rustfmt --safe --starts-for-each-pattern --specialize-start-states --start-kind both --unicode-word-boundary --minimize \b[a-zA-Z]+\b (?m)^\S+$ (?Rm)^\S+$ // // regex-cli 0.0.1 is available on crates.io. use regex_automata::{ dfa::{regex::Regex, sparse::DFA}, util::lazy::Lazy, }; pub static MULTI_PATTERN_V2: Lazy>> = Lazy::new(|| { let dfafwd = { #[cfg(target_endian = "big")] static BYTES: &'static [u8] = include_bytes!("multi_pattern_v2_fwd.bigendian.dfa"); #[cfg(target_endian = "little")] static BYTES: &'static [u8] = include_bytes!("multi_pattern_v2_fwd.littleendian.dfa"); DFA::from_bytes(BYTES) .expect("serialized forward DFA should be valid") .0 }; let dfarev = { #[cfg(target_endian = "big")] static BYTES: &'static [u8] = include_bytes!("multi_pattern_v2_rev.bigendian.dfa"); #[cfg(target_endian = "little")] static BYTES: &'static [u8] = include_bytes!("multi_pattern_v2_rev.littleendian.dfa"); DFA::from_bytes(BYTES) .expect("serialized reverse DFA should be valid") .0 }; Regex::builder().build_from_dfas(dfafwd, dfarev) }); regex-automata-0.4.9/tests/gen/sparse/multi_pattern_v2_fwd.bigendian.dfa000064400000000000000000000066241046102023000245320ustar 00000000000000rust-regex-automata-dfa-sparse(  !"""""""""""""""""""""""""""""#$%&'''''''''())*+++,----------- - - - -) )-GGG -) ) - -  -++  -GGGGG  -tt - -GGG  -0d Gd G  -0dddd  -0 $  $   -#0######  -p0pppNppNp  -@@@@@ @-GGG   -@@t@@t@ @  -@G@@G@@ @-) )  -@@+@@+@ @ -p)0pppppp ) -p)0pppppp )  -@@N@@N@ @ -W@)@W@WWWWW )-) ) -####F##F#  -#######   -@@@@@ @ -@@@FF  -) $  $  ) -) $ $ $ $ )  -@ $@ @ $@ @ @ -@)@@ $ $ )  -@d@ G@d@ G@ @d0 S        
regex-automata-0.4.9/tests/gen/sparse/multi_pattern_v2_fwd.littleendian.dfa000064400000000000000000000066241046102023000252660ustar 00000000000000rust-regex-automata-dfa-sparse(  !"""""""""""""""""""""""""""""#$%&'''''''''())*+++,----------- - - - -) )-GGG -) ) - -  -++  -GGGGG  -tt - -GGG  -0dG dG   -0dddd  -0$  $    -#0######  -p0pppNppNp  -@@@@@ @-GGG   -@@t@@t@ @  -@G@@G@@ @-) )  -@@+@@+@ @ -p)0pppppp ) -p)0pppppp )  -@@N@@N@ @ -W@)@W@WWWWW )-) ) -####F##F#  -#######   -@@@@@ @ -@@@FF  -)$  $   ) -)$ $ $ $  )  -@$ @ @$ @ @ @ -@)@@$ $ )  -@d@G @d@G @ @d0 S        regex-automata-0.4.9/tests/gen/sparse/multi_pattern_v2_rev.bigendian.dfa000064400000000000000000000036001046102023000245350ustar 00000000000000rust-regex-automata-dfa-sparse  !"""""""""""""""""""""""""""""#$%&'''''''''())*+++,-----------x- - - -w)ww )- -   - -sEss E- -  -:: - -www -    -sssss  -      -wwwww -w)ww ) -@@@   -     -@@:@@:@ @-sEss E -@&@@ss & -W@@W@WwWWwWW `{>`{````{{{{{{> E`>regex-automata-0.4.9/tests/gen/sparse/multi_pattern_v2_rev.littleendian.dfa000064400000000000000000000036001046102023000252710ustar 00000000000000rust-regex-automata-dfa-sparse  !"""""""""""""""""""""""""""""#$%&'''''''''())*+++,-----------x- - - -w)ww )- -    - -sEss E- -  -:: - -www -     -sssss  -       -wwwww -w)ww ) -@@@   -      -@@:@@:@ @-sEss E -@&@@ss & -W@@W@WwWWwWW `{>`{````{{{{{{> E`>regex-automata-0.4.9/tests/hybrid/api.rs000064400000000000000000000137731046102023000163360ustar 00000000000000use std::error::Error; use regex_automata::{ hybrid::dfa::{OverlappingState, DFA}, nfa::thompson, HalfMatch, Input, MatchError, }; // Tests that too many cache resets cause the lazy DFA to quit. // // We only test this on 64-bit because the test is gingerly crafted based on // implementation details of cache sizes. It's not a great test because of // that, but it does check some interesting properties around how positions are // reported when a search "gives up." 
// // NOTE: If you change something in lazy DFA implementation that causes this // test to fail by reporting different "gave up" positions, then it's generally // okay to update the positions in the test below as long as you're sure your // changes are correct. Namely, it is expected that if there are changes in the // cache size (or changes in how big things are inside the cache), then its // utilization may change slightly and thus impact where a search gives up. // Precisely where a search gives up is not an API guarantee, so changing the // offsets here is OK. #[test] #[cfg(target_pointer_width = "64")] #[cfg(not(miri))] fn too_many_cache_resets_cause_quit() -> Result<(), Box> { // This is a carefully chosen regex. The idea is to pick one that requires // some decent number of states (hence the bounded repetition). But we // specifically choose to create a class with an ASCII letter and a // non-ASCII letter so that we can check that no new states are created // once the cache is full. Namely, if we fill up the cache on a haystack // of 'a's, then in order to match one 'β', a new state will need to be // created since a 'β' is encoded with multiple bytes. // // So we proceed by "filling" up the cache by searching a haystack of just // 'a's. The cache won't have enough room to add enough states to find the // match (because of the bounded repetition), which should result in it // giving up before it finds a match. // // Since there's now no more room to create states, we search a haystack // of 'β' and confirm that it gives up immediately. let pattern = r"[aβ]{99}"; let dfa = DFA::builder() .configure( // Configure it so that we have the minimum cache capacity // possible. And that if any resets occur, the search quits. 
DFA::config() .skip_cache_capacity_check(true) .cache_capacity(0) .minimum_cache_clear_count(Some(0)), ) .thompson(thompson::NFA::config()) .build(pattern)?; let mut cache = dfa.create_cache(); let haystack = "a".repeat(101).into_bytes(); let err = MatchError::gave_up(24); // Notice that we make the same amount of progress in each search! That's // because the cache is reused and already has states to handle the first // N bytes. assert_eq!( Err(err.clone()), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) ); assert_eq!( Err(err.clone()), dfa.try_search_overlapping_fwd( &mut cache, &Input::new(&haystack), &mut OverlappingState::start() ), ); let haystack = "β".repeat(101).into_bytes(); let err = MatchError::gave_up(2); assert_eq!( Err(err), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) ); // no need to test that other find routines quit, since we did that above // OK, if we reset the cache, then we should be able to create more states // and make more progress with searching for betas. cache.reset(&dfa); let err = MatchError::gave_up(26); assert_eq!( Err(err), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) ); // ... switching back to ASCII still makes progress since it just needs to // set transitions on existing states! let haystack = "a".repeat(101).into_bytes(); let err = MatchError::gave_up(13); assert_eq!( Err(err), dfa.try_search_fwd(&mut cache, &Input::new(&haystack)) ); Ok(()) } // Tests that quit bytes in the forward direction work correctly. #[test] fn quit_fwd() -> Result<(), Box> { let dfa = DFA::builder() .configure(DFA::config().quit(b'x', true)) .build("[[:word:]]+$")?; let mut cache = dfa.create_cache(); assert_eq!( dfa.try_search_fwd(&mut cache, &Input::new("abcxyz")), Err(MatchError::quit(b'x', 3)), ); assert_eq!( dfa.try_search_overlapping_fwd( &mut cache, &Input::new(b"abcxyz"), &mut OverlappingState::start() ), Err(MatchError::quit(b'x', 3)), ); Ok(()) } // Tests that quit bytes in the reverse direction work correctly. 
#[test] fn quit_rev() -> Result<(), Box> { let dfa = DFA::builder() .configure(DFA::config().quit(b'x', true)) .thompson(thompson::Config::new().reverse(true)) .build("^[[:word:]]+")?; let mut cache = dfa.create_cache(); assert_eq!( dfa.try_search_rev(&mut cache, &Input::new("abcxyz")), Err(MatchError::quit(b'x', 3)), ); Ok(()) } // Tests that if we heuristically enable Unicode word boundaries but then // instruct that a non-ASCII byte should NOT be a quit byte, then the builder // will panic. #[test] #[should_panic] fn quit_panics() { DFA::config().unicode_word_boundary(true).quit(b'\xFF', false); } // This tests an intesting case where even if the Unicode word boundary option // is disabled, setting all non-ASCII bytes to be quit bytes will cause Unicode // word boundaries to be enabled. #[test] fn unicode_word_implicitly_works() -> Result<(), Box> { let mut config = DFA::config(); for b in 0x80..=0xFF { config = config.quit(b, true); } let dfa = DFA::builder().configure(config).build(r"\b")?; let mut cache = dfa.create_cache(); let expected = HalfMatch::must(0, 1); assert_eq!( Ok(Some(expected)), dfa.try_search_fwd(&mut cache, &Input::new(" a")), ); Ok(()) } regex-automata-0.4.9/tests/hybrid/mod.rs000064400000000000000000000000461046102023000163310ustar 00000000000000mod api; #[cfg(not(miri))] mod suite; regex-automata-0.4.9/tests/hybrid/suite.rs000064400000000000000000000316671046102023000167200ustar 00000000000000use { anyhow::Result, regex_automata::{ hybrid::{ dfa::{OverlappingState, DFA}, regex::{self, Regex}, }, nfa::thompson, util::{prefilter::Prefilter, syntax}, Anchored, Input, PatternSet, }, regex_test::{ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner, }, }; use crate::{create_input, suite, untestify_kind}; const EXPANSIONS: &[&str] = &["is_match", "find", "which"]; /// Tests the default configuration of the hybrid NFA/DFA. #[test] fn default() -> Result<()> { let builder = Regex::builder(); TestRunner::new()? 
.expand(EXPANSIONS, |t| t.compiles()) // Without NFA shrinking, this test blows the default cache capacity. .blacklist("expensive/regression-many-repeat-no-stack-overflow") .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } /// Tests the hybrid NFA/DFA with prefilters enabled. #[test] fn prefilter() -> Result<()> { let my_compiler = |test: &RegexTest, regexes: &[String]| { // Parse regexes as HIRs so we can get literals to build a prefilter. let mut hirs = vec![]; for pattern in regexes.iter() { hirs.push(syntax::parse_with(pattern, &config_syntax(test))?); } let kind = match untestify_kind(test.match_kind()) { None => return Ok(CompiledRegex::skip()), Some(kind) => kind, }; let pre = Prefilter::from_hirs_prefix(kind, &hirs); let mut builder = Regex::builder(); builder.dfa(DFA::config().prefilter(pre)); compiler(builder)(test, regexes) }; TestRunner::new()? .expand(EXPANSIONS, |t| t.compiles()) // Without NFA shrinking, this test blows the default cache capacity. .blacklist("expensive/regression-many-repeat-no-stack-overflow") .test_iter(suite()?.iter(), my_compiler) .assert(); Ok(()) } /// Tests the hybrid NFA/DFA with NFA shrinking enabled. /// /// This is *usually* not the configuration one wants for a lazy DFA. NFA /// shrinking is mostly only advantageous when building a full DFA since it /// can sharply decrease the amount of time determinization takes. But NFA /// shrinking is itself otherwise fairly expensive currently. Since a lazy DFA /// has no compilation time (other than for building the NFA of course) before /// executing a search, it's usually worth it to forgo NFA shrinking. /// /// Nevertheless, we test to make sure everything is OK with NFA shrinking. As /// a bonus, there are some tests we don't need to skip because they now fit in /// the default cache capacity. #[test] fn nfa_shrink() -> Result<()> { let mut builder = Regex::builder(); builder.thompson(thompson::Config::new().shrink(true)); TestRunner::new()? 
.expand(EXPANSIONS, |t| t.compiles()) .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } /// Tests the hybrid NFA/DFA when 'starts_for_each_pattern' is enabled for all /// tests. #[test] fn starts_for_each_pattern() -> Result<()> { let mut builder = Regex::builder(); builder.dfa(DFA::config().starts_for_each_pattern(true)); TestRunner::new()? .expand(EXPANSIONS, |t| t.compiles()) // Without NFA shrinking, this test blows the default cache capacity. .blacklist("expensive/regression-many-repeat-no-stack-overflow") .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } /// Tests the hybrid NFA/DFA when 'specialize_start_states' is enabled. #[test] fn specialize_start_states() -> Result<()> { let mut builder = Regex::builder(); builder.dfa(DFA::config().specialize_start_states(true)); TestRunner::new()? .expand(EXPANSIONS, |t| t.compiles()) // Without NFA shrinking, this test blows the default cache capacity. .blacklist("expensive/regression-many-repeat-no-stack-overflow") .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } /// Tests the hybrid NFA/DFA when byte classes are disabled. /// /// N.B. Disabling byte classes doesn't avoid any indirection at search time. /// All it does is cause every byte value to be its own distinct equivalence /// class. #[test] fn no_byte_classes() -> Result<()> { let mut builder = Regex::builder(); builder.dfa(DFA::config().byte_classes(false)); TestRunner::new()? .expand(EXPANSIONS, |t| t.compiles()) // Without NFA shrinking, this test blows the default cache capacity. .blacklist("expensive/regression-many-repeat-no-stack-overflow") .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } /// Tests that hybrid NFA/DFA never clears its cache for any test with the /// default capacity. /// /// N.B. If a regex suite test is added that causes the cache to be cleared, /// then this should just skip that test. (Which can be done by calling the /// 'blacklist' method on 'TestRunner'.) 
#[test] fn no_cache_clearing() -> Result<()> { let mut builder = Regex::builder(); builder.dfa(DFA::config().minimum_cache_clear_count(Some(0))); TestRunner::new()? .expand(EXPANSIONS, |t| t.compiles()) // Without NFA shrinking, this test blows the default cache capacity. .blacklist("expensive/regression-many-repeat-no-stack-overflow") .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } /// Tests the hybrid NFA/DFA when the minimum cache capacity is set. #[test] fn min_cache_capacity() -> Result<()> { let mut builder = Regex::builder(); builder .dfa(DFA::config().cache_capacity(0).skip_cache_capacity_check(true)); TestRunner::new()? .expand(EXPANSIONS, |t| t.compiles()) .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } fn compiler( mut builder: regex::Builder, ) -> impl FnMut(&RegexTest, &[String]) -> Result { move |test, regexes| { // Parse regexes as HIRs for some analysis below. let mut hirs = vec![]; for pattern in regexes.iter() { hirs.push(syntax::parse_with(pattern, &config_syntax(test))?); } // Check if our regex contains things that aren't supported by DFAs. // That is, Unicode word boundaries when searching non-ASCII text. 
if !test.haystack().is_ascii() { for hir in hirs.iter() { if hir.properties().look_set().contains_word_unicode() { return Ok(CompiledRegex::skip()); } } } if !configure_regex_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } let re = builder.build_many(®exes)?; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) })) } } fn run_test( re: &Regex, cache: &mut regex::Cache, test: &RegexTest, ) -> TestResult { let input = create_input(test); match test.additional_name() { "is_match" => { TestResult::matched(re.is_match(cache, input.earliest(true))) } "find" => match test.search_kind() { SearchKind::Earliest | SearchKind::Leftmost => { let input = input.earliest(test.search_kind() == SearchKind::Earliest); TestResult::matches( re.find_iter(cache, input) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), span: Span { start: m.start(), end: m.end() }, }), ) } SearchKind::Overlapping => { try_search_overlapping(re, cache, &input).unwrap() } }, "which" => match test.search_kind() { SearchKind::Earliest | SearchKind::Leftmost => { // There are no "which" APIs for standard searches. TestResult::skip() } SearchKind::Overlapping => { let dfa = re.forward(); let cache = cache.as_parts_mut().0; let mut patset = PatternSet::new(dfa.pattern_len()); dfa.try_which_overlapping_matches(cache, &input, &mut patset) .unwrap(); TestResult::which(patset.iter().map(|p| p.as_usize())) } }, name => TestResult::fail(&format!("unrecognized test name: {}", name)), } } /// Configures the given regex builder with all relevant settings on the given /// regex test. /// /// If the regex test has a setting that is unsupported, then this returns /// false (implying the test should be skipped). 
fn configure_regex_builder( test: &RegexTest, builder: &mut regex::Builder, ) -> bool { let match_kind = match untestify_kind(test.match_kind()) { None => return false, Some(k) => k, }; let mut dfa_config = DFA::config().match_kind(match_kind).unicode_word_boundary(true); // When doing an overlapping search, we might try to find the start of each // match with a custom search routine. In that case, we need to tell the // reverse search (for the start offset) which pattern to look for. The // only way that API works is when anchored starting states are compiled // for each pattern. This does technically also enable it for the forward // DFA, but we're okay with that. if test.search_kind() == SearchKind::Overlapping { dfa_config = dfa_config.starts_for_each_pattern(true); } builder .syntax(config_syntax(test)) .thompson(config_thompson(test)) .dfa(dfa_config); true } /// Configuration of a Thompson NFA compiler from a regex test. fn config_thompson(test: &RegexTest) -> thompson::Config { let mut lookm = regex_automata::util::look::LookMatcher::new(); lookm.set_line_terminator(test.line_terminator()); thompson::Config::new().utf8(test.utf8()).look_matcher(lookm) } /// Configuration of the regex parser from a regex test. fn config_syntax(test: &RegexTest) -> syntax::Config { syntax::Config::new() .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) .line_terminator(test.line_terminator()) } /// Execute an overlapping search, and for each match found, also find its /// overlapping starting positions. /// /// N.B. This routine used to be part of the crate API, but 1) it wasn't clear /// to me how useful it was and 2) it wasn't clear to me what its semantics /// should be. In particular, a potentially surprising footgun of this routine /// that it is worst case *quadratic* in the size of the haystack. 
Namely, it's /// possible to report a match at every position, and for every such position, /// scan all the way to the beginning of the haystack to find the starting /// position. Typical leftmost non-overlapping searches don't suffer from this /// because, well, matches can't overlap. So subsequent searches after a match /// is found don't revisit previously scanned parts of the haystack. /// /// Its semantics can be strange for other reasons too. For example, given /// the regex '.*' and the haystack 'zz', the full set of overlapping matches /// is: [0, 0], [1, 1], [0, 1], [2, 2], [1, 2], [0, 2]. The ordering of /// those matches is quite strange, but makes sense when you think about the /// implementation: an end offset is found left-to-right, and then one or more /// starting offsets are found right-to-left. /// /// Nevertheless, we provide this routine in our test suite because it's /// useful to test the low level DFA overlapping search and our test suite /// is written in a way that requires starting offsets. 
fn try_search_overlapping( re: &Regex, cache: &mut regex::Cache, input: &Input<'_>, ) -> Result { let mut matches = vec![]; let mut fwd_state = OverlappingState::start(); let (fwd_dfa, rev_dfa) = (re.forward(), re.reverse()); let (fwd_cache, rev_cache) = cache.as_parts_mut(); while let Some(end) = { fwd_dfa.try_search_overlapping_fwd( fwd_cache, input, &mut fwd_state, )?; fwd_state.get_match() } { let revsearch = input .clone() .range(input.start()..end.offset()) .anchored(Anchored::Pattern(end.pattern())) .earliest(false); let mut rev_state = OverlappingState::start(); while let Some(start) = { rev_dfa.try_search_overlapping_rev( rev_cache, &revsearch, &mut rev_state, )?; rev_state.get_match() } { let span = Span { start: start.offset(), end: end.offset() }; let mat = Match { id: end.pattern().as_usize(), span }; matches.push(mat); } } Ok(TestResult::matches(matches)) } regex-automata-0.4.9/tests/lib.rs000064400000000000000000000070061046102023000150420ustar 00000000000000// We have a similar config in the regex-automata crate root. Basically, it is // just too annoying to deal with dead code when a subset of features is // enabled. #![cfg_attr( not(all( feature = "std", feature = "nfa", feature = "dfa", feature = "hybrid", feature = "perf-literal-substring", feature = "perf-literal-multisubstring", )), allow(dead_code, unused_imports, unused_variables) )] // Similar deal with Miri. Just let dead code warnings be. #![cfg_attr(miri, allow(dead_code, unused_imports, unused_variables))] #[cfg(any(feature = "dfa-search", feature = "dfa-onepass"))] mod dfa; #[cfg(feature = "dfa-search")] mod fuzz; #[cfg(feature = "dfa-search")] mod gen; #[cfg(feature = "hybrid")] mod hybrid; #[cfg(feature = "meta")] mod meta; #[cfg(any(feature = "nfa-backtrack", feature = "nfa-pikevm"))] mod nfa; fn suite() -> anyhow::Result { let _ = env_logger::try_init(); let mut tests = regex_test::RegexTests::new(); macro_rules! 
load { ($name:expr) => {{ const DATA: &[u8] = include_bytes!(concat!("../../testdata/", $name, ".toml")); tests.load_slice($name, DATA)?; }}; } load!("anchored"); load!("bytes"); load!("crazy"); load!("crlf"); load!("earliest"); load!("empty"); load!("expensive"); load!("flags"); load!("iter"); load!("leftmost-all"); load!("line-terminator"); load!("misc"); load!("multiline"); load!("no-unicode"); load!("overlapping"); load!("regression"); load!("set"); load!("substring"); load!("unicode"); load!("utf8"); load!("word-boundary"); load!("word-boundary-special"); load!("fowler/basic"); load!("fowler/nullsubexpr"); load!("fowler/repetition"); Ok(tests) } /// Configure a regex_automata::Input with the given test configuration. fn create_input<'h>( test: &'h regex_test::RegexTest, ) -> regex_automata::Input<'h> { use regex_automata::Anchored; let bounds = test.bounds(); let anchored = if test.anchored() { Anchored::Yes } else { Anchored::No }; regex_automata::Input::new(test.haystack()) .range(bounds.start..bounds.end) .anchored(anchored) } /// Convert capture matches into the test suite's capture values. /// /// The given captures must represent a valid match, where the first capturing /// group has a non-None span. Otherwise this panics. fn testify_captures( caps: ®ex_automata::util::captures::Captures, ) -> regex_test::Captures { assert!(caps.is_match(), "expected captures to represent a match"); let spans = caps.iter().map(|group| { group.map(|m| regex_test::Span { start: m.start, end: m.end }) }); // These unwraps are OK because we assume our 'caps' represents a match, // and a match always gives a non-zero number of groups with the first // group being non-None. regex_test::Captures::new(caps.pattern().unwrap().as_usize(), spans) .unwrap() } /// Convert a test harness match kind to a regex-automata match kind. If /// regex-automata doesn't support the harness kind, then `None` is returned. 
fn untestify_kind( kind: regex_test::MatchKind, ) -> Option { match kind { regex_test::MatchKind::All => Some(regex_automata::MatchKind::All), regex_test::MatchKind::LeftmostFirst => { Some(regex_automata::MatchKind::LeftmostFirst) } regex_test::MatchKind::LeftmostLongest => None, } } regex-automata-0.4.9/tests/meta/mod.rs000064400000000000000000000000351046102023000157740ustar 00000000000000#[cfg(not(miri))] mod suite; regex-automata-0.4.9/tests/meta/suite.rs000064400000000000000000000157541046102023000163640ustar 00000000000000use { anyhow::Result, regex_automata::{ meta::{self, Regex}, util::syntax, MatchKind, PatternSet, }, regex_test::{ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner, }, }; use crate::{create_input, suite, testify_captures}; const BLACKLIST: &[&str] = &[ // These 'earliest' tests are blacklisted because the meta searcher doesn't // give the same offsets that the test expects. This is legal because the // 'earliest' routines don't guarantee a particular match offset other // than "the earliest the regex engine can report a match." Some regex // engines will quit earlier than others. The backtracker, for example, // can't really quit before finding the full leftmost-first match. Many of // the literal searchers also don't have the ability to quit fully or it's // otherwise not worth doing. (A literal searcher not quitting as early as // possible usually means looking at a few more bytes. That's no biggie.) "earliest/", ]; /// Tests the default configuration of the meta regex engine. #[test] fn default() -> Result<()> { let builder = Regex::builder(); let mut runner = TestRunner::new()?; runner .expand(&["is_match", "find", "captures"], |test| test.compiles()) .blacklist_iter(BLACKLIST) .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } /// Tests the default configuration minus the full DFA. 
#[test]
fn no_dfa() -> Result<()> {
    // Meta engine with the fully compiled DFA switched off.
    let config = Regex::config().dfa(false);
    let mut builder = Regex::builder();
    builder.configure(config);

    let mut runner = TestRunner::new()?;
    runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
    runner.blacklist_iter(BLACKLIST);
    runner.test_iter(suite()?.iter(), compiler(builder)).assert();
    Ok(())
}

/// Runs the suite against the meta engine with both the full DFA and the
/// lazy DFA disabled.
#[test]
fn no_dfa_hybrid() -> Result<()> {
    let config = Regex::config().dfa(false).hybrid(false);
    let mut builder = Regex::builder();
    builder.configure(config);

    let mut runner = TestRunner::new()?;
    runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
    runner.blacklist_iter(BLACKLIST);
    runner.test_iter(suite()?.iter(), compiler(builder)).assert();
    Ok(())
}

/// Runs the suite against the meta engine with the full DFA, the lazy DFA
/// and the one-pass DFA all disabled.
#[test]
fn no_dfa_hybrid_onepass() -> Result<()> {
    let config = Regex::config().dfa(false).hybrid(false).onepass(false);
    let mut builder = Regex::builder();
    builder.configure(config);

    let mut runner = TestRunner::new()?;
    runner.expand(&["is_match", "find", "captures"], |test| test.compiles());
    runner.blacklist_iter(BLACKLIST);
    runner.test_iter(suite()?.iter(), compiler(builder)).assert();
    Ok(())
}

/// Tests the default configuration minus the full DFA, lazy DFA, one-pass
/// DFA and backtracker.
#[test] fn no_dfa_hybrid_onepass_backtrack() -> Result<()> { let mut builder = Regex::builder(); builder.configure( Regex::config() .dfa(false) .hybrid(false) .onepass(false) .backtrack(false), ); let mut runner = TestRunner::new()?; runner .expand(&["is_match", "find", "captures"], |test| test.compiles()) .blacklist_iter(BLACKLIST) .test_iter(suite()?.iter(), compiler(builder)) .assert(); Ok(()) } fn compiler( mut builder: meta::Builder, ) -> impl FnMut(&RegexTest, &[String]) -> Result { move |test, regexes| { if !configure_meta_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } let re = builder.build_many(®exes)?; Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, test) })) } } fn run_test(re: &Regex, test: &RegexTest) -> TestResult { let input = create_input(test); match test.additional_name() { "is_match" => TestResult::matched(re.is_match(input)), "find" => match test.search_kind() { SearchKind::Earliest => TestResult::matches( re.find_iter(input.earliest(true)) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), span: Span { start: m.start(), end: m.end() }, }), ), SearchKind::Leftmost => TestResult::matches( re.find_iter(input) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), span: Span { start: m.start(), end: m.end() }, }), ), SearchKind::Overlapping => { let mut patset = PatternSet::new(re.pattern_len()); re.which_overlapping_matches(&input, &mut patset); TestResult::which(patset.iter().map(|p| p.as_usize())) } }, "captures" => match test.search_kind() { SearchKind::Earliest => { let it = re .captures_iter(input.earliest(true)) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|caps| testify_captures(&caps)); TestResult::captures(it) } SearchKind::Leftmost => { let it = re .captures_iter(input) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|caps| testify_captures(&caps)); TestResult::captures(it) } 
SearchKind::Overlapping => { // There is no overlapping regex API that supports captures. TestResult::skip() } }, name => TestResult::fail(&format!("unrecognized test name: {}", name)), } } /// Configures the given regex builder with all relevant settings on the given /// regex test. /// /// If the regex test has a setting that is unsupported, then this returns /// false (implying the test should be skipped). fn configure_meta_builder( test: &RegexTest, builder: &mut meta::Builder, ) -> bool { let match_kind = match test.match_kind() { regex_test::MatchKind::All => MatchKind::All, regex_test::MatchKind::LeftmostFirst => MatchKind::LeftmostFirst, regex_test::MatchKind::LeftmostLongest => return false, }; let meta_config = Regex::config() .match_kind(match_kind) .utf8_empty(test.utf8()) .line_terminator(test.line_terminator()); builder.configure(meta_config).syntax(config_syntax(test)); true } /// Configuration of the regex parser from a regex test. fn config_syntax(test: &RegexTest) -> syntax::Config { syntax::Config::new() .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) .line_terminator(test.line_terminator()) } regex-automata-0.4.9/tests/nfa/mod.rs000064400000000000000000000000161046102023000156110ustar 00000000000000mod thompson; regex-automata-0.4.9/tests/nfa/thompson/backtrack/mod.rs000064400000000000000000000000351046102023000214060ustar 00000000000000#[cfg(not(miri))] mod suite; regex-automata-0.4.9/tests/nfa/thompson/backtrack/suite.rs000064400000000000000000000203731046102023000217670ustar 00000000000000use { anyhow::Result, regex_automata::{ nfa::thompson::{ self, backtrack::{self, BoundedBacktracker}, NFA, }, util::{prefilter::Prefilter, syntax}, Input, }, regex_test::{ CompiledRegex, Match, MatchKind, RegexTest, SearchKind, Span, TestResult, TestRunner, }, }; use crate::{create_input, suite, testify_captures}; /// Tests the default configuration of the bounded backtracker. 
#[test] fn default() -> Result<()> { let builder = BoundedBacktracker::builder(); let mut runner = TestRunner::new()?; runner.expand(&["is_match", "find", "captures"], |test| test.compiles()); // At the time of writing, every regex search in the test suite fits // into the backtracker's default visited capacity (except for the // blacklisted tests below). If regexes are added that blow that capacity, // then they should be blacklisted here. A tempting alternative is to // automatically skip them by checking the haystack length against // BoundedBacktracker::max_haystack_len, but that could wind up hiding // interesting failure modes. e.g., If the visited capacity is somehow // wrong or smaller than it should be. runner.blacklist("expensive/backtrack-blow-visited-capacity"); runner.test_iter(suite()?.iter(), compiler(builder)).assert(); Ok(()) } /// Tests the backtracker with prefilters enabled. #[test] fn prefilter() -> Result<()> { let my_compiler = |test: &RegexTest, regexes: &[String]| { // Parse regexes as HIRs so we can get literals to build a prefilter. let mut hirs = vec![]; for pattern in regexes.iter() { hirs.push(syntax::parse_with(pattern, &config_syntax(test))?); } // We can always select leftmost-first here because the backtracker // only supports leftmost-first matching. let pre = Prefilter::from_hirs_prefix( regex_automata::MatchKind::LeftmostFirst, &hirs, ); let mut builder = BoundedBacktracker::builder(); builder.configure(BoundedBacktracker::config().prefilter(pre)); compiler(builder)(test, regexes) }; let mut runner = TestRunner::new()?; runner.expand(&["is_match", "find", "captures"], |test| test.compiles()); runner.blacklist("expensive/backtrack-blow-visited-capacity"); runner.test_iter(suite()?.iter(), my_compiler).assert(); Ok(()) } /// Tests the bounded backtracker when its visited capacity is set to its /// minimum amount. 
#[test] fn min_visited_capacity() -> Result<()> { let mut runner = TestRunner::new()?; runner.expand(&["is_match", "find", "captures"], |test| test.compiles()); runner .test_iter(suite()?.iter(), move |test, regexes| { let nfa = NFA::compiler() .configure(config_thompson(test)) .syntax(config_syntax(test)) .build_many(®exes)?; let mut builder = BoundedBacktracker::builder(); if !configure_backtrack_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } // Setup the bounded backtracker so that its visited capacity is // the absolute minimum required for the test's haystack. builder.configure(BoundedBacktracker::config().visited_capacity( backtrack::min_visited_capacity( &nfa, &Input::new(test.haystack()), ), )); let re = builder.build_from_nfa(nfa)?; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) })) }) .assert(); Ok(()) } fn compiler( mut builder: backtrack::Builder, ) -> impl FnMut(&RegexTest, &[String]) -> Result { move |test, regexes| { if !configure_backtrack_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } let re = builder.build_many(®exes)?; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) })) } } fn run_test( re: &BoundedBacktracker, cache: &mut backtrack::Cache, test: &RegexTest, ) -> TestResult { let input = create_input(test); match test.additional_name() { "is_match" => match test.search_kind() { SearchKind::Earliest | SearchKind::Overlapping => { TestResult::skip() } SearchKind::Leftmost => { let input = input.earliest(true); TestResult::matched(re.try_is_match(cache, input).unwrap()) } }, "find" => match test.search_kind() { SearchKind::Earliest | SearchKind::Overlapping => { TestResult::skip() } SearchKind::Leftmost => TestResult::matches( re.try_find_iter(cache, input) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|result| result.unwrap()) .map(|m| Match { id: 
m.pattern().as_usize(), span: Span { start: m.start(), end: m.end() }, }), ), }, "captures" => match test.search_kind() { SearchKind::Earliest | SearchKind::Overlapping => { TestResult::skip() } SearchKind::Leftmost => TestResult::captures( re.try_captures_iter(cache, input) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|result| result.unwrap()) .map(|caps| testify_captures(&caps)), ), }, name => TestResult::fail(&format!("unrecognized test name: {}", name)), } } /// Configures the given regex builder with all relevant settings on the given /// regex test. /// /// If the regex test has a setting that is unsupported, then this returns /// false (implying the test should be skipped). fn configure_backtrack_builder( test: &RegexTest, builder: &mut backtrack::Builder, ) -> bool { match (test.search_kind(), test.match_kind()) { // For testing the standard search APIs. This is the only supported // configuration for the backtracker. (SearchKind::Leftmost, MatchKind::LeftmostFirst) => {} // Overlapping APIs not supported at all for backtracker. (SearchKind::Overlapping, _) => return false, // Backtracking doesn't really support the notion of 'earliest'. // Namely, backtracking already works by returning as soon as it knows // it has found a match. It just so happens that this corresponds to // the standard 'leftmost' formulation. // // The 'earliest' definition in this crate does indeed permit this // behavior, so this is "fine," but our test suite specifically looks // for the earliest position at which a match is known, which our // finite automata based regex engines have no problem providing. So // for backtracking, we just skip these tests. (SearchKind::Earliest, _) => return false, // For backtracking, 'all' semantics don't really make sense. (_, MatchKind::All) => return false, // Not supported at all in regex-automata. 
(_, MatchKind::LeftmostLongest) => return false, }; let backtrack_config = BoundedBacktracker::config(); builder .configure(backtrack_config) .syntax(config_syntax(test)) .thompson(config_thompson(test)); true } /// Configuration of a Thompson NFA compiler from a regex test. fn config_thompson(test: &RegexTest) -> thompson::Config { let mut lookm = regex_automata::util::look::LookMatcher::new(); lookm.set_line_terminator(test.line_terminator()); thompson::Config::new().utf8(test.utf8()).look_matcher(lookm) } /// Configuration of the regex parser from a regex test. fn config_syntax(test: &RegexTest) -> syntax::Config { syntax::Config::new() .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) .line_terminator(test.line_terminator()) } regex-automata-0.4.9/tests/nfa/thompson/mod.rs000064400000000000000000000001341046102023000174610ustar 00000000000000#[cfg(feature = "nfa-backtrack")] mod backtrack; #[cfg(feature = "nfa-pikevm")] mod pikevm; regex-automata-0.4.9/tests/nfa/thompson/pikevm/mod.rs000064400000000000000000000000351046102023000207540ustar 00000000000000#[cfg(not(miri))] mod suite; regex-automata-0.4.9/tests/nfa/thompson/pikevm/suite.rs000064400000000000000000000132271046102023000213350ustar 00000000000000use { anyhow::Result, regex_automata::{ nfa::thompson::{ self, pikevm::{self, PikeVM}, }, util::{prefilter::Prefilter, syntax}, PatternSet, }, regex_test::{ CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner, }, }; use crate::{create_input, suite, testify_captures, untestify_kind}; /// Tests the default configuration of the hybrid NFA/DFA. #[test] fn default() -> Result<()> { let builder = PikeVM::builder(); let mut runner = TestRunner::new()?; runner.expand(&["is_match", "find", "captures"], |test| test.compiles()); runner.test_iter(suite()?.iter(), compiler(builder)).assert(); Ok(()) } /// Tests the PikeVM with prefilters enabled. 
#[test] fn prefilter() -> Result<()> { let my_compiler = |test: &RegexTest, regexes: &[String]| { // Parse regexes as HIRs so we can get literals to build a prefilter. let mut hirs = vec![]; for pattern in regexes.iter() { hirs.push(syntax::parse_with(pattern, &config_syntax(test))?); } let kind = match untestify_kind(test.match_kind()) { None => return Ok(CompiledRegex::skip()), Some(kind) => kind, }; let pre = Prefilter::from_hirs_prefix(kind, &hirs); let mut builder = PikeVM::builder(); builder.configure(PikeVM::config().prefilter(pre)); compiler(builder)(test, regexes) }; let mut runner = TestRunner::new()?; runner.expand(&["is_match", "find", "captures"], |test| test.compiles()); runner.test_iter(suite()?.iter(), my_compiler).assert(); Ok(()) } fn compiler( mut builder: pikevm::Builder, ) -> impl FnMut(&RegexTest, &[String]) -> Result { move |test, regexes| { if !configure_pikevm_builder(test, &mut builder) { return Ok(CompiledRegex::skip()); } let re = builder.build_many(®exes)?; let mut cache = re.create_cache(); Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, &mut cache, test) })) } } fn run_test( re: &PikeVM, cache: &mut pikevm::Cache, test: &RegexTest, ) -> TestResult { let input = create_input(test); match test.additional_name() { "is_match" => TestResult::matched(re.is_match(cache, input)), "find" => match test.search_kind() { SearchKind::Earliest => { let it = re .find_iter(cache, input.earliest(true)) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), span: Span { start: m.start(), end: m.end() }, }); TestResult::matches(it) } SearchKind::Leftmost => { let it = re .find_iter(cache, input) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|m| Match { id: m.pattern().as_usize(), span: Span { start: m.start(), end: m.end() }, }); TestResult::matches(it) } SearchKind::Overlapping => { let mut patset = PatternSet::new(re.get_nfa().pattern_len()); re.which_overlapping_matches(cache, 
&input, &mut patset); TestResult::which(patset.iter().map(|p| p.as_usize())) } }, "captures" => match test.search_kind() { SearchKind::Earliest => { let it = re .captures_iter(cache, input.earliest(true)) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|caps| testify_captures(&caps)); TestResult::captures(it) } SearchKind::Leftmost => { let it = re .captures_iter(cache, input) .take(test.match_limit().unwrap_or(std::usize::MAX)) .map(|caps| testify_captures(&caps)); TestResult::captures(it) } SearchKind::Overlapping => { // There is no overlapping PikeVM API that supports captures. TestResult::skip() } }, name => TestResult::fail(&format!("unrecognized test name: {}", name)), } } /// Configures the given regex builder with all relevant settings on the given /// regex test. /// /// If the regex test has a setting that is unsupported, then this returns /// false (implying the test should be skipped). fn configure_pikevm_builder( test: &RegexTest, builder: &mut pikevm::Builder, ) -> bool { let match_kind = match untestify_kind(test.match_kind()) { None => return false, Some(k) => k, }; let pikevm_config = PikeVM::config().match_kind(match_kind); builder .configure(pikevm_config) .syntax(config_syntax(test)) .thompson(config_thompson(test)); true } /// Configuration of a Thompson NFA compiler from a regex test. fn config_thompson(test: &RegexTest) -> thompson::Config { let mut lookm = regex_automata::util::look::LookMatcher::new(); lookm.set_line_terminator(test.line_terminator()); thompson::Config::new().utf8(test.utf8()).look_matcher(lookm) } /// Configuration of the regex parser from a regex test. fn config_syntax(test: &RegexTest) -> syntax::Config { syntax::Config::new() .case_insensitive(test.case_insensitive()) .unicode(test.unicode()) .utf8(test.utf8()) .line_terminator(test.line_terminator()) }