regex-cursor-0.1.4/.cargo_vcs_info.json0000644000000001360000000000100134630ustar { "git": { "sha1": "b293eb72c12950e77ca929ffd016ad1b070ec265" }, "path_in_vcs": "" }regex-cursor-0.1.4/.gitignore000064400000000000000000000006351046102023000142470ustar 00000000000000# Generated by Cargo # will have compiled files and executables debug/ target/ # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html Cargo.lock # These are backup files generated by rustfmt **/*.rs.bk # MSVC Windows builds of rustc generate these, which store debugging information *.pdbregex-cursor-0.1.4/Cargo.toml0000644000000025560000000000100114710ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.65" name = "regex-cursor" version = "0.1.4" description = "regex fork that can search discontiguous haystacks" documentation = "https://docs.rs/regex-cursor" readme = "README.md" keywords = [ "regex", "dfa", "automata", "automaton", "nfa", ] categories = ["text-processing"] license = "MIT OR Apache-2.0" repository = "https://github.com/pascalkuthe/regex-cursor" [dependencies.log] version = "0.4.20" [dependencies.memchr] version = "2.6" [dependencies.regex-automata] version = "0.4.5" [dependencies.regex-syntax] version = "0.8.2" [dependencies.ropey] version = "1.6.0" optional = true default-features = false [dev-dependencies.anyhow] version = "1.0.79" [dev-dependencies.proptest] version = "1.2.0" [dev-dependencies.regex-test] version = "0.1.0" [features] default = [ "perf-inline", "ropey", ] perf-inline = [] ropey = ["dep:ropey"] regex-cursor-0.1.4/Cargo.toml.orig000064400000000000000000000015601046102023000151440ustar 00000000000000[package] name = "regex-cursor" description = "regex fork that can search discontiguous haystacks" version = "0.1.4" edition = "2021" documentation = "https://docs.rs/regex-cursor" author = "Pascal Kuthe " repository = "https://github.com/pascalkuthe/regex-cursor" readme = "README.md" keywords = ["regex", "dfa", "automata", "automaton", "nfa"] license = "MIT OR Apache-2.0" categories = ["text-processing"] rust-version = "1.65" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] log = "0.4.20" memchr = "2.6" regex-automata = "0.4.5" regex-syntax = "0.8.2" ropey = { version = "1.6.0", default-features = false, optional = true } [dev-dependencies] anyhow = "1.0.79" proptest = "1.2.0" regex-test = "0.1.0" [features] default = ["perf-inline", "ropey"] perf-inline = [] ropey = ["dep:ropey"] regex-cursor-0.1.4/LICENSE-APACHE000064400000000000000000000251371046102023000142070ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. 
"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
regex-cursor-0.1.4/LICENSE-MIT000064400000000000000000000020401046102023000137060ustar 00000000000000Copyright (c) 2024 Pascal Kuthe Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. regex-cursor-0.1.4/README.md000064400000000000000000000045241046102023000135370ustar 00000000000000# regex-cursor This crate provides routines for searching **discontiguous strings** for matches of a [regular expression] (aka "regex"). It is based on [regex-automata] and most of the code is adapted from the various crates in the [regex](https://github.com/rust-lang/regex) repository. It is intended as a prototype for upstream support for "streaming regex". The cursor-based API in this crate is very similar to the API already exposed by `regex`/`regex-automata`. To that end a generic `Cursor` trait is provided that collections can implement. A sketch of the cursor API is shown below. The string is yielded in multiple byte chunks. Calling advance moves the cursor to the next chunk. Calling backtrack moves the cursor a chunk back. Backtracking is required by this crate. That makes it unsuitable for searching fully unbuffered streams like bytes sent over a TCP connection. ``` rust pub trait Cursor { fn chunk(&self) -> &[u8] { .. } fn advance(&mut self) -> bool { .. } fn backtrack(&mut self) -> bool { .. } } ``` Working on this crate showed me that regex searches backtrack a lot more than expected, with most functionality fundamentally requiring backtracking. For network use cases that do not buffer their input, the primary goal would likely be detecting a match (without necessarily requiring the matched byte range). Such use cases can be covered by manually feeding bytes into the hybrid and DFA engines from the regex-automata crate (a sketch of this approach is shown below). This approach also has the advantage of allowing the caller to pause the match (async) while waiting for more data, allowing the caller to drive the search instead of the engine itself. The only part of this crate that could be applied to the fully streaming case is the streaming PikeVM implementation. However, there are some limitations: * only a single search can be run since the PikeVM may look ahead multiple bytes to disambiguate alternative matches * Prefilters longer than one byte cannot work * utf-8 mode cannot be supported (empty matches may occur that split a codepoint) Currently, the PikeVM implementation is not written with this use case in mind and may call backtrack unnecessarily. That could be addressed in the future, but especially the first point is very limiting.
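To make that concrete, below is a minimal sketch (not an API of this crate) of driving regex-automata's lazy DFA one chunk at a time. The helper name `matches_stream` is made up for illustration; it only answers "does the stream match anywhere?" and ignores match offsets, leftmost-first semantics and anchoring.

``` rust
use regex_automata::hybrid::dfa::DFA;
use regex_automata::Input;

/// Feed streamed chunks through the lazy DFA and report whether a match exists.
fn matches_stream<'a>(
    pattern: &str,
    chunks: impl IntoIterator<Item = &'a [u8]>,
) -> Result<bool, Box<dyn std::error::Error>> {
    let dfa = DFA::new(pattern)?;
    let mut cache = dfa.create_cache();
    // Unanchored forward start state without any look-behind context.
    let mut sid = dfa.start_state_forward(&mut cache, &Input::new(""))?;
    for chunk in chunks {
        for &byte in chunk {
            sid = dfa.next_state(&mut cache, sid, byte)?;
            if sid.is_match() {
                // Match states in the lazy DFA are delayed by one byte.
                return Ok(true);
            } else if sid.is_dead() {
                return Ok(false);
            } else if sid.is_quit() {
                return Err("lazy DFA quit the search".into());
            }
        }
    }
    // The end-of-input transition is needed for patterns like `foo$`.
    sid = dfa.next_eoi_state(&mut cache, sid)?;
    Ok(sid.is_match())
}
```

Because the caller owns the byte loop, it can suspend between chunks (for example while awaiting a socket read) instead of blocking inside the engine.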
The pikevm also does not allow the user to drive the search and would block on network calls for example (no async). regex-cursor-0.1.4/proptest-regressions/engines/dfa/test.txt000064400000000000000000000014111046102023000223730ustar 00000000000000# Seeds for failure cases proptest has generated in the past. It is # automatically read and these particular cases re-run before any # novel cases are generated. # # It is recommended to check this file in to source control so that # everyone who runs the test benefits from these saved cases. cc 2795080e34081178522520583a3fffdcfeadb09aa47a298f991e102fb6064559 # shrinks to mut haystack = "𛅕", needle = "" cc 561c3e868d6f45d3071f185399fcd6031baede9ecbda8b4a1f3e9760775dc27e # shrinks to mut haystack = "Σ0🌀𑍇𑵐:𫠠𝕒 ", needle = ":" cc 63a23412cc7362942174b377418542dd6430d448b0f72833809e22588e872d09 # shrinks to mut haystack = "a", needle = "" cc 311b1045964903485e0577546cf1341422999100f2e3274f8d4ea61fea074b20 # shrinks to mut haystack = "®", needle = "." regex-cursor-0.1.4/proptest-regressions/engines/hybrid/test.txt000064400000000000000000000017401046102023000231270ustar 00000000000000# Seeds for failure cases proptest has generated in the past. It is # automatically read and these particular cases re-run before any # novel cases are generated. # # It is recommended to check this file in to source control so that # everyone who runs the test benefits from these saved cases. cc 3152dced60f8c193302e2adbe9ebd67be558b4af65991b997e5f776920c0459f # shrinks to haystack = "", needle = "" cc 0a97b5285cbdc808df0e0e829c62fe77de165b9aaf8f15dc0d41a150407a4b01 # shrinks to haystack = "Y", needle = "Y" cc 3121032e282f21b11023cec49d0119661db16574d821f15b91400b6d66449702 # shrinks to haystack = "&&", needle = "&" cc f8813009c0bd8c6bdd386e9b17ce8bb83e513707c27985bc2757c56549c7290c # shrinks to haystack = ":a", needle = "$|:" cc 1cd08976b659689543c93e102417319e7dafe94333d0f2813f5c68dc935bb6cf # shrinks to haystack = "Σ /ⶠaAA ﷏00AAΣ/എ", needle = "/" cc 7fdff08fc051c9b641db028206943cbb84ca26f8a88e06eadaa5b09b66148d34 # shrinks to mut haystack = "𑊊", needle = "𑒀?." regex-cursor-0.1.4/proptest-regressions/engines/pikevm/tests.txt000064400000000000000000000013431046102023000233230ustar 00000000000000# Seeds for failure cases proptest has generated in the past. It is # automatically read and these particular cases re-run before any # novel cases are generated. # # It is recommended to check this file in to source control so that # everyone who runs the test benefits from these saved cases. cc 4c899804f8e28d294268b2c482879338edc3be0210465aeaf6a03d65626d386f # shrinks to haystack = "Ѩ", needle = "Ѩ*|A0" cc 9dcbeee2d5ffde3324638f38b2eefc96a95b0665810c02c12093976a0aba96c5 # shrinks to haystack = "", needle = "^" cc 0311c531b8a3e09dc21270ace24fc7cdec1d773228a9ce3843888afe4774c4a2 # shrinks to haystack = "", needle = "$" cc 578435f522160de6326c7cf57b367dc9e52679b796ecf8d331a9684a9ef4d1f7 # shrinks to haystack = " ", needle = "." regex-cursor-0.1.4/proptest-regressions/literal/tests.txt000064400000000000000000000023701046102023000220350ustar 00000000000000# Seeds for failure cases proptest has generated in the past. It is # automatically read and these particular cases re-run before any # novel cases are generated. # # It is recommended to check this file in to source control so that # everyone who runs the test benefits from these saved cases. 
cc a1f6f819109c893f29c5f71a0ac13dfcbf04de0dc6411615de2d9587b12d6edf # shrinks to haystack = "", needle = "🌀🤀𛱰a0Aa®ଏ¡𞥞®0" cc 9fc9553316dab0f5611d42ebdbfda893e991f183f013a13e105570d9bb935bbb # shrinks to haystack = "🀄", needle = [128] cc 14528483978ac457a80022577321d49eadc3952a4bc848dcf622730341424c50 # shrinks to haystack = "\"", needle = "\"" cc 0906f449ec7e583178f7865198d5c6c8589f6a760f57fe1e94fa71b751a13dcc # shrinks to haystack = "*", needle = "*" cc 3dc047ca1210586977bea6afe1c52f3f21b8f778358932316bce56a9c8dd069a # shrinks to mut haystack = "®", needle = "¯" cc d37b534f1d1d9b91a41efb745325c95e429901bd53d2bc4a31fd55997e5b243a # shrinks to mut haystack = "Ѩ", needle = "Ѩ" cc ea94b3aca8d5e5c4728504f773d8ec61d1e7a0e3aa8e186b9c953a199cd7e3e2 # shrinks to mut haystack = "A® a𛲜�a0 a0 𖬀 ", needle = "�" cc 80ea1772c0da540fd9e502978e22f1678ea0a06ec302d38891ecf36be39f966c # shrinks to mut haystack = "0Aa0 ��⺀ A", needle = "�" regex-cursor-0.1.4/proptest-regressions/util/tests.txt000064400000000000000000000006101046102023000213510ustar 00000000000000# Seeds for failure cases proptest has generated in the past. It is # automatically read and these particular cases re-run before any # novel cases are generated. # # It is recommended to check this file in to source control so that # everyone who runs the test benefits from these saved cases. cc 06febfa67a8673673da6a2a4d70869e49f8d45945ae98745208a6266253a5bed # shrinks to haystack = "®" regex-cursor-0.1.4/rustfmt.toml000064400000000000000000000003171046102023000146550ustar 00000000000000use_small_heuristics = "Max" newline_style = "Unix" use_field_init_shorthand = true imports_granularity = "Module" group_imports = "StdExternalCrate" format_macro_matchers = true format_macro_bodies = true regex-cursor-0.1.4/src/cursor.rs000064400000000000000000000160261046102023000147320ustar 00000000000000pub trait IntoCursor { type Cursor: Cursor; fn into_cursor(self) -> Self::Cursor; } impl<C: Cursor> IntoCursor for C { type Cursor = Self; fn into_cursor(self) -> Self { self } } /// A cursor that allows traversing a discontiguous string like a rope. pub trait Cursor { /// Returns the current chunk. If [`utf8_aware`](Cursor::utf8_aware) returns true then this function /// must **never** return a chunk that splits a unicode codepoint. /// See [`utf8_aware`](Cursor::utf8_aware) for details. /// /// Must never return an empty byteslice unless the underlying collection is empty. fn chunk(&self) -> &[u8]; /// Whether this cursor is aware of utf-8 codepoint boundaries. /// /// **`true`** means that this cursor must never split a unicode codepoint at a /// chunk boundary. In that case all regex features are supported. /// /// **`false`** means that this cursor cannot be used for utf-8 mode /// matching (only affects empty strings) and cannot be used to match /// unicode word boundaries. fn utf8_aware(&self) -> bool { true } /// Advances the cursor to the next chunk if possible. In that case `true` /// must be returned. If the end of data is reached this function should /// return `false` and **not change the chunk**. fn advance(&mut self) -> bool; /// Moves the cursor to the previous chunk if possible. In that case `true` /// must be returned. If the start of data is reached this function should /// return `false` and **not change the chunk**. fn backtrack(&mut self) -> bool; /// Returns the total length of the data. This does not /// take the current cursor position into account and should /// not change with calls to [`advance`](Cursor::advance) and [`backtrack`](Cursor::backtrack).
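/// A cursor over a stream whose total length is unknown may return `None` here.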
fn total_bytes(&self) -> Option<usize>; /// The offset of the current chunk from the start of the haystack in bytes fn offset(&self) -> usize; } impl<C: Cursor> Cursor for &mut C { fn chunk(&self) -> &[u8] { C::chunk(self) } fn utf8_aware(&self) -> bool { C::utf8_aware(self) } fn advance(&mut self) -> bool { C::advance(self) } fn backtrack(&mut self) -> bool { C::backtrack(self) } fn total_bytes(&self) -> Option<usize> { C::total_bytes(self) } fn offset(&self) -> usize { C::offset(self) } } impl Cursor for &[u8] { fn chunk(&self) -> &[u8] { self } // true since there are no chunk boundaries fn utf8_aware(&self) -> bool { true } fn advance(&mut self) -> bool { false } fn backtrack(&mut self) -> bool { false } fn total_bytes(&self) -> Option<usize> { Some(self.len()) } fn offset(&self) -> usize { 0 } } impl Cursor for &str { fn chunk(&self) -> &[u8] { self.as_bytes() } // true since there are no chunk boundaries fn utf8_aware(&self) -> bool { true } fn advance(&mut self) -> bool { false } fn backtrack(&mut self) -> bool { false } fn total_bytes(&self) -> Option<usize> { Some(<str>::len(self)) } fn offset(&self) -> usize { 0 } } #[cfg(feature = "ropey")] #[derive(Clone, Copy)] enum Pos { ChunkStart, ChunkEnd, } #[cfg(feature = "ropey")] #[derive(Clone)] pub struct RopeyCursor<'a> { iter: ropey::iter::Chunks<'a>, current: &'a [u8], pos: Pos, len: usize, offset: usize, } #[cfg(feature = "ropey")] impl<'a> RopeyCursor<'a> { pub fn new(slice: ropey::RopeSlice<'a>) -> Self { let iter = slice.chunks(); let mut res = Self { current: &[], iter, pos: Pos::ChunkEnd, len: slice.len_bytes(), offset: 0 }; res.advance(); res } pub fn at(slice: ropey::RopeSlice<'a>, at: usize) -> Self { let (iter, offset, _, _) = slice.chunks_at_byte(at); if offset == slice.len_bytes() { let mut res = Self { current: &[], iter, pos: Pos::ChunkStart, len: slice.len_bytes(), offset }; res.backtrack(); res } else { let mut res = Self { current: &[], iter, pos: Pos::ChunkEnd, len: slice.len_bytes(), offset }; res.advance(); res } } } #[cfg(feature = "ropey")] impl Cursor for RopeyCursor<'_> { fn chunk(&self) -> &[u8] { self.current } fn advance(&mut self) -> bool { match self.pos { Pos::ChunkStart => { self.iter.next(); self.pos = Pos::ChunkEnd; } Pos::ChunkEnd => (), } for next in self.iter.by_ref() { if next.is_empty() { continue; } self.offset += self.current.len(); self.current = next.as_bytes(); return true; } false } fn backtrack(&mut self) -> bool { match self.pos { Pos::ChunkStart => {} Pos::ChunkEnd => { self.iter.prev(); self.pos = Pos::ChunkStart; } } while let Some(prev) = self.iter.prev() { if prev.is_empty() { continue; } self.offset -= prev.len(); self.current = prev.as_bytes(); return true; } false } fn utf8_aware(&self) -> bool { true } fn total_bytes(&self) -> Option<usize> { Some(self.len) } fn offset(&self) -> usize { self.offset } } #[cfg(feature = "ropey")] impl<'a> IntoCursor for ropey::RopeSlice<'a> { type Cursor = RopeyCursor<'a>; fn into_cursor(self) -> Self::Cursor { RopeyCursor::new(self) } } #[cfg(feature = "ropey")] impl<'a> IntoCursor for &'a ropey::Rope { type Cursor = RopeyCursor<'a>; fn into_cursor(self) -> Self::Cursor { RopeyCursor::new(self.slice(..)) } } #[cfg(all(feature = "ropey", test))] mod ropey_test { use ropey::Rope; use crate::cursor::IntoCursor; use crate::Cursor; #[test] fn smoke_test() { let rope = Rope::from_str("abc"); let mut cursor = rope.into_cursor(); assert_eq!(cursor.chunk(), "abc".as_bytes()); assert!(!cursor.advance()); assert_eq!(cursor.chunk(), "abc".as_bytes()); assert!(!cursor.backtrack());
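// A failed advance or backtrack must leave the cursor on the same chunk.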
assert_eq!(cursor.chunk(), "abc".as_bytes()); let rope = Rope::from("abc".repeat(5000)); let mut cursor = rope.into_cursor(); let mut offset = 0; loop { assert_eq!(cursor.offset(), offset); offset += cursor.chunk().len(); if !cursor.advance() { break; } } loop { offset -= cursor.chunk().len(); assert_eq!(cursor.offset(), offset); if !cursor.backtrack() { break; } } assert_eq!(cursor.offset(), 0); assert_eq!(offset, 0); } } regex-cursor-0.1.4/src/engines/dfa/accel.rs000064400000000000000000000051201046102023000166400ustar 00000000000000use crate::cursor::Cursor; use crate::Input; /// Search for between 1 and 3 needle bytes in the given haystack, starting the /// search at the given position. If `needles` has a length other than 1-3, /// then this panics. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn find_fwd_imp(needles: &[u8], haystack: &[u8], at: usize) -> Option<usize> { let bs = needles; let i = match needles.len() { 1 => memchr::memchr(bs[0], &haystack[at..])?, 2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?, 3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?, 0 => panic!("cannot find with empty needles"), n => panic!("invalid needles length: {}", n), }; Some(at + i) } /// Search for between 1 and 3 needle bytes in the given input, starting the /// search at the given position. If `needles` has a length other than 1-3, /// then this panics. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn find_fwd<C: Cursor>( needles: &[u8], input: &mut Input<C>, at: usize, ) -> Option<usize> { if let Some(pos) = find_fwd_imp(needles, input.chunk(), at) { return Some(pos); } while input.advance() { if let Some(pos) = find_fwd_imp(needles, input.chunk(), 0) { return Some(pos); } } None } /// Search for between 1 and 3 needle bytes in the given haystack in reverse, /// starting the search at the given position. If `needles` has a length other /// than 1-3, then this panics. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn find_rev_imp(needles: &[u8], haystack: &[u8], at: usize) -> Option<usize> { let bs = needles; match needles.len() { 1 => memchr::memrchr(bs[0], &haystack[..at]), 2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]), 3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]), 0 => panic!("cannot find with empty needles"), n => panic!("invalid needles length: {}", n), } } /// Search for between 1 and 3 needle bytes in the given input, starting the /// search at the given position. If `needles` has a length other than 1-3, /// then this panics. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn find_rev<C: Cursor>( needles: &[u8], input: &mut Input<C>, at: usize, ) -> Option<usize> { if let Some(pos) = find_rev_imp(needles, input.chunk(), at) { return Some(pos); } while input.backtrack() { if let Some(pos) = find_rev_imp(needles, input.chunk(), input.chunk().len()) { return Some(pos); } } None } regex-cursor-0.1.4/src/engines/dfa/search.rs000064400000000000000000000737251046102023000170550ustar 00000000000000use regex_automata::{ dfa::{Automaton, StartError}, util::{prefilter::Prefilter, primitives::StateID, start}, Anchored, HalfMatch, MatchError, }; use crate::{cursor::Cursor, engines::dfa::accel, literal, util::empty, Input}; /// Executes a forward search and returns the end position of the leftmost /// match that is found. If no match exists, then `None` is returned. /// /// In particular, this method continues searching even after it enters /// a match state.
The search only terminates once it has reached the /// end of the input or when it has entered a dead or quit state. Upon /// termination, the position of the last byte seen while still in a match /// state is returned. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the DFA quitting. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Notes for implementors /// /// Implementors of this trait are not required to implement any particular /// match semantics (such as leftmost-first), which are instead manifest in /// the DFA's transitions. But this search routine should behave as a /// general "leftmost" search. /// /// In particular, this method must continue searching even after it enters /// a match state. The search should only terminate once it has reached /// the end of the input or when it has entered a dead or quit state. Upon /// termination, the position of the last byte seen while still in a match /// state is returned. /// /// Since this trait provides an implementation for this method by default, /// it's unlikely that one will need to implement this. /// /// # Example /// /// This example shows how to use this method with a /// [`dense::DFA`](crate::dfa::dense::DFA). /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// let dfa = dense::DFA::new("foo[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 8)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"foo12345"))?); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. /// let dfa = dense::DFA::new("abc|a")?; /// let expected = Some(HalfMatch::must(0, 3)); /// assert_eq!(expected, dfa.try_search_fwd(&Input::new(b"abc"))?); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: specific pattern search /// /// This example shows how to build a multi-DFA that permits searching for /// specific patterns. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{ /// dfa::{Automaton, dense}, /// Anchored, HalfMatch, PatternID, Input, /// }; /// /// let dfa = dense::Builder::new() /// .configure(dense::Config::new().starts_for_each_pattern(true)) /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; /// let haystack = "foo123".as_bytes(); /// /// // Since we are using the default leftmost-first match and both /// // patterns match at the same starting position, only the first pattern /// // will be returned in this case when doing a search for any of the /// // patterns. /// let expected = Some(HalfMatch::must(0, 6)); /// let got = dfa.try_search_fwd(&Input::new(haystack))?; /// assert_eq!(expected, got); /// /// // But if we want to check whether some other pattern matches, then we /// // can provide its pattern ID.
/// let input = Input::new(haystack) /// .anchored(Anchored::Pattern(PatternID::must(1))); /// let expected = Some(HalfMatch::must(1, 6)); /// let got = dfa.try_search_fwd(&input)?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: specifying the bounds of a search /// /// This example shows how providing the bounds of a search can produce /// different results than simply sub-slicing the haystack. /// /// ``` /// use regex_automata::{dfa::{Automaton, dense}, HalfMatch, Input}; /// /// // N.B. We disable Unicode here so that we use a simple ASCII word /// // boundary. Alternatively, we could enable heuristic support for /// // Unicode word boundaries. /// let dfa = dense::DFA::new(r"(?-u)\b[0-9]{3}\b")?; /// let haystack = "foo123bar".as_bytes(); /// /// // Since we sub-slice the haystack, the search doesn't know about the /// // larger context and assumes that `123` is surrounded by word /// // boundaries. And of course, the match position is reported relative /// // to the sub-slice as well, which means we get `3` instead of `6`. /// let input = Input::new(&haystack[3..6]); /// let expected = Some(HalfMatch::must(0, 3)); /// let got = dfa.try_search_fwd(&input)?; /// assert_eq!(expected, got); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. (And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let input = Input::new(haystack).range(3..6); /// let expected = None; /// let got = dfa.try_search_fwd(&input)?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] pub fn try_search_fwd<A: Automaton, C: Cursor>( dfa: &A, input: &mut Input<C>, ) -> Result<Option<HalfMatch>, MatchError> { let utf8empty = dfa.has_empty() && dfa.is_utf8(); let hm = match find_fwd(dfa, input)? { None => return Ok(None), Some(hm) if !utf8empty => return Ok(Some(hm)), Some(hm) => hm, }; // We get to this point when we know our DFA can match the empty string // AND when UTF-8 mode is enabled. In this case, we skip any matches // whose offset splits a codepoint. Such a match is necessarily a // zero-width match, because UTF-8 mode requires the underlying NFA // to be built such that all non-empty matches span valid UTF-8. // Therefore, any match that ends in the middle of a codepoint cannot // be part of a span of valid UTF-8 and thus must be an empty match. // In such cases, we skip it, so as not to report matches that split a // codepoint. // // Note that this is not a checked assumption. Callers *can* provide an // NFA with UTF-8 mode enabled but that produces non-empty matches that span // invalid UTF-8. But doing so is documented to result in unspecified // behavior. empty::skip_splits_fwd(input, hm, hm.offset(), |input| { let got = find_fwd(dfa, input)?; Ok(got.map(|hm| (hm, hm.offset()))) }) } /// Executes a reverse search and returns the start of the position of the /// leftmost match that is found. If no match exists, then `None` is /// returned. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the DFA quitting. /// * When the provided `Input` configuration is not supported.
For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example /// /// This example shows how to use this method with a /// [`dense::DFA`](crate::dfa::dense::DFA). In particular, this /// routine is principally useful when used in conjunction with the /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) /// configuration. In general, it's unlikely to be correct to use /// both `try_search_fwd` and `try_search_rev` with the same DFA since /// any particular DFA will only support searching in one direction with /// respect to the pattern. /// /// ``` /// use regex_automata::{ /// nfa::thompson, /// dfa::{Automaton, dense}, /// HalfMatch, Input, /// }; /// /// let dfa = dense::Builder::new() /// .thompson(thompson::Config::new().reverse(true)) /// .build("foo[0-9]+")?; /// let expected = Some(HalfMatch::must(0, 0)); /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"foo12345"))?); /// /// // Even though a match is found after reading the last byte (`c`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. /// let dfa = dense::Builder::new() /// .thompson(thompson::Config::new().reverse(true)) /// .build("abc|c")?; /// let expected = Some(HalfMatch::must(0, 0)); /// assert_eq!(expected, dfa.try_search_rev(&Input::new(b"abc"))?); /// /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` /// /// # Example: UTF-8 mode /// /// This example demonstrates that UTF-8 mode applies to reverse /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all /// matches reported must correspond to valid UTF-8 spans. This includes /// prohibiting zero-width matches that split a codepoint. /// /// UTF-8 mode is enabled by default. Notice below how the only zero-width /// matches reported are those at UTF-8 boundaries: /// /// ``` /// use regex_automata::{ /// dfa::{dense::DFA, Automaton}, /// nfa::thompson, /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() /// .thompson(thompson::Config::new().reverse(true)) /// .build(r"")?; /// /// // Run the reverse DFA to collect all matches.
/// let mut input = Input::new("☃"); /// let mut matches = vec![]; /// loop { /// match dfa.try_search_rev(&input)? { /// None => break, /// Some(hm) => { /// matches.push(hm); /// if hm.offset() == 0 || input.end() == 0 { /// break; /// } else if hm.offset() < input.end() { /// input.set_end(hm.offset()); /// } else { /// // This is only necessary to handle zero-width /// // matches, which of course occur in this example. /// // Without this, the search would never advance /// // backwards beyond the initial match. /// input.set_end(input.end() - 1); /// } /// } /// } /// } /// /// // No matches split a codepoint. /// let expected = vec![ /// HalfMatch::must(0, 3), /// HalfMatch::must(0, 2), /// HalfMatch::must(0, 1), /// HalfMatch::must(0, 0), /// ]; /// assert_eq!(expected, matches); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_search_rev( dfa: &A, input: &mut Input, ) -> Result, MatchError> { let utf8empty = dfa.has_empty() && dfa.is_utf8(); let hm = match find_rev(dfa, input)? { None => return Ok(None), Some(hm) if !utf8empty => return Ok(Some(hm)), Some(hm) => hm, }; empty::skip_splits_rev(input, hm, hm.offset(), |input| { let got = find_rev(dfa, input)?; Ok(got.map(|hm| (hm, hm.offset()))) }) } #[inline(never)] pub fn find_fwd( dfa: &A, input: &mut Input, ) -> Result, MatchError> { input.move_to(input.start()); if input.is_done() { return Ok(None); } // Searching with a pattern ID is always anchored, so we should never use // a prefilter. let pre = if input.get_anchored().is_anchored() { None } else { dfa.get_prefilter() }; if pre.is_some() { if input.get_earliest() { find_fwd_imp(dfa, input, pre, true) } else { find_fwd_imp(dfa, input, pre, false) } } else if input.get_earliest() { find_fwd_imp(dfa, input, None, true) } else { find_fwd_imp(dfa, input, None, false) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_fwd_imp( dfa: &A, input: &mut Input, pre: Option<&'_ Prefilter>, earliest: bool, ) -> Result, MatchError> { // See 'prefilter_restart' docs for explanation. let universal_start = dfa.universal_start_state(Anchored::No).is_some(); let mut mat = None; let mut sid = init_fwd(dfa, input)?; if let Some(pre) = pre { // If a prefilter doesn't report false positives, then we don't need to // touch the DFA at all. However, since all matches include the pattern // ID, and the prefilter infrastructure doesn't report pattern IDs, we // limit this optimization to cases where there is exactly one pattern. // In that case, any match must be the 0th pattern. match literal::find(pre, input) { None => return Ok(mat), Some(ref span) => { input.move_to(span.start); if !universal_start { sid = prefilter_restart(dfa, input)?; } } } } // This could just be a closure, but then I think it would be unsound // because it would need to be safe to invoke. This way, the lack of safety // is clearer in the code below. macro_rules! next_unchecked { ($sid:expr) => {{ debug_assert!(input.chunk_pos() < input.chunk().len()); let byte = *input.chunk().get_unchecked(input.chunk_pos()); dfa.next_state_unchecked($sid, byte) }}; } 'outer: loop { // SAFETY: There are two safety invariants we need to uphold here in // the loops below: that 'sid' and 'prev_sid' are valid state IDs // for this DFA, and that 'at' is a valid index into .chunk'. // For the former, we rely on the invariant that next_state* and // start_state_forward always returns a valid state ID (given a valid // state ID in the former case). 
For the latter safety invariant, we // always guard unchecked access with a check that 'at' is less than // 'end', where 'end <= chunk.len()'. In the unrolled loop below, we // ensure that 'at' is always in bounds. // // PERF: See a similar comment in src/hybrid/search.rs that justifies // this extra work to make the search loop fast. The same reasoning and // benchmarks apply here. let mut prev_sid; loop { if input.at() >= input.end() || input.chunk_pos() >= input.chunk().len() && !input.advance() { break 'outer; } prev_sid = unsafe { next_unchecked!(sid) }; if dfa.is_special_state(prev_sid) || input.at() + 3 >= input.end() { core::mem::swap(&mut prev_sid, &mut sid); break; } input.chunk_pos += 1; if input.chunk_pos + 3 >= input.chunk().len() { core::mem::swap(&mut prev_sid, &mut sid); continue; } sid = unsafe { next_unchecked!(prev_sid) }; if dfa.is_special_state(sid) { break; } input.chunk_pos += 1; prev_sid = unsafe { next_unchecked!(sid) }; if dfa.is_special_state(prev_sid) { core::mem::swap(&mut prev_sid, &mut sid); break; } input.chunk_pos += 1; sid = unsafe { next_unchecked!(prev_sid) }; if dfa.is_special_state(sid) { break; } input.chunk_pos += 1; } if dfa.is_special_state(sid) { if dfa.is_start_state(sid) { if let Some(pre) = pre { let old_pos = input.at(); match literal::find(pre, input) { None => return Ok(mat), Some(ref span) => { // We want to skip any update to 'at' below // at the end of this iteration and just // jump immediately back to the next state // transition at the leading position of the // candidate match. // // ... but only if we actually made progress // with our prefilter, otherwise if the start // state has a self-loop, we can get stuck. if span.start > old_pos { input.move_to(span.start); if !universal_start { sid = prefilter_restart(dfa, input)?; } continue; } else if input.at() != old_pos { // the prefilter may need to do some scan ahead input.move_to(old_pos); } } } } else if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); input.chunk_pos = accel::find_fwd(needles, input, input.chunk_pos + 1) .unwrap_or_else(|| input.chunk().len()); continue; } } else if dfa.is_match_state(sid) { let pattern = dfa.match_pattern(sid, 0); mat = Some(HalfMatch::new(pattern, input.at())); if earliest { return Ok(mat); } if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); input.chunk_pos = accel::find_fwd(needles, input, input.chunk_pos + 1) .unwrap_or_else(|| input.chunk().len()); continue; } } else if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); input.chunk_pos = accel::find_fwd(needles, input, input.chunk_pos + 1) .unwrap_or_else(|| input.chunk().len()); continue; } else if dfa.is_dead_state(sid) { return Ok(mat); } else { // It's important that this is a debug_assert, since this can // actually be tripped even if DFA::from_bytes succeeds and // returns a supposedly valid DFA.
debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.chunk()[input.chunk_pos], input.at())); } } input.chunk_pos += 1; } eoi_fwd(dfa, input, &mut sid, &mut mat)?; Ok(mat) } #[inline(never)] pub fn find_rev<A: Automaton, C: Cursor>( dfa: &A, input: &mut Input<C>, ) -> Result<Option<HalfMatch>, MatchError> { input.move_to(input.end()); if input.is_done() { return Ok(None); } if input.get_earliest() { find_rev_imp(dfa, input, true) } else { find_rev_imp(dfa, input, false) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_rev_imp<A: Automaton, C: Cursor>( dfa: &A, input: &mut Input<C>, earliest: bool, ) -> Result<Option<HalfMatch>, MatchError> { let mut mat = None; let mut sid = init_rev(dfa, input)?; // In reverse search, the loop below can't handle the case of searching an // empty slice. Ideally we could write something congruent to the forward // search, i.e., 'while at >= start', but 'start' might be 0. Since we use // an unsigned offset, 'at >= 0' is trivially always true. We could avoid // this extra case handling by using a signed offset, but Rust makes it // annoying to do. So... We just handle the empty case separately. if input.start() == input.end() || input.chunk_pos == 0 && !input.backtrack() { eoi_rev(dfa, input, &mut sid, &mut mat)?; return Ok(mat); } input.chunk_pos -= 1; // This could just be a closure, but then I think it would be unsound // because it would need to be safe to invoke. This way, the lack of safety // is clearer in the code below. macro_rules! next_unchecked { ($sid:expr) => {{ let byte = *input.chunk().get_unchecked(input.chunk_pos); dfa.next_state_unchecked($sid, byte) }}; } #[rustfmt::skip] macro_rules! ensure_chunk { () => { if input.chunk_pos == 0 && !input.backtrack() { break; } }; } loop { // SAFETY: See comments in 'find_fwd' for a safety argument. let mut prev_sid; while input.at() >= input.start() { prev_sid = unsafe { next_unchecked!(sid) }; if dfa.is_special_state(prev_sid) || input.at() <= input.start().saturating_add(3) { core::mem::swap(&mut prev_sid, &mut sid); break; } ensure_chunk!(); input.chunk_pos -= 1; if input.chunk_pos <= 2 { core::mem::swap(&mut prev_sid, &mut sid); continue; } sid = unsafe { next_unchecked!(prev_sid) }; if dfa.is_special_state(sid) { break; } input.chunk_pos -= 1; prev_sid = unsafe { next_unchecked!(sid) }; if dfa.is_special_state(prev_sid) { core::mem::swap(&mut prev_sid, &mut sid); break; } input.chunk_pos -= 1; sid = unsafe { next_unchecked!(prev_sid) }; if dfa.is_special_state(sid) { break; } input.chunk_pos -= 1; } if dfa.is_special_state(sid) { if dfa.is_start_state(sid) { if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); input.chunk_pos = accel::find_rev(needles, input, input.chunk_pos) .map(|i| i + 1) .unwrap_or(0); } } else if dfa.is_match_state(sid) { let pattern = dfa.match_pattern(sid, 0); // Since reverse searches report the beginning of a match // and the beginning is inclusive (not exclusive like the // end of a match), we add 1 to make it inclusive. mat = Some(HalfMatch::new(pattern, input.at() + 1)); if earliest { return Ok(mat); } if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); input.chunk_pos = accel::find_rev(needles, input, input.chunk_pos) .map(|i| i + 1) .unwrap_or(0); } } else if dfa.is_accel_state(sid) { let needles = dfa.accelerator(sid); // If the accelerator returns nothing, why don't we quit the // search? Well, if the accelerator doesn't find anything, that // doesn't mean we don't have a match. It just means that we // can't leave the current state given one of the 255 possible // byte values.
However, there might be an EOI transition. So // we set 'at' to the end of the chunk, which will cause // this loop to stop and fall down into the EOI transition. input.chunk_pos = accel::find_rev(needles, input, input.chunk_pos).map(|i| i + 1).unwrap_or(0); } else if dfa.is_dead_state(sid) { return Ok(mat); } else { debug_assert!(dfa.is_quit_state(sid)); return Err(MatchError::quit(input.chunk()[input.chunk_pos], input.at())); } } if input.at() <= input.start() { break; } ensure_chunk!(); input.chunk_pos -= 1; } eoi_rev(dfa, input, &mut sid, &mut mat)?; Ok(mat) } #[cfg_attr(feature = "perf-inline", inline(always))] fn init_fwd<A: Automaton, C: Cursor>( dfa: &A, input: &mut Input<C>, ) -> Result<StateID, MatchError> { let look_behind = input.ensure_look_behind(); let start_config = start::Config::new().look_behind(look_behind).anchored(input.get_anchored()); // let sid = dfa.start_state(&start_config)?; dfa.start_state(&start_config).map_err(|err| match err { StartError::Quit { byte } => { let offset = input.at().checked_sub(1).expect("no quit in start without look-behind"); MatchError::quit(byte, offset) } StartError::UnsupportedAnchored { mode } => MatchError::unsupported_anchored(mode), _ => panic!("damn forward compatibility"), }) } #[cfg_attr(feature = "perf-inline", inline(always))] fn init_rev<A: Automaton, C: Cursor>( dfa: &A, input: &mut Input<C>, ) -> Result<StateID, MatchError> { let chunk_pos = input.chunk_pos(); let mut look_ahead = input.chunk().get(chunk_pos).copied(); // this branch is probably not needed since chunk_pos should be in bounds // anyway but I would rather not make that a validity invariant if look_ahead.is_none() && input.advance() { look_ahead = input.chunk().first().copied(); input.backtrack(); } let start_config = start::Config::new().look_behind(look_ahead).anchored(input.get_anchored()); dfa.start_state(&start_config).map_err(|err| match err { StartError::Quit { byte } => { let offset = input.start().checked_sub(1).expect("no quit in start without look-behind"); MatchError::quit(byte, offset) } StartError::UnsupportedAnchored { mode } => MatchError::unsupported_anchored(mode), _ => panic!("damn forward compatibility"), }) } #[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_fwd<A: Automaton, C: Cursor>( dfa: &A, input: &mut Input<C>, sid: &mut StateID, mat: &mut Option<HalfMatch>, ) -> Result<(), MatchError> { let sp = input.get_span(); input.move_to(sp.end); match input.chunk().get(sp.end - input.chunk_offset()) { Some(&b) => { *sid = dfa.next_state(*sid, b); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } else if dfa.is_quit_state(*sid) { return Err(MatchError::quit(b, sp.end)); } } None => { *sid = dfa.next_eoi_state(*sid); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.end)); } // N.B. We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state.
debug_assert!(!dfa.is_quit_state(*sid)); } } Ok(()) } #[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_rev<A: Automaton, C: Cursor>( dfa: &A, input: &mut Input<C>, sid: &mut StateID, mat: &mut Option<HalfMatch>, ) -> Result<(), MatchError> { let sp = input.get_span(); if sp.start > 0 { input.move_to(input.start() - 1); let byte = input.chunk()[sp.start - input.chunk_offset() - 1]; *sid = dfa.next_state(*sid, byte); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if dfa.is_quit_state(*sid) { return Err(MatchError::quit(byte, sp.start - 1)); } } else { *sid = dfa.next_eoi_state(*sid); if dfa.is_match_state(*sid) { let pattern = dfa.match_pattern(*sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } // N.B. We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state. debug_assert!(!dfa.is_quit_state(*sid)); } Ok(()) } /// Re-compute the starting state that a DFA should be in after finding a /// prefilter candidate match at the position `at`. /// /// The function with the same name has a bit more docs in hybrid/search.rs. #[cfg_attr(feature = "perf-inline", inline(always))] fn prefilter_restart<A: Automaton, C: Cursor>( dfa: &A, input: &mut Input<C>, ) -> Result<StateID, MatchError> { init_fwd(dfa, input) } regex-cursor-0.1.4/src/engines/dfa/test.rs000064400000000000000000000040031046102023000165560ustar 00000000000000use proptest::proptest; use crate::engines::dfa::find_iter; use crate::input::Input; #[test] fn searcher() { let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap(); let regex = super::Regex::builder() .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) .build("vec") .unwrap(); let rope = ropey::Rope::from_str(&text); let matches: Vec<_> = find_iter(&regex, Input::new(rope.slice(..))) .map(|range| rope.byte_slice(range.range())) .collect(); assert_eq!(matches.len(), 68); } #[test] fn anchor() { let haystack = ":a"; let needle = "$|:"; let foo = ropey::Rope::from_str(haystack); let regex = super::Regex::builder() .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true).unicode(false)) .build(needle) .unwrap(); let iter1: Vec<_> = regex.find_iter(haystack).collect(); let iter2: Vec<_> = find_iter(&regex, Input::new(&foo)).collect(); assert_eq!(iter1, iter2); } #[test] fn hotloop_transition() { let haystack = "Σ /ⶠaAA ﷏00AAΣ/എ"; let needle = "/"; let foo = ropey::Rope::from_str(haystack); let regex = super::Regex::builder() .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) .build(needle) .unwrap(); let iter1: Vec<_> = regex.find_iter(haystack).collect(); let iter2: Vec<_> = find_iter(&regex, Input::new(&foo)).collect(); assert_eq!(iter1, iter2); } proptest!
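// Differential test: the cursor-based search must agree with regex-automata's own find_iter on arbitrary inputs.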
{ #[test] fn matches(mut haystack: String, needle: String) { haystack = haystack.repeat(1024); let foo = ropey::Rope::from_str(&haystack); let Ok(regex) = super::Regex::builder() .syntax(regex_automata::util::syntax::Config::new() .case_insensitive(true) ) .build(&needle) else { return Ok(()) }; let iter1 = regex.find_iter( &haystack); let iter2 = find_iter(&regex, Input::new(&foo)); crate::util::iter::prop_assert_eq(iter1, iter2)?; } } regex-cursor-0.1.4/src/engines/dfa.rs000064400000000000000000000223541046102023000156000ustar 00000000000000pub use regex_automata::dfa::regex::Regex; use regex_automata::dfa::Automaton; use regex_automata::{Anchored, Match, MatchError}; use crate::cursor::Cursor; use crate::util::iter; use crate::Input; pub use crate::engines::dfa::search::{try_search_fwd, try_search_rev}; mod accel; mod search; #[cfg(test)] mod test; /// Returns true if either the given input specifies an anchored search /// or if the underlying NFA is always anchored. fn is_anchored<C: Cursor>(regex: &Regex, input: &Input<C>) -> bool { match input.get_anchored() { Anchored::No => regex.forward().is_always_start_anchored(), Anchored::Yes | Anchored::Pattern(_) => true, } } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// The above conditions also apply to the iterator returned as well. For /// example, if the lazy DFA gives up or quits during a search using this /// method, then a panic will occur during iteration. /// /// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher) /// if you want to handle these error conditions. /// /// # Example /// /// ``` /// use regex_automata::{hybrid::regex::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// /// let text = "foo1 foo12 foo123"; /// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect(); /// assert_eq!(matches, vec![ /// Match::must(0, 0..4), /// Match::must(0, 5..10), /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box<dyn std::error::Error>>(()) /// ``` #[inline] pub fn find_iter<C: Cursor>(regex: &Regex, input: Input<C>) -> FindMatches<'_, C> { let it = iter::Searcher::new(input); FindMatches { re: regex, it } } /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries.
The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search panics, callers cannot know whether a match exists or /// not. /// /// Use [`Regex::try_search`] if you want to handle these error conditions. /// /// # Example /// /// ``` /// use regex_automata::{Match, hybrid::regex::Regex}; /// /// let re = Regex::new("foo[0-9]+")?; /// let mut cache = re.create_cache(); /// assert_eq!( /// Some(Match::must(0, 3..11)), /// re.find(&mut cache, "zzzfoo12345zzz"), /// ); /// /// // Even though a match is found after reading the first byte (`a`), /// // the default leftmost-first match semantics demand that we find the /// // earliest match that prefers earlier parts of the pattern over latter /// // parts. /// let re = Regex::new("abc|a")?; /// let mut cache = re.create_cache(); /// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc")); /// # Ok::<(), Box>(()) /// ``` pub fn find(regex: &Regex, input: &mut Input) -> Option { try_search(regex, input).unwrap() } /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// This is like [`Regex::find`] but with two differences: /// /// 1. It is not generic over `Into` and instead accepts a /// `&Input`. This permits reusing the same `Input` for multiple searches /// without needing to create a new one. This _may_ help with latency. /// 2. It returns an error if the search could not complete where as /// [`Regex::find`] will panic. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. pub fn try_search( regex: &Regex, input: &mut Input, ) -> Result, MatchError> { let fwd = regex.forward(); let end = match try_search_fwd(fwd, input)? { None => return Ok(None), Some(end) => end, }; // This special cases an empty match at the beginning of the search. If // our end matches our start, then since a reverse DFA can't match past // the start, it must follow that our starting position is also our end // position. So short circuit and skip the reverse search. if input.start() == end.offset() { return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset()))); } // We can also skip the reverse search if we know our search was // anchored. This occurs either when the input config is anchored or // when we know the regex itself is anchored. 
In this case, we know the // start of the match, if one is found, must be the start of the // search. if is_anchored(regex, input) { return Ok(Some(Match::new(end.pattern(), input.start()..end.offset()))); } // N.B. I have tentatively convinced myself that it isn't necessary // to specify the specific pattern for the reverse search since the // reverse search will always find the same pattern to match as the // forward search. But I lack a rigorous proof. Why not just provide // the pattern anyway? Well, if it is needed, then leaving it out // gives us a chance to find a witness. (Also, if we don't need to // specify the pattern, then we don't need to build the reverse DFA // with 'starts_for_each_pattern' enabled. It doesn't matter too much // for the lazy DFA, but does make the overall DFA bigger.) // // We also need to be careful to disable 'earliest' for the reverse // search, since it could be enabled for the forward search. In the // reverse case, to satisfy "leftmost" criteria, we need to match as // much as we can. We also need to be careful to make the search // anchored. We don't want the reverse search to report any matches // other than the one beginning at the end of our forward search. let match_range = input.start()..end.offset(); let start = input.with(|mut revsearch| { revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false); try_search_rev(regex.reverse(), revsearch) }); let start = start?.expect("reverse search must match if forward search does"); debug_assert_eq!( start.pattern(), end.pattern(), "forward and reverse search must match same pattern", ); debug_assert!(start.offset() <= end.offset()); debug_assert!(end.offset() <= input.end()); debug_assert!(input.start() <= start.offset()); Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) } /// An iterator over all non-overlapping matches for an infallible search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// If the underlying regex engine returns an error, then a panic occurs. /// /// This iterator can be created with the [`Regex::find_iter`] method. #[derive(Debug)] pub struct FindMatches<'r, C: Cursor> { re: &'r Regex, it: iter::Searcher, } impl<'r, C: Cursor> Iterator for FindMatches<'r, C> { type Item = Match; #[inline] fn next(&mut self) -> Option { let FindMatches { re, ref mut it } = *self; it.advance(|input| try_search(re, input)) } } regex-cursor-0.1.4/src/engines/hybrid/search.rs000064400000000000000000001203621046102023000175720ustar 00000000000000use regex_automata::hybrid::dfa::{Cache, DFA}; use regex_automata::hybrid::{LazyStateID, StartError}; use regex_automata::util::prefilter::Prefilter; use regex_automata::util::start; use regex_automata::{HalfMatch, MatchError}; use crate::cursor::Cursor; use crate::input::Input; use crate::literal; use crate::util::empty::{skip_splits_fwd, skip_splits_rev}; /// Executes a forward search and returns the end position of the leftmost /// match that is found. If no match exists, then `None` is returned. /// /// In particular, this method continues searching even after it enters /// a match state. The search only terminates once it has reached the /// end of the input or when it has entered a dead or quit state. Upon /// termination, the position of the last byte seen while still in a match /// state is returned. /// /// # Errors /// /// This routine errors if the search could not complete. 
This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example /// /// This example shows how to run a basic search. /// /// ``` /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// let dfa = DFA::new("foo[0-9]+")?; /// let mut cache = dfa.create_cache(); /// let expected = HalfMatch::must(0, 8); /// assert_eq!(Some(expected), dfa.try_search_fwd( /// &mut cache, &mut Input::new("foo12345"))?, /// ); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over later parts. /// let dfa = DFA::new("abc|a")?; /// let mut cache = dfa.create_cache(); /// let expected = HalfMatch::must(0, 3); /// assert_eq!(Some(expected), dfa.try_search_fwd( /// &mut cache, &mut Input::new("abc"))?, /// ); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specific pattern search /// /// This example shows how to build a lazy multi-DFA that permits searching /// for specific patterns. /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// Anchored, HalfMatch, PatternID, Input, /// }; /// /// let dfa = DFA::builder() /// .configure(DFA::config().starts_for_each_pattern(true)) /// .build_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; /// let mut cache = dfa.create_cache(); /// let haystack = "foo123"; /// /// // Since we are using the default leftmost-first match and both /// // patterns match at the same starting position, only the first pattern /// // will be returned in this case when doing a search for any of the /// // patterns. /// let expected = Some(HalfMatch::must(0, 6)); /// let got = dfa.try_search_fwd(&mut cache, &mut Input::new(haystack))?; /// assert_eq!(expected, got); /// /// // But if we want to check whether some other pattern matches, then we /// // can provide its pattern ID. /// let expected = Some(HalfMatch::must(1, 6)); /// let input = Input::new(haystack) /// .anchored(Anchored::Pattern(PatternID::must(1))); /// let got = dfa.try_search_fwd(&mut cache, &input)?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specifying the bounds of a search /// /// This example shows how providing the bounds of a search can produce /// different results than simply sub-slicing the haystack. /// /// ``` /// use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input}; /// /// // N.B. We disable Unicode here so that we use a simple ASCII word /// // boundary. Alternatively, we could enable heuristic support for /// // Unicode word boundaries since our haystack is pure ASCII. 
/// let dfa = DFA::new(r"(?-u)\b[0-9]{3}\b")?; /// let mut cache = dfa.create_cache(); /// let haystack = "foo123bar"; /// /// // Since we sub-slice the haystack, the search doesn't know about the /// // larger context and assumes that `123` is surrounded by word /// // boundaries. And of course, the match position is reported relative /// // to the sub-slice as well, which means we get `3` instead of `6`. /// let expected = Some(HalfMatch::must(0, 3)); /// let got = dfa.try_search_fwd( /// &mut cache, /// &mut Input::new(&haystack[3..6]), /// )?; /// assert_eq!(expected, got); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. (And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let expected = None; /// let got = dfa.try_search_fwd( /// &mut cache, /// &mut Input::new(haystack).range(3..6), /// )?; /// assert_eq!(expected, got); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_search_fwd( dfa: &DFA, cache: &mut Cache, input: &mut Input, ) -> Result, MatchError> { let utf8empty = dfa.get_nfa().has_empty() && dfa.get_nfa().is_utf8(); let hm = match find_fwd(dfa, cache, input)? { None => return Ok(None), Some(hm) if !utf8empty => return Ok(Some(hm)), Some(hm) => hm, }; // We get to this point when we know our DFA can match the empty string // AND when UTF-8 mode is enabled. In this case, we skip any matches // whose offset splits a codepoint. Such a match is necessarily a // zero-width match, because UTF-8 mode requires the underlying NFA // to be built such that all non-empty matches span valid UTF-8. // Therefore, any match that ends in the middle of a codepoint cannot // be part of a span of valid UTF-8 and thus must be an empty match. // In such cases, we skip it, so as not to report matches that split a // codepoint. // // Note that this is not a checked assumption. Callers *can* provide an // NFA with UTF-8 mode enabled but produces non-empty matches that span // invalid UTF-8. But doing so is documented to result in unspecified // behavior. skip_splits_fwd(input, hm, hm.offset(), |input| { let got = find_fwd(dfa, cache, input)?; Ok(got.map(|hm| (hm, hm.offset()))) }) } /// Executes a reverse search and returns the start of the position of the /// leftmost match that is found. If no match exists, then `None` is /// returned. /// /// # Errors /// /// This routine errors if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. /// For example, setting quit bytes or enabling heuristic support for /// Unicode word boundaries. The default configuration does not enable any /// option that could result in the lazy DFA quitting. /// * The configuration of the lazy DFA may also permit it to "give up" /// on a search if it makes ineffective use of its transition table /// cache. The default configuration does not enable this by default, /// although it is typically a good idea to. /// * When the provided `Input` configuration is not supported. For /// example, by providing an unsupported anchor mode. /// /// When a search returns an error, callers cannot know whether a match /// exists or not. /// /// # Example /// /// This routine is principally useful when used in /// conjunction with the /// [`nfa::thompson::Config::reverse`](crate::nfa::thompson::Config::reverse) /// configuration. 
In general, it's unlikely to be correct to use both /// `try_search_fwd` and `try_search_rev` with the same DFA since any /// particular DFA will only support searching in one direction with /// respect to the pattern. /// /// ``` /// use regex_automata::{ /// nfa::thompson, /// hybrid::dfa::DFA, /// HalfMatch, Input, /// }; /// /// let dfa = DFA::builder() /// .thompson(thompson::Config::new().reverse(true)) /// .build("foo[0-9]+")?; /// let mut cache = dfa.create_cache(); /// let expected = HalfMatch::must(0, 0); /// assert_eq!( /// Some(expected), /// dfa.try_search_rev(&mut cache, &mut Input::new("foo12345"))?, /// ); /// /// // Even though a match is found after reading the last byte (`c`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. /// let dfa = DFA::builder() /// .thompson(thompson::Config::new().reverse(true)) /// .build("abc|c")?; /// let mut cache = dfa.create_cache(); /// let expected = HalfMatch::must(0, 0); /// assert_eq!(Some(expected), dfa.try_search_rev( /// &mut cache, &mut Input::new("abc"))?, /// ); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: UTF-8 mode /// /// This examples demonstrates that UTF-8 mode applies to reverse /// DFAs. When UTF-8 mode is enabled in the underlying NFA, then all /// matches reported must correspond to valid UTF-8 spans. This includes /// prohibiting zero-width matches that split a codepoint. /// /// UTF-8 mode is enabled by default. Notice below how the only zero-width /// matches reported are those at UTF-8 boundaries: /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// nfa::thompson, /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() /// .thompson(thompson::Config::new().reverse(true)) /// .build(r"")?; /// let mut cache = dfa.create_cache(); /// /// // Run the reverse DFA to collect all matches. /// let mut input = Input::new("☃"); /// let mut matches = vec![]; /// loop { /// match dfa.try_search_rev(&mut cache, &input)? { /// None => break, /// Some(hm) => { /// matches.push(hm); /// if hm.offset() == 0 || input.end() == 0 { /// break; /// } else if hm.offset() < input.end() { /// input.set_end(hm.offset()); /// } else { /// // This is only necessary to handle zero-width /// // matches, which of course occur in this example. /// // Without this, the search would never advance /// // backwards beyond the initial match. /// input.set_end(input.end() - 1); /// } /// } /// } /// } /// /// // No matches split a codepoint. /// let expected = vec![ /// HalfMatch::must(0, 3), /// HalfMatch::must(0, 0), /// ]; /// assert_eq!(expected, matches); /// /// # Ok::<(), Box>(()) /// ``` /// /// Now let's look at the same example, but with UTF-8 mode on the /// underlying NFA disabled: /// /// ``` /// use regex_automata::{ /// hybrid::dfa::DFA, /// nfa::thompson, /// HalfMatch, Input, MatchKind, /// }; /// /// let dfa = DFA::builder() /// .thompson(thompson::Config::new().reverse(true).utf8(false)) /// .build(r"")?; /// let mut cache = dfa.create_cache(); /// /// // Run the reverse DFA to collect all matches. /// let mut input = Input::new("☃"); /// let mut matches = vec![]; /// loop { /// match dfa.try_search_rev(&mut cache, &input)? 
{ /// None => break, /// Some(hm) => { /// matches.push(hm); /// if hm.offset() == 0 || input.end() == 0 { /// break; /// } else if hm.offset() < input.end() { /// input.set_end(hm.offset()); /// } else { /// // This is only necessary to handle zero-width /// // matches, which of course occur in this example. /// // Without this, the search would never advance /// // backwards beyond the initial match. /// input.set_end(input.end() - 1); /// } /// } /// } /// } /// /// // No matches split a codepoint. /// let expected = vec![ /// HalfMatch::must(0, 3), /// HalfMatch::must(0, 2), /// HalfMatch::must(0, 1), /// HalfMatch::must(0, 0), /// ]; /// assert_eq!(expected, matches); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn try_search_rev( dfa: &DFA, cache: &mut Cache, input: &mut Input, ) -> Result, MatchError> { let utf8empty = dfa.get_nfa().has_empty() && dfa.get_nfa().is_utf8(); let hm = match find_rev(dfa, cache, input)? { None => return Ok(None), Some(hm) if !utf8empty => return Ok(Some(hm)), Some(hm) => hm, }; skip_splits_rev(input, hm, hm.offset(), |input| { let got = find_rev(dfa, cache, input)?; Ok(got.map(|hm| (hm, hm.offset()))) }) } #[inline(never)] pub(crate) fn find_fwd( dfa: &DFA, cache: &mut Cache, input: &mut Input, ) -> Result, MatchError> { input.move_to(input.start()); if input.is_done() { return Ok(None); } let pre = if input.get_anchored().is_anchored() { None } else { dfa.get_config().get_prefilter() }; // So what we do here is specialize four different versions of 'find_fwd': // one for each of the combinations for 'has prefilter' and 'is earliest // search'. The reason for doing this is that both of these things require // branches and special handling in some code that can be very hot, // and shaving off as much as we can when we don't need it tends to be // beneficial in ad hoc benchmarks. To see these differences, you often // need a query with a high match count. In other words, specializing these // four routines *tends* to help latency more than throughput. if pre.is_some() { if input.get_earliest() { find_fwd_imp(dfa, cache, input, pre, true) } else { find_fwd_imp(dfa, cache, input, pre, false) } } else if input.get_earliest() { find_fwd_imp(dfa, cache, input, None, true) } else { find_fwd_imp(dfa, cache, input, None, false) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_fwd_imp( dfa: &DFA, cache: &mut Cache, input: &mut Input, pre: Option<&'_ Prefilter>, earliest: bool, ) -> Result, MatchError> { // See 'prefilter_restart' docs for explanation. let universal_start = dfa.get_nfa().look_set_prefix_any().is_empty(); let mut mat = None; let mut sid = init_fwd(dfa, cache, input)?; if let Some(pre) = pre { // If a prefilter doesn't report false positives, then we don't need to // touch the DFA at all. However, since all matches include the pattern // ID, and the prefilter infrastructure doesn't report pattern IDs, we // limit this optimization to cases where there is exactly one pattern. // In that case, any match must be the 0th pattern. match literal::find(pre, input) { None => return Ok(mat), Some(ref span) => { input.move_to(span.start); if !universal_start { sid = prefilter_restart(dfa, cache, input)?; } } } } // This could just be a closure, but then I think it would be unsound // because it would need to be safe to invoke. This way, the lack of safety // is clearer in the code below. macro_rules! 
next_unchecked { ($sid:expr) => {{ debug_assert!(input.chunk_pos() < input.chunk().len()); let byte = *input.chunk().get_unchecked(input.chunk_pos()); dfa.next_state_untagged_unchecked(cache, $sid, byte) }}; } macro_rules! ensure_chunk { () => {{ if input.chunk_pos() >= input.chunk().len() && !input.advance() { break; } }}; } cache.search_start(input.at()); while input.at() < input.end() { if sid.is_tagged() { ensure_chunk!(); cache.search_update(input.at()); sid = dfa .next_state(cache, sid, input.chunk()[input.chunk_pos]) .map_err(|_| gave_up(input.at()))?; } else { // SAFETY: There are two safety invariants we need to uphold // here in the loops below: that 'sid' and 'prev_sid' are valid // state IDs for this DFA, and that 'at' is a valid index into // 'haystack'. For the former, we rely on the invariant that // next_state* and start_state_forward always returns a valid state // ID (given a valid state ID in the former case), and that we are // only at this place in the code if 'sid' is untagged. Moreover, // every call to next_state_untagged_unchecked below is guarded by // a check that sid is untagged. For the latter safety invariant, // we always guard unchecked access with a check that 'at' is less // than 'end', where 'end <= haystack.len()'. In the unrolled loop // below, we ensure that 'at' is always in bounds. // // PERF: For justification of omitting bounds checks, it gives us a // ~10% bump in search time. This was used for a benchmark: // // regex-cli find hybrid dfa @bigfile '(?m)^.+$' -UBb // // PERF: For justification for the loop unrolling, we use a few // different tests: // // regex-cli find hybrid dfa @$bigfile '\w{50}' -UBb // regex-cli find hybrid dfa @$bigfile '(?m)^.+$' -UBb // regex-cli find hybrid dfa @$bigfile 'ZQZQZQZQ' -UBb // // And there are three different configurations: // // nounroll: this entire 'else' block vanishes and we just // always use 'dfa.next_state(..)'. // unroll1: just the outer loop below // unroll2: just the inner loop below // unroll3: both the outer and inner loops below // // This results in a matrix of timings for each of the above // regexes with each of the above unrolling configurations: // // '\w{50}' '(?m)^.+$' 'ZQZQZQZQ' // nounroll 1.51s 2.34s 1.51s // unroll1 1.53s 2.32s 1.56s // unroll2 2.22s 1.50s 0.61s // unroll3 1.67s 1.45s 0.61s // // Ideally we'd be able to find a configuration that yields the // best time for all regexes, but alas we settle for unroll3 that // gives us *almost* the best for '\w{50}' and the best for the // other two regexes. // // So what exactly is going on here? The first unrolling (grouping // together runs of untagged transitions) specifically targets // our choice of representation. The second unrolling (grouping // together runs of self-transitions) specifically targets a common // DFA topology. Let's dig in a little bit by looking at our // regexes: // // '\w{50}': This regex spends a lot of time outside of the DFA's // start state matching some part of the '\w' repetition. This // means that it's a bit of a worst case for loop unrolling that // targets self-transitions since the self-transitions in '\w{50}' // are not particularly active for this haystack. However, the // first unrolling (grouping together untagged transitions) // does apply quite well here since very few transitions hit // match/dead/quit/unknown states. 
It is however worth mentioning // that if start states are configured to be tagged (which you // typically want to do if you have a prefilter), then this regex // actually slows way down because it is constantly ping-ponging // out of the unrolled loop and into the handling of a tagged start // state below. But when start states aren't tagged, the unrolled // loop stays hot. (This is why it's imperative that start state // tagging be disabled when there isn't a prefilter!) // // '(?m)^.+$': There are two important aspects of this regex: 1) // on this haystack, its match count is very high, much higher // than the other two regex and 2) it spends the vast majority // of its time matching '.+'. Since Unicode mode is disabled, // this corresponds to repeatedly following self transitions for // the vast majority of the input. This does benefit from the // untagged unrolling since most of the transitions will be to // untagged states, but the untagged unrolling does more work than // what is actually required. Namely, it has to keep track of the // previous and next state IDs, which I guess requires a bit more // shuffling. This is supported by the fact that nounroll+unroll1 // are both slower than unroll2+unroll3, where the latter has a // loop unrolling that specifically targets self-transitions. // // 'ZQZQZQZQ': This one is very similar to '(?m)^.+$' because it // spends the vast majority of its time in self-transitions for // the (implicit) unanchored prefix. The main difference with // '(?m)^.+$' is that it has a much lower match count. So there // isn't much time spent in the overhead of reporting matches. This // is the primary explainer in the perf difference here. We include // this regex and the former to make sure we have comparison points // with high and low match counts. // // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'. // // NOTE: In a follow-up, it turns out that the "inner" loop // mentioned above was a pretty big pessimization in some other // cases. Namely, it resulted in too much ping-ponging into and out // of the loop, which resulted in nearly ~2x regressions in search // time when compared to the originaly lazy DFA in the regex crate. // So I've removed the second loop unrolling that targets the // self-transition case. let mut prev_sid = sid; while input.at() < input.end() { ensure_chunk!(); prev_sid = unsafe { next_unchecked!(sid) }; if prev_sid.is_tagged() || input.at() + 3 >= input.end() { core::mem::swap(&mut prev_sid, &mut sid); break; } input.chunk_pos += 1; if input.chunk_pos + 3 >= input.chunk().len() { core::mem::swap(&mut prev_sid, &mut sid); continue; } sid = unsafe { next_unchecked!(prev_sid) }; if sid.is_tagged() { break; } input.chunk_pos += 1; prev_sid = unsafe { next_unchecked!(sid) }; if prev_sid.is_tagged() { core::mem::swap(&mut prev_sid, &mut sid); break; } input.chunk_pos += 1; sid = unsafe { next_unchecked!(prev_sid) }; if sid.is_tagged() { break; } input.chunk_pos += 1; } // If we quit out of the code above with an unknown state ID at // any point, then we need to re-compute that transition using // 'next_state', which will do NFA powerset construction for us. 
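            // (Invariant worth spelling out: on every exit from the unrolled
            // loop above, `sid` holds the most recently computed, possibly
            // unknown, state while `prev_sid` holds the state it was reached
            // from. That is exactly what the checked `next_state` call below
            // needs in order to replay the failed transition.)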
if sid.is_unknown() { cache.search_update(input.at()); sid = dfa .next_state(cache, prev_sid, input.chunk()[input.chunk_pos]) .map_err(|_| gave_up(input.at()))?; } } if sid.is_tagged() { if sid.is_start() { if let Some(pre) = pre { let old_pos = input.at(); match literal::find(pre, input) { None => { cache.search_finish(input.at()); return Ok(mat); } Some(ref span) => { // We want to skip any update to 'at' below // at the end of this iteration and just // jump immediately back to the next state // transition at the leading position of the // candidate match. // // ... but only if we actually made progress // with our prefilter, otherwise if the start // state has a self-loop, we can get stuck. if span.start > old_pos { input.move_to(span.start); if !universal_start { sid = prefilter_restart(dfa, cache, input)?; } continue; } else if input.at() != old_pos { // the prefilter may need to do some scan ahead input.move_to(old_pos); } } } } } else if sid.is_match() { let pattern = dfa.match_pattern(cache, sid, 0); // Since slice ranges are inclusive at the beginning and // exclusive at the end, and since forward searches report // the end, we can return 'at' as-is. This only works because // matches are delayed by 1 byte. So by the time we observe a // match, 'at' has already been set to 1 byte past the actual // match location, which is precisely the exclusive ending // bound of the match. mat = Some(HalfMatch::new(pattern, input.at())); if earliest { cache.search_finish(input.at()); return Ok(mat); } } else if sid.is_dead() { cache.search_finish(input.at()); return Ok(mat); } else if sid.is_quit() { cache.search_finish(input.at()); return Err(MatchError::quit(input.chunk()[input.chunk_pos], input.at())); } else { debug_assert!(sid.is_unknown()); unreachable!("sid being unknown is a bug"); } } input.chunk_pos += 1; } eoi_fwd(dfa, cache, input, &mut sid, &mut mat)?; cache.search_finish(input.end()); Ok(mat) } #[inline(never)] pub(crate) fn find_rev( dfa: &DFA, cache: &mut Cache, input: &mut Input, ) -> Result, MatchError> { input.move_to(input.end()); if input.is_done() { return Ok(None); } if input.get_earliest() { find_rev_imp(dfa, cache, input, true) } else { find_rev_imp(dfa, cache, input, false) } } #[cfg_attr(feature = "perf-inline", inline(always))] fn find_rev_imp( dfa: &DFA, cache: &mut Cache, input: &mut Input, earliest: bool, ) -> Result, MatchError> { let mut mat = None; let mut sid = init_rev(dfa, cache, input)?; // In reverse search, the loop below can't handle the case of searching an // empty slice. Ideally we could write something congruent to the forward // search, i.e., 'while at >= start', but 'start' might be 0. Since we use // an unsigned offset, 'at >= 0' is trivially always true. We could avoid // this extra case handling by using a signed offset, but Rust makes it // annoying to do. So... We just handle the empty case separately. if input.start() == input.end() || input.chunk_pos == 0 && !input.backtrack() { eoi_rev(dfa, cache, input, &mut sid, &mut mat)?; return Ok(mat); } input.chunk_pos -= 1; // This could just be a closure, but then I think it would be unsound // because it would need to be safe to invoke. This way, the lack of safety // is clearer in the code below. macro_rules! next_unchecked { ($sid:expr) => {{ let byte = *input.chunk().get_unchecked(input.chunk_pos); dfa.next_state_untagged_unchecked(cache, $sid, byte) }}; } #[rustfmt::skip] macro_rules! 
ensure_chunk { () => { if input.chunk_pos == 0 && !input.backtrack() { break; } }; } cache.search_start(input.at()); loop { if sid.is_tagged() { cache.search_update(input.at()); sid = dfa .next_state(cache, sid, input.chunk()[input.chunk_pos]) .map_err(|_| gave_up(input.at()))?; } else { // SAFETY: See comments in 'find_fwd' for a safety argument. // // PERF: The comments in 'find_fwd' also provide a justification // from a performance perspective as to 1) why we elide bounds // checks and 2) why we do a specialized version of unrolling // below. The reverse search does have a slightly different // consideration in that most reverse searches tend to be // anchored and on shorter haystacks. However, this still makes a // difference. Take this command for example: // // regex-cli find hybrid regex @$bigfile '(?m)^.+$' -UBb // // (Notice that we use 'find hybrid regex', not 'find hybrid dfa' // like in the justification for the forward direction. The 'regex' // sub-command will find start-of-match and thus run the reverse // direction.) // // Without unrolling below, the above command takes around 3.76s. // But with the unrolling below, we get down to 2.55s. If we keep // the unrolling but add in bounds checks, then we get 2.86s. // // NOTE: I used 'OpenSubtitles2018.raw.sample.en' for 'bigfile'. let mut prev_sid = sid; while input.at() >= input.start() { prev_sid = unsafe { next_unchecked!(sid) }; if prev_sid.is_tagged() || input.at() <= input.start().saturating_add(3) { core::mem::swap(&mut prev_sid, &mut sid); break; } ensure_chunk!(); input.chunk_pos -= 1; if input.chunk_pos <= 2 { core::mem::swap(&mut prev_sid, &mut sid); continue; } sid = unsafe { next_unchecked!(prev_sid) }; if sid.is_tagged() { break; } input.chunk_pos -= 1; prev_sid = unsafe { next_unchecked!(sid) }; if prev_sid.is_tagged() { core::mem::swap(&mut prev_sid, &mut sid); break; } input.chunk_pos -= 1; sid = unsafe { next_unchecked!(prev_sid) }; if sid.is_tagged() { break; } input.chunk_pos -= 1; } // If we quit out of the code above with an unknown state ID at // any point, then we need to re-compute that transition using // 'next_state', which will do NFA powerset construction for us. if sid.is_unknown() { cache.search_update(input.at()); sid = dfa .next_state(cache, prev_sid, input.chunk()[input.chunk_pos]) .map_err(|_| gave_up(input.at()))?; } } if sid.is_tagged() { if sid.is_start() { // do nothing } else if sid.is_match() { let pattern = dfa.match_pattern(cache, sid, 0); // Since reverse searches report the beginning of a match // and the beginning is inclusive (not exclusive like the // end of a match), we add 1 to make it inclusive. 
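                // (This mirrors the forward direction, where matches are
                // delayed by one byte: when a match state is observed here,
                // `at` still points one byte before the true match start.)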
                mat = Some(HalfMatch::new(pattern, input.at() + 1));
                if earliest {
                    cache.search_finish(input.at());
                    return Ok(mat);
                }
            } else if sid.is_dead() {
                cache.search_finish(input.at());
                return Ok(mat);
            } else if sid.is_quit() {
                cache.search_finish(input.at());
                return Err(MatchError::quit(input.chunk()[input.chunk_pos], input.at()));
            } else {
                debug_assert!(sid.is_unknown());
                unreachable!("sid being unknown is a bug");
            }
        }
        if input.at() <= input.start() {
            break;
        }
        ensure_chunk!();
        input.chunk_pos -= 1;
    }
    cache.search_finish(input.start());
    eoi_rev(dfa, cache, input, &mut sid, &mut mat)?;
    Ok(mat)
}

#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_fwd<C: Cursor>(
    dfa: &DFA,
    cache: &mut Cache,
    input: &mut Input<C>,
) -> Result<LazyStateID, MatchError> {
    let look_behind = input.ensure_look_behind();
    let start_config = start::Config::new().look_behind(look_behind).anchored(input.get_anchored());
    // let sid = dfa.start_state(&start_config)?;
    dfa.start_state(cache, &start_config).map_err(|err| match err {
        StartError::Quit { byte } => {
            let offset = input.at().checked_sub(1).expect("no quit in start without look-behind");
            MatchError::quit(byte, offset)
        }
        StartError::UnsupportedAnchored { mode } => MatchError::unsupported_anchored(mode),
        StartError::Cache { .. } => MatchError::gave_up(input.end()),
        _ => panic!("damn forward compatibility"),
    })
}

#[cfg_attr(feature = "perf-inline", inline(always))]
fn init_rev<C: Cursor>(
    dfa: &DFA,
    cache: &mut Cache,
    input: &mut Input<C>,
) -> Result<LazyStateID, MatchError> {
    let chunk_pos = input.chunk_pos();
    let mut look_ahead = input.chunk().get(chunk_pos).copied();
    // This branch is probably not needed since chunk_pos should be in bounds
    // anyway, but I would rather not make that a validity invariant.
    if look_ahead.is_none() && input.advance() {
        look_ahead = input.chunk().first().copied();
        input.backtrack();
    }
    let start_config = start::Config::new().look_behind(look_ahead).anchored(input.get_anchored());
    dfa.start_state(cache, &start_config).map_err(|err| match err {
        StartError::Quit { byte } => {
            let offset = input.start().checked_sub(1).expect("no quit in start without look-behind");
            MatchError::quit(byte, offset)
        }
        StartError::UnsupportedAnchored { mode } => MatchError::unsupported_anchored(mode),
        StartError::Cache { .. } => MatchError::gave_up(input.end()),
        _ => panic!("damn forward compatibility"),
    })
}

#[cfg_attr(feature = "perf-inline", inline(always))]
fn eoi_fwd<C: Cursor>(
    dfa: &DFA,
    cache: &mut Cache,
    input: &mut Input<C>,
    sid: &mut LazyStateID,
    mat: &mut Option<HalfMatch>,
) -> Result<(), MatchError> {
    let sp = input.get_span();
    input.move_to(sp.end);
    match input.chunk().get(sp.end - input.chunk_offset()) {
        Some(&b) => {
            *sid = dfa.next_state(cache, *sid, b).map_err(|_| gave_up(sp.end))?;
            if sid.is_match() {
                let pattern = dfa.match_pattern(cache, *sid, 0);
                *mat = Some(HalfMatch::new(pattern, sp.end));
            } else if sid.is_quit() {
                return Err(MatchError::quit(b, sp.end));
            }
        }
        None => {
            *sid = dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.end))?;
            if sid.is_match() {
                let pattern = dfa.match_pattern(cache, *sid, 0);
                *mat = Some(HalfMatch::new(pattern, sp.end));
            }
            // N.B. We don't have to check 'is_quit' here because the EOI
            // transition can never lead to a quit state.
debug_assert!(!sid.is_quit()); } } Ok(()) } #[cfg_attr(feature = "perf-inline", inline(always))] fn eoi_rev( dfa: &DFA, cache: &mut Cache, input: &mut Input, sid: &mut LazyStateID, mat: &mut Option, ) -> Result<(), MatchError> { let sp = input.get_span(); // debug_assert_eq!(sp.start, 0); if sp.start > 0 { input.move_to(input.start() - 1); let byte = input.chunk()[sp.start - input.chunk_offset() - 1]; *sid = dfa.next_state(cache, *sid, byte).map_err(|_| gave_up(sp.start))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, sp.start)); } else if sid.is_quit() { return Err(MatchError::quit(byte, sp.start - 1)); } } else { *sid = dfa.next_eoi_state(cache, *sid).map_err(|_| gave_up(sp.start))?; if sid.is_match() { let pattern = dfa.match_pattern(cache, *sid, 0); *mat = Some(HalfMatch::new(pattern, 0)); } // N.B. We don't have to check 'is_quit' here because the EOI // transition can never lead to a quit state. debug_assert!(!sid.is_quit()); } Ok(()) } // /// Re-compute the starting state that a DFA should be in after finding a // /// prefilter candidate match at the position `at`. // /// // /// It is always correct to call this, but not always necessary. Namely, // /// whenever the DFA has a universal start state, the DFA can remain in the // /// start state that it was in when it ran the prefilter. Why? Because in that // /// case, there is only one start state. // /// // /// When does a DFA have a universal start state? In precisely cases where // /// it has no look-around assertions in its prefix. So for example, `\bfoo` // /// does not have a universal start state because the start state depends on // /// whether the byte immediately before the start position is a word byte or // /// not. However, `foo\b` does have a universal start state because the word // /// boundary does not appear in the pattern's prefix. // /// // /// So... most cases don't need this, but when a pattern doesn't have a // /// universal start state, then after a prefilter candidate has been found, the // /// current state *must* be re-litigated as if computing the start state at the // /// beginning of the search because it might change. That is, not all start // /// states are created equal. // /// // /// Why avoid it? Because while it's not super expensive, it isn't a trivial // /// operation to compute the start state. It is much better to avoid it and // /// just state in the current state if you know it to be correct. // #[cfg_attr(feature = "perf-inline", inline(always))] // fn prefilter_restart<'h>( // dfa: &DFA, // cache: &mut Cache, // input: &mut Input<'h>, // at: usize, // ) -> Result { // let mut input = input.clone(); // input.set_start(at); // init_fwd(dfa, cache, &input) // } /// A convenience routine for constructing a "gave up" match error. #[cfg_attr(feature = "perf-inline", inline(always))] fn gave_up(offset: usize) -> MatchError { MatchError::gave_up(offset) } /// Re-compute the starting state that a DFA should be in after finding a /// prefilter candidate match at the position `at`. /// /// The function with the same name has a bit more docs in hybrid/search.rs. 
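/// In this file, those docs live on the commented-out copy of this function
/// just above. The short version: this re-computation is only required when
/// the DFA lacks a universal start state, i.e. when the pattern's prefix
/// contains a look-around assertion such as `\b`.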
#[cfg_attr(feature = "perf-inline", inline(always))] fn prefilter_restart( dfa: &DFA, cache: &mut Cache, input: &mut Input, ) -> Result { init_fwd(dfa, cache, input) } regex-cursor-0.1.4/src/engines/hybrid/test.rs000064400000000000000000000046021046102023000173020ustar 00000000000000use proptest::proptest; use crate::engines::hybrid::find_iter; use crate::input::Input; #[test] fn searcher() { let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap(); let regex = super::Regex::builder() .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) .build("vec") .unwrap(); let mut cache = regex.create_cache(); let rope = ropey::Rope::from_str(&text); let matches: Vec<_> = find_iter(®ex, &mut cache, Input::new(&rope)) .map(|range| rope.byte_slice(range.range())) .collect(); assert_eq!(matches.len(), 68); } #[test] fn anchor() { let haystack = ":a"; let needle = "$|:"; let foo = ropey::Rope::from_str(haystack); let regex = super::Regex::builder() .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true).unicode(false)) .build(needle) .unwrap(); let mut cache1 = regex.create_cache(); let mut cache2 = regex.create_cache(); let iter1: Vec<_> = regex.find_iter(&mut cache1, haystack).collect(); let iter2: Vec<_> = find_iter(®ex, &mut cache2, Input::new(&foo)).collect(); assert_eq!(iter1, iter2); } #[test] fn hotloop_transition() { let haystack = "Σ /ⶠaAA ﷏00AAΣ/എ"; let needle = "/"; let foo = ropey::Rope::from_str(haystack); let regex = super::Regex::builder() .syntax(regex_automata::util::syntax::Config::new().case_insensitive(true)) .build(needle) .unwrap(); let mut cache1 = regex.create_cache(); let mut cache2 = regex.create_cache(); let iter1: Vec<_> = regex.find_iter(&mut cache1, haystack).collect(); let iter2: Vec<_> = find_iter(®ex, &mut cache2, Input::new(&foo)).collect(); assert_eq!(iter1, iter2); } proptest! { #[test] fn matches(mut haystack: String, needle: String) { haystack = haystack.repeat(1024); let foo = ropey::Rope::from_str(&haystack); let Ok(regex) = super::Regex::builder() .syntax(regex_automata::util::syntax::Config::new() .case_insensitive(true) ) .build(&needle) else { return Ok(()) }; let mut cache1 = regex.create_cache(); let mut cache2 = regex.create_cache(); let iter1 = regex.find_iter(&mut cache1, &haystack); let iter2 = find_iter(®ex, &mut cache2, Input::new(&foo)); crate::util::iter::prop_assert_eq(iter1, iter2)?; } } regex-cursor-0.1.4/src/engines/hybrid.rs000064400000000000000000000232311046102023000163220ustar 00000000000000pub use regex_automata::hybrid::regex::{Cache, Regex}; use regex_automata::{Anchored, Match, MatchError}; use crate::cursor::Cursor; use crate::input::Input; use crate::util::iter; pub use crate::engines::hybrid::search::{try_search_fwd, try_search_rev}; mod search; #[cfg(test)] mod test; /// Returns true if either the given input specifies an anchored search /// or if the underlying NFA is always anchored. fn is_anchored(regex: &Regex, input: &Input) -> bool { match input.get_anchored() { Anchored::No => regex.forward().get_nfa().is_always_start_anchored(), Anchored::Yes | Anchored::Pattern(_) => true, } } /// Returns an iterator over all non-overlapping leftmost matches in the /// given bytes. If no match exists, then the iterator yields no elements. /// /// # Panics /// /// This routine panics if the search could not complete. This can occur /// in a number of circumstances: /// /// * The configuration of the lazy DFA may permit it to "quit" the search. 
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// The above conditions also apply to the iterator returned as well. For
/// example, if the lazy DFA gives up or quits during a search using this
/// method, then a panic will occur during iteration.
///
/// Use [`Regex::try_search`] with [`util::iter::Searcher`](iter::Searcher)
/// if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::{hybrid::regex::Regex, Match};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
/// let text = "foo1 foo12 foo123";
/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
/// assert_eq!(matches, vec![
///     Match::must(0, 0..4),
///     Match::must(0, 5..10),
///     Match::must(0, 11..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[inline]
pub fn find_iter<'r, 'c, C: Cursor>(
    regex: &'r Regex,
    cache: &'c mut Cache,
    input: Input<C>,
) -> FindMatches<'r, 'c, C> {
    let it = iter::Searcher::new(input);
    FindMatches { re: regex, cache, it }
}

/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// # Panics
///
/// This routine panics if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search panics, callers cannot know whether a match exists or
/// not.
///
/// Use [`Regex::try_search`] if you want to handle these error conditions.
///
/// # Example
///
/// ```
/// use regex_automata::{Match, hybrid::regex::Regex};
///
/// let re = Regex::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
/// assert_eq!(
///     Some(Match::must(0, 3..11)),
///     re.find(&mut cache, "zzzfoo12345zzz"),
/// );
///
/// // Even though a match is found after reading the first byte (`a`),
/// // the default leftmost-first match semantics demand that we find the
/// // earliest match that prefers earlier parts of the pattern over later
/// // parts.
/// let re = Regex::new("abc|a")?;
/// let mut cache = re.create_cache();
/// assert_eq!(Some(Match::must(0, 0..3)), re.find(&mut cache, "abc"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub fn find<C: Cursor>(regex: &Regex, cache: &mut Cache, input: &mut Input<C>) -> Option<Match> {
    try_search(regex, cache, input).unwrap()
}

/// Returns the start and end offset of the leftmost match. If no match
/// exists, then `None` is returned.
///
/// This is like [`Regex::find`] but with two differences:
///
/// 1. It is not generic over `Into<Input>` and instead accepts a
/// `&Input`. This permits reusing the same `Input` for multiple searches
/// without needing to create a new one. This _may_ help with latency.
/// 2. It returns an error if the search could not complete, whereas
/// [`Regex::find`] will panic.
///
/// # Errors
///
/// This routine errors if the search could not complete. This can occur
/// in a number of circumstances:
///
/// * The configuration of the lazy DFA may permit it to "quit" the search.
/// For example, setting quit bytes or enabling heuristic support for
/// Unicode word boundaries. The default configuration does not enable any
/// option that could result in the lazy DFA quitting.
/// * The configuration of the lazy DFA may also permit it to "give up"
/// on a search if it makes ineffective use of its transition table
/// cache. The default configuration does not enable this by default,
/// although it is typically a good idea to.
/// * When the provided `Input` configuration is not supported. For
/// example, by providing an unsupported anchor mode.
///
/// When a search returns an error, callers cannot know whether a match
/// exists or not.
pub fn try_search<C: Cursor>(
    regex: &Regex,
    cache: &mut Cache,
    input: &mut Input<C>,
) -> Result<Option<Match>, MatchError> {
    let (fcache, rcache) = cache.as_parts_mut();
    let end = match try_search_fwd(regex.forward(), fcache, input)? {
        None => return Ok(None),
        Some(end) => end,
    };
    // This special cases an empty match at the beginning of the search. If
    // our end matches our start, then since a reverse DFA can't match past
    // the start, it must follow that our starting position is also our end
    // position. So short circuit and skip the reverse search.
    if input.start() == end.offset() {
        return Ok(Some(Match::new(end.pattern(), end.offset()..end.offset())));
    }
    // We can also skip the reverse search if we know our search was
    // anchored. This occurs either when the input config is anchored or
    // when we know the regex itself is anchored. In this case, we know the
    // start of the match, if one is found, must be the start of the
    // search.
    if is_anchored(regex, input) {
        return Ok(Some(Match::new(end.pattern(), input.start()..end.offset())));
    }
    // N.B. I have tentatively convinced myself that it isn't necessary
    // to specify the specific pattern for the reverse search since the
    // reverse search will always find the same pattern to match as the
    // forward search. But I lack a rigorous proof. Why not just provide
    // the pattern anyway? Well, if it is needed, then leaving it out
    // gives us a chance to find a witness. (Also, if we don't need to
    // specify the pattern, then we don't need to build the reverse DFA
    // with 'starts_for_each_pattern' enabled. It doesn't matter too much
    // for the lazy DFA, but does make the overall DFA bigger.)
    //
    // We also need to be careful to disable 'earliest' for the reverse
    // search, since it could be enabled for the forward search. In the
    // reverse case, to satisfy "leftmost" criteria, we need to match as
    // much as we can. We also need to be careful to make the search
    // anchored. We don't want the reverse search to report any matches
    // other than the one beginning at the end of our forward search.
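    // Hence the code below: clamp the span to the region between the start
    // of the search and the end of the forward match, force an anchored
    // search, and explicitly disable 'earliest'.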
let match_range = input.start()..end.offset(); let start = input.with(|mut revsearch| { revsearch = revsearch.span(match_range).anchored(Anchored::Yes).earliest(false); try_search_rev(regex.reverse(), rcache, revsearch) }); let start = start?.expect("reverse search must match if forward search does"); debug_assert_eq!( start.pattern(), end.pattern(), "forward and reverse search must match same pattern", ); debug_assert!(start.offset() <= end.offset()); debug_assert!(end.offset() <= input.end()); debug_assert!(input.start() <= start.offset()); Ok(Some(Match::new(end.pattern(), start.offset()..end.offset()))) } /// An iterator over all non-overlapping matches for an infallible search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// If the underlying regex engine returns an error, then a panic occurs. /// /// The lifetime parameters are as follows: /// /// * `'r` represents the lifetime of the regex object. /// * `'h` represents the lifetime of the haystack being searched. /// * `'c` represents the lifetime of the regex cache. /// /// This iterator can be created with the [`Regex::find_iter`] method. #[derive(Debug)] pub struct FindMatches<'r, 'c, C: Cursor> { re: &'r Regex, cache: &'c mut Cache, it: iter::Searcher, } impl<'r, 'c, C: Cursor> Iterator for FindMatches<'r, 'c, C> { type Item = Match; #[inline] fn next(&mut self) -> Option { let FindMatches { re, ref mut cache, ref mut it } = *self; it.advance(|input| try_search(re, cache, input)) } } regex-cursor-0.1.4/src/engines/meta/error.rs000064400000000000000000000136241046102023000171250ustar 00000000000000use regex_automata::{nfa, MatchError, MatchErrorKind, PatternID}; use regex_syntax::{ast, hir}; /// An error that occurs when construction of a `Regex` fails. /// /// A build error is generally a result of one of two possible failure /// modes. First is a parse or syntax error in the concrete syntax of a /// pattern. Second is that the construction of the underlying regex matcher /// fails, usually because it gets too big with respect to limits like /// [`Config::nfa_size_limit`](crate::meta::Config::nfa_size_limit). /// /// This error provides very little introspection capabilities. You can: /// /// * Ask for the [`PatternID`] of the pattern that caused an error, if one /// is available. This is available for things like syntax errors, but not for /// cases where build limits are exceeded. /// * Ask for the underlying syntax error, but only if the error is a syntax /// error. /// * Ask for a human readable message corresponding to the underlying error. /// * The `BuildError::source` method (from the `std::error::Error` /// trait implementation) may be used to query for an underlying error if one /// exists. There are no API guarantees about which error is returned. /// /// When the `std` feature is enabled, this implements `std::error::Error`. #[derive(Clone, Debug)] pub struct BuildError { kind: BuildErrorKind, } #[derive(Clone, Debug)] enum BuildErrorKind { Syntax { pid: PatternID, err: regex_syntax::Error }, NFA(nfa::thompson::BuildError), } impl BuildError { /// If it is known which pattern ID caused this build error to occur, then /// this method returns it. /// /// Some errors are not associated with a particular pattern. However, any /// errors that occur as part of parsing a pattern are guaranteed to be /// associated with a pattern ID. 
///
/// # Example
///
/// ```
/// use regex_automata::{meta::Regex, PatternID};
///
/// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err();
/// assert_eq!(Some(PatternID::must(2)), err.pattern());
/// ```
    pub fn pattern(&self) -> Option<PatternID> {
        match self.kind {
            BuildErrorKind::Syntax { pid, .. } => Some(pid),
            _ => None,
        }
    }

    /// If this error occurred because the regex exceeded the configured size
    /// limit before being built, then this returns the configured size limit.
    ///
    /// The limit returned is what was configured, and corresponds to the
    /// maximum amount of heap usage in bytes.
    pub fn size_limit(&self) -> Option<usize> {
        match self.kind {
            BuildErrorKind::NFA(ref err) => err.size_limit(),
            _ => None,
        }
    }

    /// If this error corresponds to a syntax error, then a reference to it is
    /// returned by this method.
    pub fn syntax_error(&self) -> Option<&regex_syntax::Error> {
        match self.kind {
            BuildErrorKind::Syntax { ref err, .. } => Some(err),
            _ => None,
        }
    }

    pub(crate) fn ast(pid: PatternID, err: ast::Error) -> BuildError {
        let err = regex_syntax::Error::from(err);
        BuildError { kind: BuildErrorKind::Syntax { pid, err } }
    }

    pub(crate) fn hir(pid: PatternID, err: hir::Error) -> BuildError {
        let err = regex_syntax::Error::from(err);
        BuildError { kind: BuildErrorKind::Syntax { pid, err } }
    }

    pub(crate) fn nfa(err: nfa::thompson::BuildError) -> BuildError {
        BuildError { kind: BuildErrorKind::NFA(err) }
    }
}

impl std::error::Error for BuildError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self.kind {
            BuildErrorKind::Syntax { ref err, .. } => Some(err),
            BuildErrorKind::NFA(ref err) => Some(err),
        }
    }
}

impl core::fmt::Display for BuildError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self.kind {
            BuildErrorKind::Syntax { pid, .. } => {
                write!(f, "error parsing pattern {}", pid.as_usize())
            }
            BuildErrorKind::NFA(_) => write!(f, "error building NFA"),
        }
    }
}

/// An error that occurs when a regex engine "gives up" for some reason before
/// finishing a search. Usually this occurs because of heuristic Unicode word
/// boundary support or because of ineffective cache usage in the lazy DFA.
///
/// When this error occurs, callers should retry the regex search with a
/// different regex engine.
///
/// Note that this has convenient `From` impls that will automatically
/// convert a `MatchError` into this error. This works because the meta
/// regex engine internals guarantee that errors like `HaystackTooLong` and
/// `UnsupportedAnchored` will never occur. The only errors left are `Quit` and
/// `GaveUp`, which both correspond to this "failure" error.
#[derive(Debug)]
pub(crate) struct RetryFailError {
    offset: usize,
}

impl RetryFailError {
    pub(crate) fn from_offset(offset: usize) -> RetryFailError {
        RetryFailError { offset }
    }
}

impl std::error::Error for RetryFailError {}

impl core::fmt::Display for RetryFailError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        write!(f, "regex engine failed at offset {:?}", self.offset)
    }
}

impl From<MatchError> for RetryFailError {
    fn from(merr: MatchError) -> RetryFailError {
        use MatchErrorKind::*;

        match *merr.kind() {
            Quit { offset, .. } => RetryFailError::from_offset(offset),
            GaveUp { offset } => RetryFailError::from_offset(offset),
            // These can never occur because we avoid them by construction
            // or with higher level control flow logic. For example, the
            // backtracker's wrapper will never hand out a backtracker engine
            // when the haystack would be too long.
            _ => {
                unreachable!("found impossible error in meta engine: {}", merr)
            }
        }
    }
}
regex-cursor-0.1.4/src/engines/meta/literal.rs000064400000000000000000000062561046102023000174300ustar 00000000000000use std::{vec, vec::Vec};

use log::debug;
use regex_automata::MatchKind;
use regex_syntax::hir::Hir;

use crate::engines::meta::regex::RegexInfo;

/// Pull out an alternation of literals from the given sequence of HIR
/// expressions.
///
/// There are numerous ways for this to fail. Generally, this only applies
/// to regexes of the form 'foo|bar|baz|...|quux'. It can also fail if there
/// are "too few" alternates, in which case, the regex engine is likely faster.
///
/// And currently, this only returns something when 'hirs.len() == 1'.
pub(crate) fn alternation_literals(info: &RegexInfo, hirs: &[&Hir]) -> Option<Vec<Vec<u8>>> {
    use regex_syntax::hir::{HirKind, Literal};

    // Might as well skip the work below if we know we can't build an
    // Aho-Corasick searcher.
    if !cfg!(feature = "perf-literal-multisubstring") {
        return None;
    }

    // This is pretty hacky, but basically, if `is_alternation_literal` is
    // true, then we can make several assumptions about the structure of our
    // HIR. This is what justifies the `unreachable!` statements below.
    if hirs.len() != 1
        || !info.props()[0].look_set().is_empty()
        || info.props()[0].explicit_captures_len() > 0
        || !info.props()[0].is_alternation_literal()
        || info.config().get_match_kind() != MatchKind::LeftmostFirst
    {
        return None;
    }
    let hir = &hirs[0];
    let alts = match *hir.kind() {
        HirKind::Alternation(ref alts) => alts,
        _ => return None, // one literal isn't worth it
    };

    let mut lits = vec![];
    for alt in alts {
        let mut lit = vec![];
        match *alt.kind() {
            HirKind::Literal(Literal(ref bytes)) => lit.extend_from_slice(bytes),
            HirKind::Concat(ref exprs) => {
                for e in exprs {
                    match *e.kind() {
                        HirKind::Literal(Literal(ref bytes)) => {
                            lit.extend_from_slice(bytes);
                        }
                        _ => unreachable!("expected literal, got {:?}", e),
                    }
                }
            }
            _ => unreachable!("expected literal or concat, got {:?}", alt),
        }
        lits.push(lit);
    }
    // Why do this? Well, when the number of literals is small, it's likely
    // that we'll use the lazy DFA which is in turn likely to be faster than
    // Aho-Corasick in such cases. Primarily because Aho-Corasick doesn't have
    // a "lazy DFA" but either a contiguous NFA or a full DFA. We rarely use
    // the latter because it is so hungry (in time and space), and the former
    // is decently fast, but not as fast as a well oiled lazy DFA.
    //
    // However, once the number starts getting large, the lazy DFA is likely
    // to start thrashing because of the modest default cache size. When
    // exactly does this happen? Dunno. But at whatever point that is (we make
    // a guess below based on ad hoc benchmarking), we'll want to cut over to
    // Aho-Corasick, where even the contiguous NFA is likely to do much better.
    if lits.len() < 3000 {
        debug!("skipping Aho-Corasick because there are too few literals");
        return None;
    }
    Some(lits)
}
regex-cursor-0.1.4/src/engines/meta/mod.rs000064400000000000000000000050501046102023000165450ustar 00000000000000/*!
Provides a regex matcher that composes several other regex matchers
automatically.

This module is home to a meta [`Regex`], which provides a convenient high
level API for executing regular expressions in linear time.

# Comparison with the `regex` crate

A meta `Regex` is the implementation used directly by the `regex` crate.
Indeed, the `regex` crate API is essentially just a light wrapper over a meta
`Regex`.
This means that if you need the full flexibility offered by this API, then you
should be able to switch to using this API directly without any changes in
match semantics or syntax. However, there are some API level differences:

* The `regex` crate API returns match objects that include references to the
haystack itself, which in turn makes it easy to access the matching strings
without having to slice the haystack yourself. In contrast, a meta `Regex`
returns match objects that only have offsets in them.
* At time of writing, a meta `Regex` doesn't have some of the convenience
routines that the `regex` crate has, such as replacements. Note though that
[`Captures::interpolate_string`](crate::util::captures::Captures::interpolate_string)
will handle the replacement string interpolation for you.
* A meta `Regex` supports the [`Input`](crate::Input) abstraction, which
provides a way to configure a search in more ways than is supported by the
`regex` crate. For example, [`Input::anchored`](crate::Input::anchored) can be
used to run an anchored search, regardless of whether the pattern is itself
anchored with a `^`.
* A meta `Regex` supports multi-pattern searching everywhere. Indeed, every
[`Match`](crate::Match) returned by the search APIs includes a
[`PatternID`](crate::PatternID) indicating which pattern matched. In the
single pattern case, all matches correspond to
[`PatternID::ZERO`](crate::PatternID::ZERO). In contrast, the `regex` crate
has distinct `Regex` and `RegexSet` APIs. The former only supports a single
pattern, while the latter supports multiple patterns but cannot report the
offsets of a match.
* A meta `Regex` provides the explicit capability of bypassing its internal
memory pool for automatically acquiring mutable scratch space required by its
internal regex engines. Namely, a [`Cache`] can be explicitly provided to
lower level routines such as [`Regex::search_with`].
*/

pub use self::regex::{Builder, Cache, CapturesMatches, Config, FindMatches, Regex, Split, SplitN};
pub use regex_automata::meta::BuildError;

mod error;
// mod limited;
mod literal;
mod regex;
// mod reverse_inner;
// mod stopat;
mod strategy;
mod wrappers;
regex-cursor-0.1.4/src/engines/meta/regex.rs000064400000000000000000004172251046102023000171070ustar 00000000000000use core::{
    borrow::Borrow,
    panic::{RefUnwindSafe, UnwindSafe},
};
use std::{boxed::Box, sync::Arc, vec, vec::Vec};

use regex_automata::{
    nfa::thompson::WhichCaptures,
    util::{
        captures::{Captures, GroupInfo},
        pool::{Pool, PoolGuard},
        prefilter::Prefilter,
        primitives::NonMaxUsize,
    },
    HalfMatch, Match, MatchKind, PatternID, Span,
};
use regex_syntax::{
    ast,
    hir::{self, Hir},
};

use crate::{
    cursor::Cursor,
    engines::meta::{error::BuildError, strategy::Strategy, wrappers},
    util::iter,
    Input,
};

/// A type alias for our pool of meta::Cache that fixes the type parameters to
/// what we use for the meta regex below.
type CachePool = Pool<Cache, CachePoolFn>;

/// Same as above, but for the guard returned by a pool.
type CachePoolGuard<'a> = PoolGuard<'a, Cache, CachePoolFn>;

/// The type of the closure we use to create new caches. We need to spell out
/// all of the marker traits or else we risk leaking !MARKER impls.
type CachePoolFn = Box<dyn Fn() -> Cache + Send + Sync + UnwindSafe + RefUnwindSafe>;

/// A regex matcher that works by composing several other regex matchers
/// automatically.
///
/// In effect, a meta regex papers over a lot of the quirks or performance
/// problems in each of the regex engines in this crate.
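/// Unlike the upstream `regex-automata` meta engine, every search here is
/// driven through a [`Cursor`](crate::cursor::Cursor), so the haystack may be
/// stored in discontiguous chunks. A minimal sketch of that use case (this
/// assumes the default `ropey` feature, under which a `RopeSlice` is expected
/// to convert into a cursor via `Input::new`):
///
/// ```
/// use regex_cursor::engines::meta::Regex;
/// use regex_cursor::Input;
///
/// let rope = ropey::Rope::from_str("one of the chief characters, Samwise the Brave");
/// let re = Regex::new(r"Samwise|Sam").unwrap();
/// // The rope is searched chunk by chunk; no contiguous copy is made.
/// let m = re.find(Input::new(rope.slice(..))).unwrap();
/// assert_eq!(29..36, m.range());
/// ```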
Its goal is to provide /// an infallible and simple API that "just does the right thing" in the common /// case. /// /// A meta regex is the implementation of a `Regex` in the `regex` crate. /// Indeed, the `regex` crate API is essentially just a light wrapper over /// this type. This includes the `regex` crate's `RegexSet` API! /// /// # Composition /// /// This is called a "meta" matcher precisely because it uses other regex /// matchers to provide a convenient high level regex API. Here are some /// examples of how other regex matchers are composed: /// /// * When calling [`Regex::captures`], instead of immediately /// running a slower but more capable regex engine like the /// [`PikeVM`](crate::nfa::thompson::pikevm::PikeVM), the meta regex engine /// will usually first look for the bounds of a match with a higher throughput /// regex engine like a [lazy DFA](crate::hybrid). Only when a match is found /// is a slower engine like `PikeVM` used to find the matching span for each /// capture group. /// * While higher throughout engines like the lazy DFA cannot handle /// Unicode word boundaries in general, they can still be used on pure ASCII /// haystacks by pretending that Unicode word boundaries are just plain ASCII /// word boundaries. However, if a haystack is not ASCII, the meta regex engine /// will automatically switch to a (possibly slower) regex engine that supports /// Unicode word boundaries in general. /// * In some cases where a regex pattern is just a simple literal or a small /// set of literals, an actual regex engine won't be used at all. Instead, /// substring or multi-substring search algorithms will be employed. /// /// There are many other forms of composition happening too, but the above /// should give a general idea. In particular, it may perhaps be surprising /// that *multiple* regex engines might get executed for a single search. That /// is, the decision of what regex engine to use is not _just_ based on the /// pattern, but also based on the dynamic execution of the search itself. /// /// The primary reason for this composition is performance. The fundamental /// tension is that the faster engines tend to be less capable, and the more /// capable engines tend to be slower. /// /// Note that the forms of composition that are allowed are determined by /// compile time crate features and configuration. For example, if the `hybrid` /// feature isn't enabled, or if [`Config::hybrid`] has been disabled, then the /// meta regex engine will never use a lazy DFA. /// /// # Synchronization and cloning /// /// Most of the regex engines in this crate require some kind of mutable /// "scratch" space to read and write from while performing a search. Since /// a meta regex composes these regex engines, a meta regex also requires /// mutable scratch space. This scratch space is called a [`Cache`]. /// /// Most regex engines _also_ usually have a read-only component, typically /// a [Thompson `NFA`](crate::nfa::thompson::NFA). /// /// In order to make the `Regex` API convenient, most of the routines hide /// the fact that a `Cache` is needed at all. To achieve this, a [memory /// pool](crate::util::pool::Pool) is used internally to retrieve `Cache` /// values in a thread safe way that also permits reuse. This in turn implies /// that every such search call requires some form of synchronization. Usually /// this synchronization is fast enough to not notice, but in some cases, it /// can be a bottleneck. 
This typically occurs when all of the following are /// true: /// /// * The same `Regex` is shared across multiple threads simultaneously, /// usually via a [`util::lazy::Lazy`](crate::util::lazy::Lazy) or something /// similar from the `once_cell` or `lazy_static` crates. /// * The primary unit of work in each thread is a regex search. /// * Searches are run on very short haystacks. /// /// This particular case can lead to high contention on the pool used by a /// `Regex` internally, which can in turn increase latency to a noticeable /// effect. This cost can be mitigated in one of the following ways: /// /// * Use a distinct copy of a `Regex` in each thread, usually by cloning it. /// Cloning a `Regex` _does not_ do a deep copy of its read-only component. /// But it does lead to each `Regex` having its own memory pool, which in /// turn eliminates the problem of contention. In general, this technique should /// not result in any additional memory usage when compared to sharing the same /// `Regex` across multiple threads simultaneously. /// * Use lower level APIs, like [`Regex::search_with`], which permit passing /// a `Cache` explicitly. In this case, it is up to you to determine how best /// to provide a `Cache`. For example, you might put a `Cache` in thread-local /// storage if your use case allows for it. /// /// Overall, this is an issue that happens rarely in practice, but it can /// happen. /// /// # Warning: spin-locks may be used in alloc-only mode /// /// When this crate is built without the `std` feature and the high level APIs /// on a `Regex` are used, then a spin-lock will be used to synchronize access /// to an internal pool of `Cache` values. This may be undesirable because /// a spin-lock is [effectively impossible to implement correctly in user /// space][spinlocks-are-bad]. That is, more concretely, the spin-lock could /// result in a deadlock. /// /// [spinlocks-are-bad]: https://matklad.github.io/2020/01/02/spinlocks-considered-harmful.html /// /// If one wants to avoid the use of spin-locks when the `std` feature is /// disabled, then you must use APIs that accept a `Cache` value explicitly. /// For example, [`Regex::search_with`]. /// /// # Example /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"^[0-9]{4}-[0-9]{2}-[0-9]{2}$")?; /// assert!(re.is_match("2010-03-14")); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: anchored search /// /// This example shows how to use [`Input::anchored`] to run an anchored /// search, even when the regex pattern itself isn't anchored. An anchored /// search guarantees that if a match is found, then the start offset of the /// match corresponds to the offset at which the search was started. /// /// ``` /// use regex_automata::{meta::Regex, Anchored, Input, Match}; /// /// let re = Regex::new(r"\bfoo\b")?; /// let input = Input::new("xx foo xx").range(3..).anchored(Anchored::Yes); /// // The offsets are in terms of the original haystack. /// assert_eq!(Some(Match::must(0, 3..6)), re.find(input)); /// /// // Notice that no match occurs here, because \b still takes the /// // surrounding context into account, even if it means looking back /// // before the start of your search. /// let hay = "xxfoo xx"; /// let input = Input::new(hay).range(2..).anchored(Anchored::Yes); /// assert_eq!(None, re.find(input)); /// // Indeed, you cannot achieve the above by simply slicing the /// // haystack itself, since the regex engine can't see the /// // surrounding context. 
This is why 'Input' permits setting /// // the bounds of a search! /// let input = Input::new(&hay[2..]).anchored(Anchored::Yes); /// // WRONG! /// assert_eq!(Some(Match::must(0, 0..3)), re.find(input)); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: earliest search /// /// This example shows how to use [`Input::earliest`] to run a search that /// might stop before finding the typical leftmost match. /// /// ``` /// use regex_automata::{meta::Regex, Anchored, Input, Match}; /// /// let re = Regex::new(r"[a-z]{3}|b")?; /// let input = Input::new("abc").earliest(true); /// assert_eq!(Some(Match::must(0, 1..2)), re.find(input)); /// /// // Note that "earliest" isn't really a match semantic unto itself. /// // Instead, it is merely an instruction to whatever regex engine /// // gets used internally to quit as soon as it can. For example, /// // this regex uses a different search technique, and winds up /// // producing a different (but valid) match! /// let re = Regex::new(r"abc|b")?; /// let input = Input::new("abc").earliest(true); /// assert_eq!(Some(Match::must(0, 0..3)), re.find(input)); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: change the line terminator /// /// This example shows how to enable multi-line mode by default and change /// the line terminator to the NUL byte: /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().multi_line(true)) /// .configure(Regex::config().line_terminator(b'\x00')) /// .build(r"^foo$")?; /// let hay = "\x00foo\x00"; /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` #[derive(Debug)] pub struct Regex { /// The actual regex implementation. imp: Arc, /// A thread safe pool of caches. /// /// For the higher level search APIs, a `Cache` is automatically plucked /// from this pool before running a search. The lower level `with` methods /// permit the caller to provide their own cache, thereby bypassing /// accesses to this pool. /// /// Note that we put this outside the `Arc` so that cloning a `Regex` /// results in creating a fresh `CachePool`. This in turn permits callers /// to clone regexes into separate threads where each such regex gets /// the pool's "thread owner" optimization. Otherwise, if one shares the /// `Regex` directly, then the pool will go through a slower mutex path for /// all threads except for the "owner." pool: CachePool, } /// The internal implementation of `Regex`, split out so that it can be wrapped /// in an `Arc`. #[derive(Debug)] struct RegexI { /// The core matching engine. /// /// Why is this reference counted when RegexI is already wrapped in an Arc? /// Well, we need to capture this in a closure to our `Pool` below in order /// to create new `Cache` values when needed. So since it needs to be in /// two places, we make it reference counted. /// /// We make `RegexI` itself reference counted too so that `Regex` itself /// stays extremely small and very cheap to clone. strat: Arc, /// Metadata about the regexes driving the strategy. The metadata is also /// usually stored inside the strategy too, but we put it here as well /// so that we can get quick access to it (without virtual calls) before /// executing the regex engine. For example, we use this metadata to /// detect a subset of cases where we know a match is impossible, and can /// thus avoid calling into the strategy at all. /// /// Since `RegexInfo` is stored in multiple places, it is also reference /// counted. 
info: RegexInfo, } /// Convenience constructors for a `Regex` using the default configuration. impl Regex { /// Builds a `Regex` from a single pattern string using the default /// configuration. /// /// If there was a problem parsing the pattern or a problem turning it into /// a regex matcher, then an error is returned. /// /// If you want to change the configuration of a `Regex`, use a [`Builder`] /// with a [`Config`]. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::new(r"(?Rm)^foo$")?; /// let hay = "\r\nfoo\r\n"; /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` pub fn new(pattern: &str) -> Result { Self::builder().build(pattern) } /// Builds a `Regex` from many pattern strings using the default /// configuration. /// /// If there was a problem parsing any of the patterns or a problem turning /// them into a regex matcher, then an error is returned. /// /// If you want to change the configuration of a `Regex`, use a [`Builder`] /// with a [`Config`]. /// /// # Example: simple lexer /// /// This simplistic example leverages the multi-pattern support to build a /// simple little lexer. The pattern ID in the match tells you which regex /// matched, which in turn might be used to map back to the "type" of the /// token returned by the lexer. /// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::new_many(&[ /// r"[[:space:]]", /// r"[A-Za-z0-9][A-Za-z0-9_]+", /// r"->", /// r".", /// ])?; /// let haystack = "fn is_boss(bruce: i32, springsteen: String) -> bool;"; /// let matches: Vec = re.find_iter(haystack).collect(); /// assert_eq!(matches, vec![ /// Match::must(1, 0..2), // 'fn' /// Match::must(0, 2..3), // ' ' /// Match::must(1, 3..10), // 'is_boss' /// Match::must(3, 10..11), // '(' /// Match::must(1, 11..16), // 'bruce' /// Match::must(3, 16..17), // ':' /// Match::must(0, 17..18), // ' ' /// Match::must(1, 18..21), // 'i32' /// Match::must(3, 21..22), // ',' /// Match::must(0, 22..23), // ' ' /// Match::must(1, 23..34), // 'springsteen' /// Match::must(3, 34..35), // ':' /// Match::must(0, 35..36), // ' ' /// Match::must(1, 36..42), // 'String' /// Match::must(3, 42..43), // ')' /// Match::must(0, 43..44), // ' ' /// Match::must(2, 44..46), // '->' /// Match::must(0, 46..47), // ' ' /// Match::must(1, 47..51), // 'bool' /// Match::must(3, 51..52), // ';' /// ]); /// /// # Ok::<(), Box>(()) /// ``` /// /// One can write a lexer like the above using a regex like /// `(?P[[:space:]])|(?P[A-Za-z0-9][A-Za-z0-9_]+)|...`, /// but then you need to ask whether capture group matched to determine /// which branch in the regex matched, and thus, which token the match /// corresponds to. In contrast, the above example includes the pattern ID /// in the match. There's no need to use capture groups at all. /// /// # Example: finding the pattern that caused an error /// /// When a syntax error occurs, it is possible to ask which pattern /// caused the syntax error. /// /// ``` /// use regex_automata::{meta::Regex, PatternID}; /// /// let err = Regex::new_many(&["a", "b", r"\p{Foo}", "c"]).unwrap_err(); /// assert_eq!(Some(PatternID::must(2)), err.pattern()); /// ``` /// /// # Example: zero patterns is valid /// /// Building a regex with zero patterns results in a regex that never /// matches anything. Because this routine is generic, passing an empty /// slice usually requires a turbo-fish (or something else to help type /// inference). 
/// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::new_many::<&str>(&[])?; /// assert_eq!(None, re.find("")); /// /// # Ok::<(), Box>(()) /// ``` pub fn new_many>(patterns: &[P]) -> Result { Self::builder().build_many(patterns) } /// Return a default configuration for a `Regex`. /// /// This is a convenience routine to avoid needing to import the [`Config`] /// type when customizing the construction of a `Regex`. /// /// # Example: lower the NFA size limit /// /// In some cases, the default size limit might be too big. The size limit /// can be lowered, which will prevent large regex patterns from compiling. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::meta::Regex; /// /// let result = Regex::builder() /// .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10)))) /// // Not even 20KB is enough to build a single large Unicode class! /// .build(r"\pL"); /// assert!(result.is_err()); /// /// # Ok::<(), Box>(()) /// ``` pub fn config() -> Config { Config::new() } /// Return a builder for configuring the construction of a `Regex`. /// /// This is a convenience routine to avoid needing to import the /// [`Builder`] type in common cases. /// /// # Example: change the line terminator /// /// This example shows how to enable multi-line mode by default and change /// the line terminator to the NUL byte: /// /// ``` /// use regex_automata::{meta::Regex, util::syntax, Match}; /// /// let re = Regex::builder() /// .syntax(syntax::Config::new().multi_line(true)) /// .configure(Regex::config().line_terminator(b'\x00')) /// .build(r"^foo$")?; /// let hay = "\x00foo\x00"; /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay)); /// /// # Ok::<(), Box>(()) /// ``` pub fn builder() -> Builder { Builder::new() } } /// High level convenience routines for using a regex to search a haystack. impl Regex { /// Returns true if and only if this regex matches the given haystack. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. (Consider how this might make /// a difference given the regex `a+` on the haystack `aaaaaaaaaaaaaaa`. /// This routine _may_ stop after it sees the first `a`, but routines like /// `find` need to continue searching because `+` is greedy by default.) /// /// # Example /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new("foo[0-9]+bar")?; /// /// assert!(re.is_match("foo12345bar")); /// assert!(!re.is_match("foobar")); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: consistency with search APIs /// /// `is_match` is guaranteed to return `true` whenever `find` returns a /// match. This includes searches that are executed entirely within a /// codepoint: /// /// ``` /// use regex_automata::{meta::Regex, Input}; /// /// let re = Regex::new("a*")?; /// /// // This doesn't match because the default configuration bans empty /// // matches from splitting a codepoint. 
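/// For instance, a sketch of that contrast (kept deliberately tiny):
///
/// ```
/// use regex_automata::meta::Regex;
///
/// let re = Regex::new("a+")?;
/// // `is_match` may bail after the first `a`; `find` must scan the whole
/// // run to report the greedy leftmost match.
/// assert!(re.is_match("aaaaaaaaaaaaaaa"));
/// assert_eq!(0..15, re.find("aaaaaaaaaaaaaaa").unwrap().range());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///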
/// assert!(!re.is_match(Input::new("☃").span(1..2))); /// assert_eq!(None, re.find(Input::new("☃").span(1..2))); /// /// # Ok::<(), Box>(()) /// ``` /// /// Notice that when UTF-8 mode is disabled, then the above reports a /// match because the restriction against zero-width matches that split a /// codepoint has been lifted: /// /// ``` /// use regex_automata::{meta::Regex, Input, Match}; /// /// let re = Regex::builder() /// .configure(Regex::config().utf8_empty(false)) /// .build("a*")?; /// /// assert!(re.is_match(Input::new("☃").span(1..2))); /// assert_eq!( /// Some(Match::must(0, 1..1)), /// re.find(Input::new("☃").span(1..2)), /// ); /// /// # Ok::<(), Box>(()) /// ``` /// /// A similar idea applies when using line anchors with CRLF mode enabled, /// which prevents them from matching between a `\r` and a `\n`. /// /// ``` /// use regex_automata::{meta::Regex, Input, Match}; /// /// let re = Regex::new(r"(?Rm:$)")?; /// assert!(!re.is_match(Input::new("\r\n").span(1..1))); /// // A regular line anchor, which only considers \n as a /// // line terminator, will match. /// let re = Regex::new(r"(?m:$)")?; /// assert!(re.is_match(Input::new("\r\n").span(1..1))); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn is_match(&self, mut input: Input) -> bool { input.earliest(true); if self.imp.info.is_impossible(&input) { return false; } let mut guard = self.pool.get(); let result = self.imp.strat.is_match(&mut guard, &mut input); // See 'Regex::search' for why we put the guard back explicitly. PoolGuard::put(guard); result } /// Executes a leftmost search and returns the first match that is found, /// if one exists. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+")?; /// assert_eq!(Some(Match::must(0, 0..8)), re.find("foo12345")); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find(&self, input: Input) -> Option where C: Cursor, { self.search(input) } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided [`Captures`] /// value. If no match was found, then [`Captures::is_match`] is guaranteed /// to return `false`. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Span}; /// /// let re = Regex::new(r"^([0-9]{4})-([0-9]{2})-([0-9]{2})$")?; /// let mut caps = re.create_captures(); /// /// re.captures("2010-03-14", &mut caps); /// assert!(caps.is_match()); /// assert_eq!(Some(Span::from(0..4)), caps.get_group(1)); /// assert_eq!(Some(Span::from(5..7)), caps.get_group(2)); /// assert_eq!(Some(Span::from(8..10)), caps.get_group(3)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn captures(&self, input: Input, caps: &mut Captures) { self.search_captures(input, caps) } /// Returns an iterator over all non-overlapping leftmost matches in /// the given haystack. If no match exists, then the iterator yields no /// elements. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Match}; /// /// let re = Regex::new("foo[0-9]+")?; /// let haystack = "foo1 foo12 foo123"; /// let matches: Vec = re.find_iter(haystack).collect(); /// assert_eq!(matches, vec![ /// Match::must(0, 0..4), /// Match::must(0, 5..10), /// Match::must(0, 11..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn find_iter(&self, input: Input) -> FindMatches<'_, C> { let cache = self.pool.get(); let it = iter::Searcher::new(input); FindMatches { re: self, cache, it } } /// Returns an iterator over all non-overlapping `Captures` values. 
If no /// match exists, then the iterator yields no elements. /// /// This yields the same matches as [`Regex::find_iter`], but it includes /// the spans of all capturing groups that participate in each match. /// /// **Tip:** See [`util::iter::Searcher`](crate::util::iter::Searcher) for /// how to correctly iterate over all matches in a haystack while avoiding /// the creation of a new `Captures` value for every match. (Which you are /// forced to do with an `Iterator`.) /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Span}; /// /// let re = Regex::new("foo(?P[0-9]+)")?; /// /// let haystack = "foo1 foo12 foo123"; /// let matches: Vec = re /// .captures_iter(haystack) /// // The unwrap is OK since 'numbers' matches if the pattern matches. /// .map(|caps| caps.get_group_by_name("numbers").unwrap()) /// .collect(); /// assert_eq!(matches, vec![ /// Span::from(3..4), /// Span::from(8..10), /// Span::from(14..17), /// ]); /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn captures_iter(&self, input: Input) -> CapturesMatches<'_, C> { let cache = self.pool.get(); let caps = self.create_captures(); let it = iter::Searcher::new(input); CapturesMatches { re: self, cache, caps, it } } /// Returns an iterator of spans of the haystack given, delimited by a /// match of the regex. Namely, each element of the iterator corresponds to /// a part of the haystack that *isn't* matched by the regular expression. /// /// # Example /// /// To split a string delimited by arbitrary amounts of spaces or tabs: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"[ \t]+")?; /// let hay = "a b \t c\td e"; /// let fields: Vec<&str> = re.split(hay).map(|span| &hay[span]).collect(); /// assert_eq!(fields, vec!["a", "b", "c", "d", "e"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: more cases /// /// Basic usage: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r" ")?; /// let hay = "Mary had a little lamb"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["Mary", "had", "a", "little", "lamb"]); /// /// let re = Regex::new(r"X")?; /// let hay = ""; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec![""]); /// /// let re = Regex::new(r"X")?; /// let hay = "lionXXtigerXleopard"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["lion", "", "tiger", "leopard"]); /// /// let re = Regex::new(r"::")?; /// let hay = "lion::tiger::leopard"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["lion", "tiger", "leopard"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// If a haystack contains multiple contiguous matches, you will end up /// with empty spans yielded by the iterator: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"X")?; /// let hay = "XXXXaXXbXc"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); /// /// let re = Regex::new(r"/")?; /// let hay = "(///)"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["(", "", "", ")"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// Separators at the start or end of a haystack are neighbored by empty /// spans. 
/// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"0")?; /// let hay = "010"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "1", ""]); /// /// # Ok::<(), Box>(()) /// ``` /// /// When the empty string is used as a regex, it splits at every valid /// UTF-8 boundary by default (which includes the beginning and end of the /// haystack): /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"")?; /// let hay = "rust"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "r", "u", "s", "t", ""]); /// /// // Splitting by an empty string is UTF-8 aware by default! /// let re = Regex::new(r"")?; /// let hay = "☃"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "☃", ""]); /// /// # Ok::<(), Box>(()) /// ``` /// /// But note that UTF-8 mode for empty strings can be disabled, which will /// then result in a match at every byte offset in the haystack, /// including between every UTF-8 code unit. /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::builder() /// .configure(Regex::config().utf8_empty(false)) /// .build(r"")?; /// let hay = "☃".as_bytes(); /// let got: Vec<&[u8]> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec![ /// // Writing byte string slices is just brutal. The problem is that /// // b"foo" has type &[u8; 3] instead of &[u8]. /// &[][..], &[b'\xE2'][..], &[b'\x98'][..], &[b'\x83'][..], &[][..], /// ]); /// /// # Ok::<(), Box>(()) /// ``` /// /// Contiguous separators (commonly shows up with whitespace), can lead to /// possibly surprising behavior. For example, this code is correct: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r" ")?; /// let hay = " a b c"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["", "", "", "", "a", "", "b", "c"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// It does *not* give you `["a", "b", "c"]`. For that behavior, you'd want /// to match contiguous space characters: /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r" +")?; /// let hay = " a b c"; /// let got: Vec<&str> = re.split(hay).map(|sp| &hay[sp]).collect(); /// // N.B. This does still include a leading empty span because ' +' /// // matches at the beginning of the haystack. /// assert_eq!(got, vec!["", "a", "b", "c"]); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn split(&self, input: Input) -> Split<'_, C> { Split { finder: self.find_iter(input), last: 0 } } /// Returns an iterator of at most `limit` spans of the haystack given, /// delimited by a match of the regex. (A `limit` of `0` will return no /// spans.) Namely, each element of the iterator corresponds to a part /// of the haystack that *isn't* matched by the regular expression. The /// remainder of the haystack that is not split will be the last element in /// the iterator. /// /// # Example /// /// Get the first two words in some haystack: /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"\W+").unwrap(); /// let hay = "Hey! 
How are you?"; /// let fields: Vec<&str> = /// re.splitn(hay, 3).map(|span| &hay[span]).collect(); /// assert_eq!(fields, vec!["Hey", "How", "are you?"]); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Examples: more cases /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r" ")?; /// let hay = "Mary had a little lamb"; /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["Mary", "had", "a little lamb"]); /// /// let re = Regex::new(r"X")?; /// let hay = ""; /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec![""]); /// /// let re = Regex::new(r"X")?; /// let hay = "lionXXtigerXleopard"; /// let got: Vec<&str> = re.splitn(hay, 3).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["lion", "", "tigerXleopard"]); /// /// let re = Regex::new(r"::")?; /// let hay = "lion::tiger::leopard"; /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["lion", "tiger::leopard"]); /// /// let re = Regex::new(r"X")?; /// let hay = "abcXdef"; /// let got: Vec<&str> = re.splitn(hay, 1).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["abcXdef"]); /// /// let re = Regex::new(r"X")?; /// let hay = "abcdef"; /// let got: Vec<&str> = re.splitn(hay, 2).map(|sp| &hay[sp]).collect(); /// assert_eq!(got, vec!["abcdef"]); /// /// let re = Regex::new(r"X")?; /// let hay = "abcXdef"; /// let got: Vec<&str> = re.splitn(hay, 0).map(|sp| &hay[sp]).collect(); /// assert!(got.is_empty()); /// /// # Ok::<(), Box>(()) /// ``` pub fn splitn(&self, input: Input, limit: usize) -> SplitN<'_, C> { SplitN { splits: self.split(input), limit } } } /// Lower level search routines that give more control. impl Regex { /// Returns the start and end offset of the leftmost match. If no match /// exists, then `None` is returned. /// /// This is like [`Regex::find`] but, but it accepts a concrete `&Input` /// instead of an `Into`. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, Match}; /// /// let re = Regex::new(r"Samwise|Sam")?; /// let input = Input::new( /// "one of the chief characters, Samwise the Brave", /// ); /// assert_eq!(Some(Match::must(0, 29..36)), re.search(&input)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search(&self, mut input: Input) -> Option { if self.imp.info.is_impossible(&input) { return None; } let mut guard = self.pool.get(); let result = self.imp.strat.search(&mut guard, &mut input); // We do this dance with the guard and explicitly put it back in the // pool because it seems to result in better codegen. If we let the // guard's Drop impl put it back in the pool, then functions like // ptr::drop_in_place get called and they *don't* get inlined. This // isn't usually a big deal, but in latency sensitive benchmarks the // extra function call can matter. // // I used `rebar measure -f '^grep/every-line$' -e meta` to measure // the effects here. // // Note that this doesn't eliminate the latency effects of using the // pool. There is still some (minor) cost for the "thread owner" of the // pool. (i.e., The thread that first calls a regex search routine.) // However, for other threads using the regex, the pool access can be // quite expensive as it goes through a mutex. Callers can avoid this // by either cloning the Regex (which creates a distinct copy of the // pool), or callers can use the lower level APIs that accept a 'Cache' // directly and do their own handling. 
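        // An illustrative sketch of that last option (caller-side code, names
        // hypothetical, not an API of this crate):
        //
        //     use std::cell::RefCell;
        //     thread_local! {
        //         static CACHE: RefCell<Option<Cache>> = RefCell::new(None);
        //     }
        //     // ...then, for each search:
        //     CACHE.with(|slot| {
        //         let mut slot = slot.borrow_mut();
        //         let cache = slot.get_or_insert_with(|| re.create_cache());
        //         re.search_with(cache, &mut input)
        //     });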
PoolGuard::put(guard); result } /// Returns the end offset of the leftmost match. If no match exists, then /// `None` is returned. /// /// This is distinct from [`Regex::search`] in that it only returns the end /// of a match and not the start of the match. Depending on a variety of /// implementation details, this _may_ permit the regex engine to do less /// overall work. For example, if a DFA is being used to execute a search, /// then the start of a match usually requires running a separate DFA in /// reverse to the find the start of a match. If one only needs the end of /// a match, then the separate reverse scan to find the start of a match /// can be skipped. (Note that the reverse scan is avoided even when using /// `Regex::search` when possible, for example, in the case of an anchored /// search.) /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, HalfMatch}; /// /// let re = Regex::new(r"Samwise|Sam")?; /// let input = Input::new( /// "one of the chief characters, Samwise the Brave", /// ); /// assert_eq!(Some(HalfMatch::must(0, 36)), re.search_half(&input)); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_half(&self, mut input: Input) -> Option { if self.imp.info.is_impossible(&input) { return None; } let mut guard = self.pool.get(); let result = self.imp.strat.search_half(&mut guard, &mut input); // See 'Regex::search' for why we put the guard back explicitly. PoolGuard::put(guard); result } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided [`Captures`] /// value. If no match was found, then [`Captures::is_match`] is guaranteed /// to return `false`. /// /// This is like [`Regex::captures`], but it accepts a concrete `&Input` /// instead of an `Into`. /// /// # Example: specific pattern search /// /// This example shows how to build a multi-pattern `Regex` that permits /// searching for specific patterns. /// /// ``` /// use regex_automata::{ /// meta::Regex, /// Anchored, Match, PatternID, Input, /// }; /// /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; /// let mut caps = re.create_captures(); /// let haystack = "foo123"; /// /// // Since we are using the default leftmost-first match and both /// // patterns match at the same starting position, only the first pattern /// // will be returned in this case when doing a search for any of the /// // patterns. /// let expected = Some(Match::must(0, 0..6)); /// re.search_captures(&Input::new(haystack), &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// // But if we want to check whether some other pattern matches, then we /// // can provide its pattern ID. /// let expected = Some(Match::must(1, 0..6)); /// let input = Input::new(haystack) /// .anchored(Anchored::Pattern(PatternID::must(1))); /// re.search_captures(&input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specifying the bounds of a search /// /// This example shows how providing the bounds of a search can produce /// different results than simply sub-slicing the haystack. 
/// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{meta::Regex, Match, Input}; /// /// let re = Regex::new(r"\b[0-9]{3}\b")?; /// let mut caps = re.create_captures(); /// let haystack = "foo123bar"; /// /// // Since we sub-slice the haystack, the search doesn't know about /// // the larger context and assumes that `123` is surrounded by word /// // boundaries. And of course, the match position is reported relative /// // to the sub-slice as well, which means we get `0..3` instead of /// // `3..6`. /// let expected = Some(Match::must(0, 0..3)); /// let input = Input::new(&haystack[3..6]); /// re.search_captures(&input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. (And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let expected = None; /// let input = Input::new(haystack).range(3..6); /// re.search_captures(&input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_captures(&self, input: Input, caps: &mut Captures) { caps.set_pattern(None); let pid = self.search_slots(input, caps.slots_mut()); caps.set_pattern(pid); } /// Executes a leftmost forward search and writes the spans of capturing /// groups that participated in a match into the provided `slots`, and /// returns the matching pattern ID. The contents of the slots for patterns /// other than the matching pattern are unspecified. If no match was found, /// then `None` is returned and the contents of `slots` is unspecified. /// /// This is like [`Regex::search`], but it accepts a raw slots slice /// instead of a `Captures` value. This is useful in contexts where you /// don't want or need to allocate a `Captures`. /// /// It is legal to pass _any_ number of slots to this routine. If the regex /// engine would otherwise write a slot offset that doesn't fit in the /// provided slice, then it is simply skipped. In general though, there are /// usually three slice lengths you might want to use: /// /// * An empty slice, if you only care about which pattern matched. /// * A slice with [`pattern_len() * 2`](Regex::pattern_len) slots, if you /// only care about the overall match spans for each matching pattern. /// * A slice with /// [`slot_len()`](crate::util::captures::GroupInfo::slot_len) slots, which /// permits recording match offsets for every capturing group in every /// pattern. /// /// # Example /// /// This example shows how to find the overall match offsets in a /// multi-pattern search without allocating a `Captures` value. Indeed, we /// can put our slots right on the stack. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{meta::Regex, PatternID, Input}; /// /// let re = Regex::new_many(&[ /// r"\pL+", /// r"\d+", /// ])?; /// let input = Input::new("!@#123"); /// /// // We only care about the overall match offsets here, so we just /// // allocate two slots for each pattern. Each slot records the start /// // and end of the match. /// let mut slots = [None; 4]; /// let pid = re.search_slots(&input, &mut slots); /// assert_eq!(Some(PatternID::must(1)), pid); /// /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. 
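/// // With `pid` equal to 1 here, that works out to slots 2 and 3.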
/// // See 'GroupInfo' for more details on the mapping between groups and /// // slot indices. /// let slot_start = pid.unwrap().as_usize() * 2; /// let slot_end = slot_start + 1; /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_slots( &self, mut input: Input, slots: &mut [Option], ) -> Option { if self.imp.info.is_impossible(&input) { return None; } let mut guard = self.pool.get(); let result = self.imp.strat.search_slots(&mut guard, &mut input, slots); // See 'Regex::search' for why we put the guard back explicitly. PoolGuard::put(guard); result } // /// Writes the set of patterns that match anywhere in the given search // /// configuration to `patset`. If multiple patterns match at the same // /// position and this `Regex` was configured with [`MatchKind::All`] // /// semantics, then all matching patterns are written to the given set. // /// // /// Unless all of the patterns in this `Regex` are anchored, then generally // /// speaking, this will scan the entire haystack. // /// // /// This search routine *does not* clear the pattern set. This gives some // /// flexibility to the caller (e.g., running multiple searches with the // /// same pattern set), but does make the API bug-prone if you're reusing // /// the same pattern set for multiple searches but intended them to be // /// independent. // /// // /// If a pattern ID matched but the given `PatternSet` does not have // /// sufficient capacity to store it, then it is not inserted and silently // /// dropped. // /// // /// # Example // /// // /// This example shows how to find all matching patterns in a haystack, // /// even when some patterns match at the same position as other patterns. // /// It is important that we configure the `Regex` with [`MatchKind::All`] // /// semantics here, or else overlapping matches will not be reported. // /// // /// ``` // /// # if cfg!(miri) { return Ok(()); } // miri takes too long // /// use regex_automata::{meta::Regex, Input, MatchKind, PatternSet}; // /// // /// let patterns = &[ // /// r"\w+", r"\d+", r"\pL+", r"foo", r"bar", r"barfoo", r"foobar", // /// ]; // /// let re = Regex::builder() // /// .configure(Regex::config().match_kind(MatchKind::All)) // /// .build_many(patterns)?; // /// // /// let input = Input::new("foobar"); // /// let mut patset = PatternSet::new(re.pattern_len()); // /// re.which_overlapping_matches(&input, &mut patset); // /// let expected = vec![0, 2, 3, 4, 6]; // /// let got: Vec = patset.iter().map(|p| p.as_usize()).collect(); // /// assert_eq!(expected, got); // /// // /// # Ok::<(), Box>(()) // /// ``` // #[inline] // pub fn which_overlapping_matches(&self, mut input: Input, patset: &mut PatternSet) { // if self.imp.info.is_impossible(input) { // return; // } // let mut guard = self.pool.get(); // let result = self.imp.strat.which_overlapping_matches(&mut guard, input, patset); // // See 'Regex::search' for why we put the guard back explicitly. // PoolGuard::put(guard); // result // } } /// Lower level search routines that give more control, and require the caller /// to provide an explicit [`Cache`] parameter. impl Regex { /// This is like [`Regex::search`], but requires the caller to /// explicitly pass a [`Cache`]. /// /// # Why pass a `Cache` explicitly? /// /// Passing a `Cache` explicitly will bypass the use of an internal memory /// pool used by `Regex` to get a `Cache` for a search. 
The use of this /// pool can be slower in some cases when a `Regex` is used from multiple /// threads simultaneously. Typically, performance only becomes an issue /// when there is heavy contention, which in turn usually only occurs /// when each thread's primary unit of work is a regex search on a small /// haystack. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, Match}; /// /// let re = Regex::new(r"Samwise|Sam")?; /// let mut cache = re.create_cache(); /// let input = Input::new( /// "one of the chief characters, Samwise the Brave", /// ); /// assert_eq!( /// Some(Match::must(0, 29..36)), /// re.search_with(&mut cache, &input), /// ); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_with(&self, cache: &mut Cache, input: &mut Input) -> Option { if self.imp.info.is_impossible(input) { return None; } self.imp.strat.search(cache, input) } /// This is like [`Regex::search_half`], but requires the caller to /// explicitly pass a [`Cache`]. /// /// # Why pass a `Cache` explicitly? /// /// Passing a `Cache` explicitly will bypass the use of an internal memory /// pool used by `Regex` to get a `Cache` for a search. The use of this /// pool can be slower in some cases when a `Regex` is used from multiple /// threads simultaneously. Typically, performance only becomes an issue /// when there is heavy contention, which in turn usually only occurs /// when each thread's primary unit of work is a regex search on a small /// haystack. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, HalfMatch}; /// /// let re = Regex::new(r"Samwise|Sam")?; /// let mut cache = re.create_cache(); /// let input = Input::new( /// "one of the chief characters, Samwise the Brave", /// ); /// assert_eq!( /// Some(HalfMatch::must(0, 36)), /// re.search_half_with(&mut cache, &input), /// ); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_half_with( &self, cache: &mut Cache, input: &mut Input, ) -> Option { if self.imp.info.is_impossible(input) { return None; } self.imp.strat.search_half(cache, input) } /// This is like [`Regex::search_captures`], but requires the caller to /// explicitly pass a [`Cache`]. /// /// # Why pass a `Cache` explicitly? /// /// Passing a `Cache` explicitly will bypass the use of an internal memory /// pool used by `Regex` to get a `Cache` for a search. The use of this /// pool can be slower in some cases when a `Regex` is used from multiple /// threads simultaneously. Typically, performance only becomes an issue /// when there is heavy contention, which in turn usually only occurs /// when each thread's primary unit of work is a regex search on a small /// haystack. /// /// # Example: specific pattern search /// /// This example shows how to build a multi-pattern `Regex` that permits /// searching for specific patterns. /// /// ``` /// use regex_automata::{ /// meta::Regex, /// Anchored, Match, PatternID, Input, /// }; /// /// let re = Regex::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "foo123"; /// /// // Since we are using the default leftmost-first match and both /// // patterns match at the same starting position, only the first pattern /// // will be returned in this case when doing a search for any of the /// // patterns. 
/// let expected = Some(Match::must(0, 0..6)); /// re.search_captures_with(&mut cache, &Input::new(haystack), &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// // But if we want to check whether some other pattern matches, then we /// // can provide its pattern ID. /// let expected = Some(Match::must(1, 0..6)); /// let input = Input::new(haystack) /// .anchored(Anchored::Pattern(PatternID::must(1))); /// re.search_captures_with(&mut cache, &input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: specifying the bounds of a search /// /// This example shows how providing the bounds of a search can produce /// different results than simply sub-slicing the haystack. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{meta::Regex, Match, Input}; /// /// let re = Regex::new(r"\b[0-9]{3}\b")?; /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures()); /// let haystack = "foo123bar"; /// /// // Since we sub-slice the haystack, the search doesn't know about /// // the larger context and assumes that `123` is surrounded by word /// // boundaries. And of course, the match position is reported relative /// // to the sub-slice as well, which means we get `0..3` instead of /// // `3..6`. /// let expected = Some(Match::must(0, 0..3)); /// let input = Input::new(&haystack[3..6]); /// re.search_captures_with(&mut cache, &input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// // But if we provide the bounds of the search within the context of the /// // entire haystack, then the search can take the surrounding context /// // into account. (And if we did find a match, it would be reported /// // as a valid offset into `haystack` instead of its sub-slice.) /// let expected = None; /// let input = Input::new(haystack).range(3..6); /// re.search_captures_with(&mut cache, &input, &mut caps); /// assert_eq!(expected, caps.get_match()); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_captures_with( &self, cache: &mut Cache, input: &mut Input, caps: &mut Captures, ) { caps.set_pattern(None); let pid = self.search_slots_with(cache, input, caps.slots_mut()); caps.set_pattern(pid); } /// This is like [`Regex::search_slots`], but requires the caller to /// explicitly pass a [`Cache`]. /// /// # Why pass a `Cache` explicitly? /// /// Passing a `Cache` explicitly will bypass the use of an internal memory /// pool used by `Regex` to get a `Cache` for a search. The use of this /// pool can be slower in some cases when a `Regex` is used from multiple /// threads simultaneously. Typically, performance only becomes an issue /// when there is heavy contention, which in turn usually only occurs /// when each thread's primary unit of work is a regex search on a small /// haystack. /// /// # Example /// /// This example shows how to find the overall match offsets in a /// multi-pattern search without allocating a `Captures` value. Indeed, we /// can put our slots right on the stack. /// /// ``` /// # if cfg!(miri) { return Ok(()); } // miri takes too long /// use regex_automata::{meta::Regex, PatternID, Input}; /// /// let re = Regex::new_many(&[ /// r"\pL+", /// r"\d+", /// ])?; /// let mut cache = re.create_cache(); /// let input = Input::new("!@#123"); /// /// // We only care about the overall match offsets here, so we just /// // allocate two slots for each pattern. Each slot records the start /// // and end of the match. 
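/// // Two patterns times two slots each gives four slots in total.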
/// let mut slots = [None; 4]; /// let pid = re.search_slots_with(&mut cache, &input, &mut slots); /// assert_eq!(Some(PatternID::must(1)), pid); /// /// // The overall match offsets are always at 'pid * 2' and 'pid * 2 + 1'. /// // See 'GroupInfo' for more details on the mapping between groups and /// // slot indices. /// let slot_start = pid.unwrap().as_usize() * 2; /// let slot_end = slot_start + 1; /// assert_eq!(Some(3), slots[slot_start].map(|s| s.get())); /// assert_eq!(Some(6), slots[slot_end].map(|s| s.get())); /// /// # Ok::<(), Box>(()) /// ``` #[inline] pub fn search_slots_with( &self, cache: &mut Cache, input: &mut Input, slots: &mut [Option], ) -> Option { if self.imp.info.is_impossible(input) { return None; } self.imp.strat.search_slots(cache, input, slots) } } /// Various non-search routines for querying properties of a `Regex` and /// convenience routines for creating [`Captures`] and [`Cache`] values. impl Regex { /// Creates a new object for recording capture group offsets. This is used /// in search APIs like [`Regex::captures`] and [`Regex::search_captures`]. /// /// This is a convenience routine for /// `Captures::all(re.group_info().clone())`. Callers may build other types /// of `Captures` values that record less information (and thus require /// less work from the regex engine) using [`Captures::matches`] and /// [`Captures::empty`]. /// /// # Example /// /// This shows some alternatives to [`Regex::create_captures`]: /// /// ``` /// use regex_automata::{ /// meta::Regex, /// util::captures::Captures, /// Match, PatternID, Span, /// }; /// /// let re = Regex::new(r"(?[A-Z][a-z]+) (?[A-Z][a-z]+)")?; /// /// // This is equivalent to Regex::create_captures. It stores matching /// // offsets for all groups in the regex. /// let mut all = Captures::all(re.group_info().clone()); /// re.captures("Bruce Springsteen", &mut all); /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match()); /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first")); /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last")); /// /// // In this version, we only care about the implicit groups, which /// // means offsets for the explicit groups will be unavailable. It can /// // sometimes be faster to ask for fewer groups, since the underlying /// // regex engine needs to do less work to keep track of them. /// let mut matches = Captures::matches(re.group_info().clone()); /// re.captures("Bruce Springsteen", &mut matches); /// // We still get the overall match info. /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match()); /// // But now the explicit groups are unavailable. /// assert_eq!(None, matches.get_group_by_name("first")); /// assert_eq!(None, matches.get_group_by_name("last")); /// /// // Finally, in this version, we don't ask to keep track of offsets for /// // *any* groups. All we get back is whether a match occurred, and if /// // so, the ID of the pattern that matched. /// let mut empty = Captures::empty(re.group_info().clone()); /// re.captures("Bruce Springsteen", &mut empty); /// // it's a match! /// assert!(empty.is_match()); /// // for pattern ID 0 /// assert_eq!(Some(PatternID::ZERO), empty.pattern()); /// // Match offsets are unavailable. /// assert_eq!(None, empty.get_match()); /// // And of course, explicit groups are unavailable too. 
/// assert_eq!(None, empty.get_group_by_name("first")); /// assert_eq!(None, empty.get_group_by_name("last")); /// /// # Ok::<(), Box>(()) /// ``` pub fn create_captures(&self) -> Captures { Captures::all(self.group_info().clone()) } /// Creates a new cache for use with lower level search APIs like /// [`Regex::search_with`]. /// /// The cache returned should only be used for searches for this `Regex`. /// If you want to reuse the cache for another `Regex`, then you must call /// [`Cache::reset`] with that `Regex`. /// /// This is a convenience routine for [`Cache::new`]. /// /// # Example /// /// ``` /// use regex_automata::{meta::Regex, Input, Match}; /// /// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?; /// let mut cache = re.create_cache(); /// let input = Input::new("crazy janey and her mission man"); /// assert_eq!( /// Some(Match::must(0, 20..31)), /// re.search_with(&mut cache, &input), /// ); /// /// # Ok::<(), Box>(()) /// ``` pub fn create_cache(&self) -> Cache { self.imp.strat.create_cache() } /// Returns the total number of patterns in this regex. /// /// The standard [`Regex::new`] constructor always results in a `Regex` /// with a single pattern, but [`Regex::new_many`] permits building a /// multi-pattern regex. /// /// A `Regex` guarantees that the maximum possible `PatternID` returned in /// any match is `Regex::pattern_len() - 1`. In the case where the number /// of patterns is `0`, a match is impossible. /// /// # Example /// /// ``` /// use regex_automata::meta::Regex; /// /// let re = Regex::new(r"(?m)^[a-z]$")?; /// assert_eq!(1, re.pattern_len()); /// /// let re = Regex::new_many::<&str>(&[])?; /// assert_eq!(0, re.pattern_len()); /// /// let re = Regex::new_many(&["a", "b", "c"])?; /// assert_eq!(3, re.pattern_len()); /// /// # Ok::<(), Box>(()) /// ``` pub fn pattern_len(&self) -> usize { self.imp.info.pattern_len() } /// Returns the total number of capturing groups. /// /// This includes the implicit capturing group corresponding to the /// entire match. Therefore, the minimum value returned is `1`. /// /// # Example /// /// This shows a few patterns and how many capture groups they have. /// /// ``` /// use regex_automata::meta::Regex; /// /// let len = |pattern| { /// Regex::new(pattern).map(|re| re.captures_len()) /// }; /// /// assert_eq!(1, len("a")?); /// assert_eq!(2, len("(a)")?); /// assert_eq!(3, len("(a)|(b)")?); /// assert_eq!(5, len("(a)(b)|(c)(d)")?); /// assert_eq!(2, len("(a)|b")?); /// assert_eq!(2, len("a|(b)")?); /// assert_eq!(2, len("(b)*")?); /// assert_eq!(2, len("(b)+")?); /// /// # Ok::<(), Box>(()) /// ``` /// /// # Example: multiple patterns /// /// This routine also works for multiple patterns. The total number is /// the sum of the capture groups of each pattern. /// /// ``` /// use regex_automata::meta::Regex; /// /// let len = |patterns| { /// Regex::new_many(patterns).map(|re| re.captures_len()) /// }; /// /// assert_eq!(2, len(&["a", "b"])?); /// assert_eq!(4, len(&["(a)", "(b)"])?); /// assert_eq!(6, len(&["(a)|(b)", "(c)|(d)"])?); /// assert_eq!(8, len(&["(a)(b)|(c)(d)", "(x)(y)"])?); /// assert_eq!(3, len(&["(a)", "b"])?); /// assert_eq!(3, len(&["a", "(b)"])?); /// assert_eq!(4, len(&["(a)", "(b)*"])?); /// assert_eq!(4, len(&["(a)+", "(b)+"])?); /// /// # Ok::<(), Box>(()) /// ``` pub fn captures_len(&self) -> usize { self.imp.info.props_union().explicit_captures_len().saturating_add(self.pattern_len()) } /// Returns the total number of capturing groups that appear in every /// possible match. 
    ///
    /// If the number of capture groups can vary depending on the match, then
    /// this returns `None`. That is, a value is only returned when the number
    /// of matching groups is invariant or "static."
    ///
    /// Note that like [`Regex::captures_len`], this **does** include the
    /// implicit capturing group corresponding to the entire match. Therefore,
    /// when a non-None value is returned, it is guaranteed to be at least `1`.
    /// Stated differently, a return value of `Some(0)` is impossible.
    ///
    /// # Example
    ///
    /// This shows a few cases where a static number of capture groups is
    /// available and a few cases where it is not.
    ///
    /// ```
    /// use regex_automata::meta::Regex;
    ///
    /// let len = |pattern| {
    ///     Regex::new(pattern).map(|re| re.static_captures_len())
    /// };
    ///
    /// assert_eq!(Some(1), len("a")?);
    /// assert_eq!(Some(2), len("(a)")?);
    /// assert_eq!(Some(2), len("(a)|(b)")?);
    /// assert_eq!(Some(3), len("(a)(b)|(c)(d)")?);
    /// assert_eq!(None, len("(a)|b")?);
    /// assert_eq!(None, len("a|(b)")?);
    /// assert_eq!(None, len("(b)*")?);
    /// assert_eq!(Some(2), len("(b)+")?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: multiple patterns
    ///
    /// This property extends to regexes with multiple patterns as well. In
    /// order for there to be a static number of capture groups in this case,
    /// every pattern must have the same static number.
    ///
    /// ```
    /// use regex_automata::meta::Regex;
    ///
    /// let len = |patterns| {
    ///     Regex::new_many(patterns).map(|re| re.static_captures_len())
    /// };
    ///
    /// assert_eq!(Some(1), len(&["a", "b"])?);
    /// assert_eq!(Some(2), len(&["(a)", "(b)"])?);
    /// assert_eq!(Some(2), len(&["(a)|(b)", "(c)|(d)"])?);
    /// assert_eq!(Some(3), len(&["(a)(b)|(c)(d)", "(x)(y)"])?);
    /// assert_eq!(None, len(&["(a)", "b"])?);
    /// assert_eq!(None, len(&["a", "(b)"])?);
    /// assert_eq!(None, len(&["(a)", "(b)*"])?);
    /// assert_eq!(Some(2), len(&["(a)+", "(b)+"])?);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn static_captures_len(&self) -> Option<usize> {
        self.imp.info.props_union().static_explicit_captures_len().map(|len| len.saturating_add(1))
    }

    /// Return information about the capture groups in this `Regex`.
    ///
    /// A `GroupInfo` is an immutable object that can be cheaply cloned. It
    /// is responsible for maintaining a mapping between the capture groups
    /// in the concrete syntax of zero or more regex patterns and their
    /// internal representation used by some of the regex matchers. It is also
    /// responsible for maintaining a mapping between the name of each group
    /// (if one exists) and its corresponding group index.
    ///
    /// A `GroupInfo` is ultimately what is used to build a [`Captures`] value,
    /// which is some mutable space where group offsets are stored as a result
    /// of a search.
    ///
    /// # Example
    ///
    /// This shows some alternatives to [`Regex::create_captures`]:
    ///
    /// ```
    /// use regex_automata::{
    ///     meta::Regex,
    ///     util::captures::Captures,
    ///     Match, PatternID, Span,
    /// };
    ///
    /// let re = Regex::new(r"(?<first>[A-Z][a-z]+) (?<last>[A-Z][a-z]+)")?;
    ///
    /// // This is equivalent to Regex::create_captures. It stores matching
    /// // offsets for all groups in the regex.
    /// let mut all = Captures::all(re.group_info().clone());
    /// re.captures("Bruce Springsteen", &mut all);
    /// assert_eq!(Some(Match::must(0, 0..17)), all.get_match());
    /// assert_eq!(Some(Span::from(0..5)), all.get_group_by_name("first"));
    /// assert_eq!(Some(Span::from(6..17)), all.get_group_by_name("last"));
    ///
    /// // In this version, we only care about the implicit groups, which
    /// // means offsets for the explicit groups will be unavailable. It can
    /// // sometimes be faster to ask for fewer groups, since the underlying
    /// // regex engine needs to do less work to keep track of them.
    /// let mut matches = Captures::matches(re.group_info().clone());
    /// re.captures("Bruce Springsteen", &mut matches);
    /// // We still get the overall match info.
    /// assert_eq!(Some(Match::must(0, 0..17)), matches.get_match());
    /// // But now the explicit groups are unavailable.
    /// assert_eq!(None, matches.get_group_by_name("first"));
    /// assert_eq!(None, matches.get_group_by_name("last"));
    ///
    /// // Finally, in this version, we don't ask to keep track of offsets for
    /// // *any* groups. All we get back is whether a match occurred, and if
    /// // so, the ID of the pattern that matched.
    /// let mut empty = Captures::empty(re.group_info().clone());
    /// re.captures("Bruce Springsteen", &mut empty);
    /// // it's a match!
    /// assert!(empty.is_match());
    /// // for pattern ID 0
    /// assert_eq!(Some(PatternID::ZERO), empty.pattern());
    /// // Match offsets are unavailable.
    /// assert_eq!(None, empty.get_match());
    /// // And of course, explicit groups are unavailable too.
    /// assert_eq!(None, empty.get_group_by_name("first"));
    /// assert_eq!(None, empty.get_group_by_name("last"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn group_info(&self) -> &GroupInfo {
        self.imp.strat.group_info()
    }

    /// Returns the configuration object used to build this `Regex`.
    ///
    /// If no configuration object was explicitly passed, then the
    /// configuration returned represents the default.
    #[inline]
    pub fn get_config(&self) -> &Config {
        self.imp.info.config()
    }

    /// Returns true if this regex has a high chance of being "accelerated."
    ///
    /// The precise meaning of "accelerated" is specifically left unspecified,
    /// but the general meaning is that the search has a high likelihood of
    /// running faster than a character-at-a-time loop inside a standard
    /// regex engine.
    ///
    /// When a regex is accelerated, it is only a *probabilistic* claim. That
    /// is, just because the regex is believed to be accelerated, that doesn't
    /// mean it will definitely execute searches very fast. Similarly, if a
    /// regex is *not* accelerated, that is also a probabilistic claim. That
    /// is, a regex for which `is_accelerated` returns `false` could still run
    /// searches more quickly than a regex for which `is_accelerated` returns
    /// `true`.
    ///
    /// Whether a regex is marked as accelerated or not is dependent on
    /// implementation details that may change in a semver compatible release.
    /// That is, a regex that is accelerated in a `x.y.1` release might not be
    /// accelerated in a `x.y.2` release.
    ///
    /// Basically, the value of acceleration boils down to a hedge: a hodge
    /// podge of internal heuristics combine to make a probabilistic guess
    /// that this regex search may run "fast." The value in knowing this from
    /// a caller's perspective is that it may act as a signal that no further
    /// work should be done to accelerate a search.
    /// For example, a grep-like tool might try to do some extra work
    /// extracting literals from a regex to create its own heuristic
    /// acceleration strategies. But it might choose to defer to this crate's
    /// acceleration strategy if one exists. This routine permits querying
    /// whether such a strategy is active for a particular regex.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::meta::Regex;
    ///
    /// // A simple literal is very likely to be accelerated.
    /// let re = Regex::new(r"foo")?;
    /// assert!(re.is_accelerated());
    ///
    /// // A regex with no literals is likely to not be accelerated.
    /// let re = Regex::new(r"\w")?;
    /// assert!(!re.is_accelerated());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn is_accelerated(&self) -> bool {
        self.imp.strat.is_accelerated()
    }

    /// Return the total approximate heap memory, in bytes, used by this `Regex`.
    ///
    /// Note that currently, there is no high level configuration for setting
    /// a limit on the specific value returned by this routine. Instead, the
    /// following routines can be used to control heap memory at a bit of a
    /// lower level:
    ///
    /// * [`Config::nfa_size_limit`] controls how big _any_ of the NFAs are
    /// allowed to be.
    /// * [`Config::onepass_size_limit`] controls how big the one-pass DFA is
    /// allowed to be.
    /// * [`Config::hybrid_cache_capacity`] controls how much memory the lazy
    /// DFA is permitted to allocate to store its transition table.
    /// * [`Config::dfa_size_limit`] controls how big a fully compiled DFA is
    /// allowed to be.
    /// * [`Config::dfa_state_limit`] controls the conditions under which the
    /// meta regex engine will even attempt to build a fully compiled DFA.
    #[inline]
    pub fn memory_usage(&self) -> usize {
        self.imp.strat.memory_usage()
    }
}

impl Clone for Regex {
    fn clone(&self) -> Self {
        let imp = Arc::clone(&self.imp);
        let pool = {
            let strat = Arc::clone(&imp.strat);
            let create: CachePoolFn = Box::new(move || strat.create_cache());
            Pool::new(create)
        };
        Regex { imp, pool }
    }
}

#[derive(Clone, Debug)]
pub(crate) struct RegexInfo(Arc<RegexInfoI>);

#[derive(Clone, Debug)]
struct RegexInfoI {
    config: Config,
    props: Vec<hir::Properties>,
    props_union: hir::Properties,
}

impl RegexInfo {
    fn new(config: Config, hirs: &[&Hir]) -> RegexInfo {
        // Collect all of the properties from each of the HIRs, and also
        // union them into one big set of properties representing all HIRs
        // as if they were in one big alternation.
        let mut props = vec![];
        for hir in hirs.iter() {
            props.push(hir.properties().clone());
        }
        let props_union = hir::Properties::union(&props);
        RegexInfo(Arc::new(RegexInfoI { config, props, props_union }))
    }

    pub(crate) fn config(&self) -> &Config {
        &self.0.config
    }

    pub(crate) fn props(&self) -> &[hir::Properties] {
        &self.0.props
    }

    pub(crate) fn props_union(&self) -> &hir::Properties {
        &self.0.props_union
    }

    pub(crate) fn pattern_len(&self) -> usize {
        self.props().len()
    }

    pub(crate) fn memory_usage(&self) -> usize {
        self.props().iter().map(|p| p.memory_usage()).sum::<usize>()
            + self.props_union().memory_usage()
    }

    /// Returns true when the search is guaranteed to be anchored. That is,
    /// when a match is reported, its offset is guaranteed to correspond to
    /// the start of the search.
    ///
    /// This includes returning true when `input` _isn't_ anchored but the
    /// underlying regex is.
#[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_anchored_start(&self, input: &Input) -> bool { input.get_anchored().is_anchored() || self.is_always_anchored_start() } /// Returns true when this regex is always anchored to the start of a /// search. And in particular, that regardless of an `Input` configuration, /// if any match is reported it must start at `0`. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_always_anchored_start(&self) -> bool { use regex_syntax::hir::Look; self.props_union().look_set_prefix().contains(Look::Start) } /// Returns true when this regex is always anchored to the end of a /// search. And in particular, that regardless of an `Input` configuration, /// if any match is reported it must end at the end of the haystack. #[cfg_attr(feature = "perf-inline", inline(always))] pub(crate) fn is_always_anchored_end(&self) -> bool { use regex_syntax::hir::Look; self.props_union().look_set_suffix().contains(Look::End) } /// Returns true if and only if it is known that a match is impossible /// for the given input. This is useful for short-circuiting and avoiding /// running the regex engine if it's known no match can be reported. /// /// Note that this doesn't necessarily detect every possible case. For /// example, when `pattern_len() == 0`, a match is impossible, but that /// case is so rare that it's fine to be handled by the regex engine /// itself. That is, it's not worth the cost of adding it here in order to /// make it a little faster. The reason is that this is called for every /// search. so there is some cost to adding checks here. Arguably, some of /// the checks that are here already probably shouldn't be here... #[cfg_attr(feature = "perf-inline", inline(always))] fn is_impossible(&self, input: &Input) -> bool { // The underlying regex is anchored, so if we don't start the search // at position 0, a match is impossible, because the anchor can only // match at position 0. if input.start() > 0 && self.is_always_anchored_start() { return true; } // // Same idea, but for the end anchor. // if input.end() < input.haystack().len() && self.is_always_anchored_end() { // return true; // } // If the haystack is smaller than the minimum length required, then // we know there can be no match. let minlen = match self.props_union().minimum_len() { None => return false, Some(minlen) => minlen, }; if input.get_span().len() < minlen { return true; } // Same idea as minimum, but for maximum. This is trickier. We can // only apply the maximum when we know the entire span that we're // searching *has* to match according to the regex (and possibly the // input configuration). If we know there is too much for the regex // to match, we can bail early. // // I don't think we can apply the maximum otherwise unfortunately. if self.is_anchored_start(input) && self.is_always_anchored_end() { let maxlen = match self.props_union().maximum_len() { None => return false, Some(maxlen) => maxlen, }; if input.get_span().len() > maxlen { return true; } } false } } /// An iterator over all non-overlapping matches for an infallible search. /// /// The iterator yields a [`Match`] value until no more matches could be found. /// If the underlying regex engine returns an error, then a panic occurs. /// /// This iterator can be created with the [`Regex::find_iter`] method. 
#[derive(Debug)]
pub struct FindMatches<'r, C: Cursor> {
    re: &'r Regex,
    cache: CachePoolGuard<'r>,
    it: iter::Searcher<C>,
}

impl<'r, C: Cursor> FindMatches<'r, C> {
    /// Returns the `Regex` value that created this iterator.
    #[inline]
    pub fn regex(&self) -> &'r Regex {
        self.re
    }

    /// Returns the current `Input` associated with this iterator.
    ///
    /// The `start` position on the given `Input` may change during iteration,
    /// but all other values are guaranteed to remain invariant.
    #[inline]
    pub fn input(&mut self) -> &mut Input<C> {
        self.it.input()
    }
}

impl<'r, C: Cursor> Iterator for FindMatches<'r, C> {
    type Item = Match;

    #[inline]
    fn next(&mut self) -> Option<Match> {
        let FindMatches { re, ref mut cache, ref mut it } = *self;
        it.advance(|input| Ok(re.search_with(cache, input)))
    }

    #[inline]
    fn count(self) -> usize {
        // If all we care about is a count of matches, then we only need to
        // find the end position of each match. This can give us a 2x perf
        // boost in some cases, because it avoids needing to do a reverse scan
        // to find the start of a match.
        let FindMatches { re, mut cache, it } = self;
        // This does the deref for PoolGuard once instead of every iter.
        let cache = &mut *cache;
        it.into_half_matches_iter(|input| Ok(re.search_half_with(cache, input))).count()
    }
}

impl<'r, C: Cursor> core::iter::FusedIterator for FindMatches<'r, C> {}

/// An iterator over all non-overlapping leftmost matches with their capturing
/// groups.
///
/// The iterator yields a [`Captures`] value until no more matches could be
/// found.
///
/// The type parameters are as follows:
///
/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
/// * `C` represents the cursor type over the haystack being searched.
///
/// This iterator can be created with the [`Regex::captures_iter`] method.
#[derive(Debug)]
pub struct CapturesMatches<'r, C: Cursor> {
    re: &'r Regex,
    cache: CachePoolGuard<'r>,
    caps: Captures,
    it: iter::Searcher<C>,
}

impl<'r, C: Cursor> CapturesMatches<'r, C> {
    /// Returns the `Regex` value that created this iterator.
    #[inline]
    pub fn regex(&self) -> &'r Regex {
        self.re
    }

    /// Returns the current `Input` associated with this iterator.
    ///
    /// The `start` position on the given `Input` may change during iteration,
    /// but all other values are guaranteed to remain invariant.
    #[inline]
    pub fn input(&mut self) -> &mut Input<C> {
        self.it.input()
    }
}

impl<'r, C: Cursor> Iterator for CapturesMatches<'r, C> {
    type Item = Captures;

    #[inline]
    fn next(&mut self) -> Option<Captures> {
        // Splitting 'self' apart seems necessary to appease borrowck.
        let CapturesMatches { re, ref mut cache, ref mut caps, ref mut it } = *self;
        let _ = it.advance(|input| {
            re.search_captures_with(cache, input, caps);
            Ok(caps.get_match())
        });
        if caps.is_match() {
            Some(caps.clone())
        } else {
            None
        }
    }

    #[inline]
    fn count(self) -> usize {
        let CapturesMatches { re, mut cache, it, .. } = self;
        // This does the deref for PoolGuard once instead of every iter.
        let cache = &mut *cache;
        it.into_half_matches_iter(|input| Ok(re.search_half_with(cache, input))).count()
    }
}

impl<'r, C: Cursor> core::iter::FusedIterator for CapturesMatches<'r, C> {}

/// Yields all substrings delimited by a regular expression match.
///
/// The spans correspond to the offsets between matches.
///
/// The type parameters are as follows:
///
/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
/// * `C` represents the cursor type over the haystack being searched.
///
/// This iterator can be created with the [`Regex::split`] method.
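///
/// # Example
///
/// A minimal sketch of how the yielded spans relate to the haystack
/// (assuming, as in the other doc examples here, that a `&str` haystack can
/// be passed directly to [`Regex::split`]):
///
/// ```
/// use regex_automata::{meta::Regex, Span};
///
/// let re = Regex::new(r",")?;
/// // The spans cover "a", "b" and "c", i.e., everything between matches.
/// let spans: Vec<Span> = re.split("a,b,c").collect();
/// assert_eq!(spans, vec![
///     Span::from(0..1),
///     Span::from(2..3),
///     Span::from(4..5),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```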
#[derive(Debug)]
pub struct Split<'r, C: Cursor> {
    finder: FindMatches<'r, C>,
    last: usize,
}

impl<'r, C: Cursor> Split<'r, C> {
    /// Returns the current `Input` associated with this iterator.
    ///
    /// The `start` position on the given `Input` may change during iteration,
    /// but all other values are guaranteed to remain invariant.
    #[inline]
    pub fn input(&mut self) -> &mut Input<C> {
        self.finder.input()
    }
}

impl<'r, C: Cursor> Iterator for Split<'r, C> {
    type Item = Span;

    fn next(&mut self) -> Option<Span> {
        match self.finder.next() {
            None => {
                let len = self.finder.it.input().end();
                if self.last > len {
                    None
                } else {
                    let span = Span::from(self.last..len);
                    self.last = len + 1; // Next call will return None
                    Some(span)
                }
            }
            Some(m) => {
                let span = Span::from(self.last..m.start());
                self.last = m.end();
                Some(span)
            }
        }
    }
}

impl<'r, C: Cursor> core::iter::FusedIterator for Split<'r, C> {}

/// Yields at most `N` spans delimited by a regular expression match.
///
/// The spans correspond to the offsets between matches. The last span will be
/// whatever remains after splitting.
///
/// The type parameters are as follows:
///
/// * `'r` represents the lifetime of the `Regex` that produced this iterator.
/// * `C` represents the cursor type over the haystack being searched.
///
/// This iterator can be created with the [`Regex::splitn`] method.
#[derive(Debug)]
pub struct SplitN<'r, C: Cursor> {
    splits: Split<'r, C>,
    limit: usize,
}

impl<'r, C: Cursor> SplitN<'r, C> {
    /// Returns the current `Input` associated with this iterator.
    ///
    /// The `start` position on the given `Input` may change during iteration,
    /// but all other values are guaranteed to remain invariant.
    #[inline]
    pub fn input(&mut self) -> &mut Input<C> {
        self.splits.input()
    }
}

impl<'r, C: Cursor> Iterator for SplitN<'r, C> {
    type Item = Span;

    fn next(&mut self) -> Option<Span> {
        if self.limit == 0 {
            return None;
        }

        self.limit -= 1;
        if self.limit > 0 {
            return self.splits.next();
        }

        let len = self.splits.finder.it.input().end();
        if self.splits.last > len {
            // We've already returned all substrings.
            None
        } else {
            // self.limit == 0 here, so future calls will return None
            // immediately.
            Some(Span::from(self.splits.last..len))
        }
    }

    fn size_hint(&self) -> (usize, Option<usize>) {
        (0, Some(self.limit))
    }
}

impl<'r, C: Cursor> core::iter::FusedIterator for SplitN<'r, C> {}

/// Represents mutable scratch space used by regex engines during a search.
///
/// Most of the regex engines in this crate require some kind of
/// mutable state in order to execute a search. This mutable state is
/// explicitly separated from the core regex object (such as a
/// [`thompson::NFA`](crate::nfa::thompson::NFA)) so that the read-only regex
/// object can be shared across multiple threads simultaneously without any
/// synchronization. Conversely, a `Cache` must either be duplicated if using
/// the same `Regex` from multiple threads, or else there must be some kind of
/// synchronization that guarantees exclusive access while it's in use by one
/// thread.
///
/// A `Regex` attempts to do this synchronization for you by using a thread
/// pool internally. Its size scales roughly with the number of simultaneous
/// regex searches.
///
/// For cases where one does not want to rely on a `Regex`'s internal thread
/// pool, lower level routines such as [`Regex::search_with`] are provided
/// that permit callers to pass a `Cache` into the search routine explicitly.
///
/// General advice is that the thread pool is often more than good enough.
/// However, it may be possible to observe the effects of its latency,
/// especially when searching many small haystacks from many threads
/// simultaneously.
///
/// Caches can be created from their corresponding `Regex` via
/// [`Regex::create_cache`]. A cache can only be used with either the `Regex`
/// that created it, or the `Regex` that was most recently used to reset it
/// with [`Cache::reset`]. Using a cache with any other `Regex` may result in
/// panics or incorrect results.
///
/// # Example
///
/// ```
/// use regex_automata::{meta::Regex, Input, Match};
///
/// let re = Regex::new(r"(?-u)m\w+\s+m\w+")?;
/// let mut cache = re.create_cache();
/// let mut input = Input::new("crazy janey and her mission man");
/// assert_eq!(
///     Some(Match::must(0, 20..31)),
///     re.search_with(&mut cache, &mut input),
/// );
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Debug, Clone)]
pub struct Cache {
    pub(crate) capmatches: Captures,
    pub(crate) pikevm: wrappers::PikeVMCache,
    // pub(crate) backtrack: wrappers::BoundedBacktrackerCache,
    // pub(crate) onepass: wrappers::OnePassCache,
    pub(crate) hybrid: wrappers::HybridCache,
    // pub(crate) revhybrid: wrappers::ReverseHybridCache,
}

impl Cache {
    /// Creates a new `Cache` for use with this regex.
    ///
    /// The cache returned should only be used for searches for the given
    /// `Regex`. If you want to reuse the cache for another `Regex`, then you
    /// must call [`Cache::reset`] with that `Regex`.
    pub fn new(re: &Regex) -> Cache {
        re.create_cache()
    }

    /// Reset this cache such that it can be used for searching with the given
    /// `Regex` (and only that `Regex`).
    ///
    /// A cache reset permits potentially reusing memory already allocated in
    /// this cache with a different `Regex`.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different `Regex`.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{meta::Regex, Match, Input};
    ///
    /// let re1 = Regex::new(r"\w")?;
    /// let re2 = Regex::new(r"\W")?;
    ///
    /// let mut cache = re1.create_cache();
    /// assert_eq!(
    ///     Some(Match::must(0, 0..2)),
    ///     re1.search_with(&mut cache, &mut Input::new("Δ")),
    /// );
    ///
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the Regex we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 're1' is also not
    /// // allowed.
    /// cache.reset(&re2);
    /// assert_eq!(
    ///     Some(Match::must(0, 0..3)),
    ///     re2.search_with(&mut cache, &mut Input::new("☃")),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset(&mut self, re: &Regex) {
        re.imp.strat.reset_cache(self)
    }

    /// Returns the heap memory usage, in bytes, of this cache.
    ///
    /// This does **not** include the stack size used up by this cache. To
    /// compute that, use `std::mem::size_of::<Cache>()`.
    pub fn memory_usage(&self) -> usize {
        let mut bytes = 0;
        bytes += self.pikevm.memory_usage();
        // bytes += self.backtrack.memory_usage();
        // bytes += self.onepass.memory_usage();
        bytes += self.hybrid.memory_usage();
        // bytes += self.revhybrid.memory_usage();
        bytes
    }
}

/// An object describing the configuration of a `Regex`.
///
/// This configuration only includes options for the
/// non-syntax behavior of a `Regex`, and can be applied via the
/// [`Builder::configure`] method. For configuring the syntax options, see
/// [`util::syntax::Config`](crate::util::syntax::Config).
///
/// # Example: lower the NFA size limit
///
/// In some cases, the default size limit might be too big. The size limit can
/// be lowered, which will prevent large regex patterns from compiling.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::meta::Regex;
///
/// let result = Regex::builder()
///     .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10))))
///     // Not even 20KB is enough to build a single large Unicode class!
///     .build(r"\pL");
/// assert!(result.is_err());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug, Default)]
pub struct Config {
    // As with other configuration types in this crate, we put all our knobs
    // in options so that we can distinguish between "default" and "not set."
    // This makes it possible to easily combine multiple configurations
    // without default values overwriting explicitly specified values. See the
    // 'overwrite' method.
    //
    // For docs on the fields below, see the corresponding method setters.
    match_kind: Option<MatchKind>,
    utf8_empty: Option<bool>,
    autopre: Option<bool>,
    pre: Option<Option<Prefilter>>,
    which_captures: Option<WhichCaptures>,
    nfa_size_limit: Option<Option<usize>>,
    onepass_size_limit: Option<Option<usize>>,
    hybrid_cache_capacity: Option<usize>,
    hybrid: Option<bool>,
    dfa: Option<bool>,
    dfa_size_limit: Option<Option<usize>>,
    dfa_state_limit: Option<Option<usize>>,
    // onepass: Option<bool>,
    backtrack: Option<bool>,
    byte_classes: Option<bool>,
    line_terminator: Option<u8>,
}

impl Config {
    /// Create a new configuration object for a `Regex`.
    pub fn new() -> Config {
        Config::default()
    }

    /// Set the match semantics for a `Regex`.
    ///
    /// The default value is [`MatchKind::LeftmostFirst`].
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Match, MatchKind};
    ///
    /// // By default, leftmost-first semantics are used, which
    /// // disambiguates matches at the same position by selecting
    /// // the one that corresponds earlier in the pattern.
    /// let re = Regex::new("sam|samwise")?;
    /// assert_eq!(Some(Match::must(0, 0..3)), re.find("samwise"));
    ///
    /// // But with 'all' semantics, match priority is ignored
    /// // and all match states are included. When coupled with
    /// // a leftmost search, the search will report the last
    /// // possible match.
    /// let re = Regex::builder()
    ///     .configure(Regex::config().match_kind(MatchKind::All))
    ///     .build("sam|samwise")?;
    /// assert_eq!(Some(Match::must(0, 0..7)), re.find("samwise"));
    /// // Beware that this can lead to skipping matches!
    /// // Usually 'all' is used for anchored reverse searches
    /// // only, or for overlapping searches.
    /// assert_eq!(Some(Match::must(0, 4..11)), re.find("sam samwise"));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn match_kind(self, kind: MatchKind) -> Config {
        Config { match_kind: Some(kind), ..self }
    }

    /// Toggles whether empty matches are permitted to occur between the code
    /// units of a UTF-8 encoded codepoint.
    ///
    /// This should generally be enabled when searching a `&str` or anything
    /// that you otherwise know is valid UTF-8. It should be disabled in all
    /// other cases. Namely, if the haystack is not valid UTF-8 and this is
    /// enabled, then behavior is unspecified.
    ///
    /// By default, this is enabled.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Match};
    ///
    /// let re = Regex::new("")?;
    /// let got: Vec<Match> = re.find_iter("☃").collect();
    /// // Matches only occur at the beginning and end of the snowman.
    /// assert_eq!(got, vec![
    ///     Match::must(0, 0..0),
    ///     Match::must(0, 3..3),
    /// ]);
    ///
    /// let re = Regex::builder()
    ///     .configure(Regex::config().utf8_empty(false))
    ///     .build("")?;
    /// let got: Vec<Match> = re.find_iter("☃").collect();
    /// // Matches now occur at every position!
    /// assert_eq!(got, vec![
    ///     Match::must(0, 0..0),
    ///     Match::must(0, 1..1),
    ///     Match::must(0, 2..2),
    ///     Match::must(0, 3..3),
    /// ]);
    ///
    /// Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn utf8_empty(self, yes: bool) -> Config {
        Config { utf8_empty: Some(yes), ..self }
    }

    /// Toggles whether automatic prefilter support is enabled.
    ///
    /// If this is disabled and [`Config::prefilter`] is not set, then the
    /// meta regex engine will not use any prefilters. This can sometimes
    /// be beneficial in cases where you know (or have measured) that the
    /// prefilter leads to overall worse search performance.
    ///
    /// By default, this is enabled.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{meta::Regex, Match};
    ///
    /// let re = Regex::builder()
    ///     .configure(Regex::config().auto_prefilter(false))
    ///     .build(r"Bruce \w+")?;
    /// let hay = "Hello Bruce Springsteen!";
    /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay));
    ///
    /// Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn auto_prefilter(self, yes: bool) -> Config {
        Config { autopre: Some(yes), ..self }
    }

    /// Overrides and sets the prefilter to use inside a `Regex`.
    ///
    /// This permits one to forcefully set a prefilter in cases where the
    /// caller knows better than whatever the automatic prefilter logic is
    /// capable of.
    ///
    /// By default, this is set to `None` and an automatic prefilter will be
    /// used if one could be built. (Assuming [`Config::auto_prefilter`] is
    /// enabled, which it is by default.)
    ///
    /// # Example
    ///
    /// This example shows how to set your own prefilter. In the case of a
    /// pattern like `Bruce \w+`, the automatic prefilter is likely to be
    /// constructed in a way that it will look for occurrences of `Bruce `.
    /// In most cases, this is the best choice. But in some cases, it may be
    /// the case that running `memchr` on `B` is the best choice. One can
    /// achieve that behavior by overriding the automatic prefilter logic
    /// and providing a prefilter that just matches `B`.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     meta::Regex,
    ///     util::prefilter::Prefilter,
    ///     Match, MatchKind,
    /// };
    ///
    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["B"])
    ///     .expect("a prefilter");
    /// let re = Regex::builder()
    ///     .configure(Regex::config().prefilter(Some(pre)))
    ///     .build(r"Bruce \w+")?;
    /// let hay = "Hello Bruce Springsteen!";
    /// assert_eq!(Some(Match::must(0, 6..23)), re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// # Example: incorrect prefilters can lead to incorrect results!
    ///
    /// Be warned that setting an incorrect prefilter can lead to missed
    /// matches. So if you use this option, ensure your prefilter can _never_
    /// report false negatives. (A false positive is, on the other hand, quite
    /// okay and generally unavoidable.)
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     meta::Regex,
    ///     util::prefilter::Prefilter,
    ///     Match, MatchKind,
    /// };
    ///
    /// let pre = Prefilter::new(MatchKind::LeftmostFirst, &["Z"])
    ///     .expect("a prefilter");
    /// let re = Regex::builder()
    ///     .configure(Regex::config().prefilter(Some(pre)))
    ///     .build(r"Bruce \w+")?;
    /// let hay = "Hello Bruce Springsteen!";
    /// // Oops! No match found, but there should be one!
    /// assert_eq!(None, re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn prefilter(self, pre: Option<Prefilter>) -> Config {
        Config { pre: Some(pre), ..self }
    }

    /// Configures what kinds of groups are compiled as "capturing" in the
    /// underlying regex engine.
    ///
    /// This is set to [`WhichCaptures::All`] by default. Callers may wish to
    /// use [`WhichCaptures::Implicit`] in cases where one wants to avoid the
    /// overhead of capture states for explicit groups.
    ///
    /// Note that another approach to avoiding the overhead of capture groups
    /// is by using non-capturing groups in the regex pattern. That is,
    /// `(?:a)` instead of `(a)`. This option is useful when you can't control
    /// the concrete syntax but know that you don't need the underlying capture
    /// states. For example, using `WhichCaptures::Implicit` will behave as if
    /// all explicit capturing groups in the pattern were non-capturing.
    ///
    /// Setting this to `WhichCaptures::None` is usually not the right thing to
    /// do. When no capture states are compiled, some regex engines (such as
    /// the `PikeVM`) won't be able to report match offsets. This will manifest
    /// as no match being found.
    ///
    /// # Example
    ///
    /// This example demonstrates how the results of capture groups can change
    /// based on this option. First we show the default (all capture groups in
    /// the pattern are capturing):
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Match, Span};
    ///
    /// let re = Regex::new(r"foo([0-9]+)bar")?;
    /// let hay = "foo123bar";
    ///
    /// let mut caps = re.create_captures();
    /// re.captures(hay, &mut caps);
    /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0));
    /// assert_eq!(Some(Span::from(3..6)), caps.get_group(1));
    ///
    /// Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// And now we show the behavior when we only include implicit capture
    /// groups. In this case, we can only find the overall match span, but the
    /// spans of any other explicit group don't exist because they are treated
    /// as non-capturing. (In effect, when `WhichCaptures::Implicit` is used,
    /// there is no real point in using [`Regex::captures`] since it will never
    /// be able to report more information than [`Regex::find`].)
    ///
    /// ```
    /// use regex_automata::{
    ///     meta::Regex,
    ///     nfa::thompson::WhichCaptures,
    ///     Match,
    ///     Span,
    /// };
    ///
    /// let re = Regex::builder()
    ///     .configure(Regex::config().which_captures(WhichCaptures::Implicit))
    ///     .build(r"foo([0-9]+)bar")?;
    /// let hay = "foo123bar";
    ///
    /// let mut caps = re.create_captures();
    /// re.captures(hay, &mut caps);
    /// assert_eq!(Some(Span::from(0..9)), caps.get_group(0));
    /// assert_eq!(None, caps.get_group(1));
    ///
    /// Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn which_captures(mut self, which_captures: WhichCaptures) -> Config {
        self.which_captures = Some(which_captures);
        self
    }

    /// Sets the size limit, in bytes, to enforce on the construction of every
    /// NFA built by the meta regex engine.
    ///
    /// Setting it to `None` disables the limit. This is not recommended if
    /// you're compiling untrusted patterns.
    ///
    /// Note that this limit is applied to _each_ NFA built, and if any of
    /// them exceed the limit, then construction will fail. This limit does
    /// _not_ correspond to the total memory used by all NFAs in the meta regex
    /// engine.
    ///
    /// This defaults to some reasonable number that permits most reasonable
    /// patterns.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::meta::Regex;
    ///
    /// let result = Regex::builder()
    ///     .configure(Regex::config().nfa_size_limit(Some(20 * (1<<10))))
    ///     // Not even 20KB is enough to build a single large Unicode class!
    ///     .build(r"\pL");
    /// assert!(result.is_err());
    ///
    /// // But notice that building such a regex with the exact same limit
    /// // can succeed depending on other aspects of the configuration. For
    /// // example, a single *forward* NFA will (at time of writing) fit into
    /// // the 20KB limit, but a *reverse* NFA of the same pattern will not.
    /// // So if one configures a meta regex such that a reverse NFA is never
    /// // needed and thus never built, then the 20KB limit will be enough for
    /// // a pattern like \pL!
    /// let result = Regex::builder()
    ///     .configure(Regex::config()
    ///         .nfa_size_limit(Some(20 * (1<<10)))
    ///         // The DFAs are the only thing that (currently) need a reverse
    ///         // NFA. So if both are disabled, the meta regex engine will
    ///         // skip building the reverse NFA. Note that this isn't an API
    ///         // guarantee. A future semver compatible version may introduce
    ///         // new use cases for a reverse NFA.
    ///         .hybrid(false)
    ///         .dfa(false)
    ///     )
    ///     // Not even 20KB is enough to build a single large Unicode class!
    ///     .build(r"\pL");
    /// assert!(result.is_ok());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn nfa_size_limit(self, limit: Option<usize>) -> Config {
        Config { nfa_size_limit: Some(limit), ..self }
    }

    /// Sets the size limit, in bytes, for the one-pass DFA.
    ///
    /// Setting it to `None` disables the limit. Disabling the limit is
    /// strongly discouraged when compiling untrusted patterns. Even if the
    /// patterns are trusted, it still may not be a good idea, since a one-pass
    /// DFA can use a lot of memory. With that said, as the size of a regex
    /// increases, the likelihood of it being one-pass likely decreases.
    ///
    /// This defaults to some reasonable number that permits most reasonable
    /// one-pass patterns.
    ///
    /// # Example
    ///
    /// This shows how to set the one-pass DFA size limit. Note that since
    /// a one-pass DFA is an optional component of the meta regex engine,
    /// this size limit only impacts what is built internally and will never
    /// determine whether a `Regex` itself fails to build.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::meta::Regex;
    ///
    /// let result = Regex::builder()
    ///     .configure(Regex::config().onepass_size_limit(Some(2 * (1<<20))))
    ///     .build(r"\pL{5}");
    /// assert!(result.is_ok());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn onepass_size_limit(self, limit: Option<usize>) -> Config {
        Config { onepass_size_limit: Some(limit), ..self }
    }

    /// Set the cache capacity, in bytes, for the lazy DFA.
    ///
    /// The cache capacity of the lazy DFA determines approximately how much
    /// heap memory it is allowed to use to store its state transitions. The
    /// state transitions are computed at search time, and if the cache fills
    /// up, it is cleared. At this point, any previously generated state
    /// transitions are lost and are re-generated if they're needed again.
    ///
    /// This sort of cache filling and clearing works quite well _so long as
    /// cache clearing happens infrequently_. If it happens too often, then the
    /// meta regex engine will stop using the lazy DFA and switch over to a
    /// different regex engine.
    ///
    /// In cases where the cache is cleared too often, it may be possible to
    /// give the cache more space and reduce (or eliminate) how often it is
    /// cleared. Similarly, sometimes a regex is so big that the lazy DFA isn't
    /// used at all if its cache capacity isn't big enough.
    ///
    /// The capacity set here is a _limit_ on how much memory is used. The
    /// actual memory used is only allocated as it's needed.
    ///
    /// Determining the right value for this is a little tricky and will likely
    /// require some profiling. Enabling the `logging` feature and setting the
    /// log level to `trace` will also tell you how often the cache is being
    /// cleared.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::meta::Regex;
    ///
    /// let result = Regex::builder()
    ///     .configure(Regex::config().hybrid_cache_capacity(20 * (1<<20)))
    ///     .build(r"\pL{5}");
    /// assert!(result.is_ok());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn hybrid_cache_capacity(self, limit: usize) -> Config {
        Config { hybrid_cache_capacity: Some(limit), ..self }
    }

    /// Sets the size limit, in bytes, for heap memory used for a fully
    /// compiled DFA.
    ///
    /// **NOTE:** If you increase this, you'll likely also need to increase
    /// [`Config::dfa_state_limit`].
    ///
    /// In contrast to the lazy DFA, building a full DFA requires computing
    /// all of its state transitions up front. This can be a very expensive
    /// process, and runs in worst case `2^n` time and space (where `n` is
    /// proportional to the size of the regex). However, a full DFA unlocks
    /// some additional optimization opportunities.
    ///
    /// Because full DFAs can be so expensive, the default limits for them are
    /// incredibly small. Generally speaking, if your regex is moderately big
    /// or if you're using Unicode features (`\w` is Unicode-aware by default
    /// for example), then you can expect that the meta regex engine won't even
    /// attempt to build a DFA for it.
    ///
    /// If this and [`Config::dfa_state_limit`] are set to `None`, then the
    /// meta regex will not use any sort of limits when deciding whether to
    /// build a DFA. This in turn makes construction of a `Regex` take
    /// worst case exponential time and space. Even short patterns can result
    /// in huge space blow ups. So it is strongly recommended to keep some kind
    /// of limit set!
    ///
    /// The default is set to a small number that permits some simple regexes
    /// to get compiled into DFAs in reasonable time.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::meta::Regex;
    ///
    /// let result = Regex::builder()
    ///     // 100MB is much bigger than the default.
    ///     .configure(Regex::config()
    ///         .dfa_size_limit(Some(100 * (1<<20)))
    ///         // We don't care about size too much here, so just
    ///         // remove the NFA state limit altogether.
    ///         .dfa_state_limit(None))
    ///     .build(r"\pL{5}");
    /// assert!(result.is_ok());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn dfa_size_limit(self, limit: Option<usize>) -> Config {
        Config { dfa_size_limit: Some(limit), ..self }
    }

    /// Sets a limit on the total number of NFA states, beyond which, a full
    /// DFA is not attempted to be compiled.
    ///
    /// This limit works in concert with [`Config::dfa_size_limit`].
    /// Namely, whereas `Config::dfa_size_limit` is applied by attempting to
    /// construct a DFA, this limit is used to avoid the attempt in the first
    /// place. This is useful to avoid hefty initialization costs associated
    /// with building a DFA for cases where it is obvious the DFA will
    /// ultimately be too big.
    ///
    /// By default, this is set to a very small number.
    ///
    /// # Example
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::meta::Regex;
    ///
    /// let result = Regex::builder()
    ///     .configure(Regex::config()
    ///         // Sometimes the default state limit rejects DFAs even
    ///         // if they would fit in the size limit. Here, we disable
    ///         // the check on the number of NFA states and just rely on
    ///         // the size limit.
    ///         .dfa_state_limit(None))
    ///     .build(r"(?-u)\w{30}");
    /// assert!(result.is_ok());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn dfa_state_limit(self, limit: Option<usize>) -> Config {
        Config { dfa_state_limit: Some(limit), ..self }
    }

    /// Whether to attempt to shrink the size of the alphabet for the regex
    /// pattern or not. When enabled, the alphabet is shrunk into a set of
    /// equivalence classes, where every byte in the same equivalence class
    /// cannot discriminate between a match or non-match.
    ///
    /// **WARNING:** This is only useful for debugging DFAs. Disabling this
    /// does not yield any speed advantages. Indeed, disabling it can result
    /// in much higher memory usage. Disabling byte classes is useful for
    /// debugging the actual generated transitions because it lets one see the
    /// transitions defined on actual bytes instead of the equivalence classes.
    ///
    /// This option is enabled by default and should never be disabled unless
    /// one is debugging the meta regex engine's internals.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Match};
    ///
    /// let re = Regex::builder()
    ///     .configure(Regex::config().byte_classes(false))
    ///     .build(r"[a-z]+")?;
    /// let hay = "!!quux!!";
    /// assert_eq!(Some(Match::must(0, 2..6)), re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn byte_classes(self, yes: bool) -> Config {
        Config { byte_classes: Some(yes), ..self }
    }

    /// Set the line terminator to be used by the `^` and `$` anchors in
    /// multi-line mode.
    ///
    /// This option has no effect when CRLF mode is enabled. That is,
    /// regardless of this setting, `(?Rm:^)` and `(?Rm:$)` will always treat
    /// `\r` and `\n` as line terminators (and will never match between a `\r`
    /// and a `\n`).
    ///
    /// By default, `\n` is the line terminator.
    ///
    /// **Warning**: This does not change the behavior of `.`. To do that,
    /// you'll need to configure the syntax option
    /// [`syntax::Config::line_terminator`](crate::util::syntax::Config::line_terminator)
    /// in addition to this. Otherwise, `.` will continue to match any
    /// character other than `\n`.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{meta::Regex, util::syntax, Match};
    ///
    /// let re = Regex::builder()
    ///     .syntax(syntax::Config::new().multi_line(true))
    ///     .configure(Regex::config().line_terminator(b'\x00'))
    ///     .build(r"^foo$")?;
    /// let hay = "\x00foo\x00";
    /// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn line_terminator(self, byte: u8) -> Config {
        Config { line_terminator: Some(byte), ..self }
    }

    /// Toggle whether the hybrid NFA/DFA (also known as the "lazy DFA") should
    /// be available for use by the meta regex engine.
    ///
    /// Enabling this does not necessarily mean that the lazy DFA will
    /// definitely be used.
    /// It just means that it will be _available_ for use if the meta regex
    /// engine thinks it will be useful.
    ///
    /// When the `hybrid` crate feature is enabled, then this is enabled by
    /// default. Otherwise, if the crate feature is disabled, then this is
    /// always disabled, regardless of its setting by the caller.
    pub fn hybrid(self, yes: bool) -> Config {
        Config { hybrid: Some(yes), ..self }
    }

    /// Toggle whether a fully compiled DFA should be available for use by the
    /// meta regex engine.
    ///
    /// Enabling this does not necessarily mean that a DFA will definitely be
    /// used. It just means that it will be _available_ for use if the meta
    /// regex engine thinks it will be useful.
    ///
    /// When the `dfa-build` crate feature is enabled, then this is enabled by
    /// default. Otherwise, if the crate feature is disabled, then this is
    /// always disabled, regardless of its setting by the caller.
    pub fn dfa(self, yes: bool) -> Config {
        Config { dfa: Some(yes), ..self }
    }

    // /// Toggle whether a one-pass DFA should be available for use by the meta
    // /// regex engine.
    // ///
    // /// Enabling this does not necessarily mean that a one-pass DFA will
    // /// definitely be used. It just means that it will be _available_ for
    // /// use if the meta regex engine thinks it will be useful. (Indeed, a
    // /// one-pass DFA can only be used when the regex is one-pass. See the
    // /// [`dfa::onepass`](crate::dfa::onepass) module for more details.)
    // ///
    // /// When the `dfa-onepass` crate feature is enabled, then this is enabled
    // /// by default. Otherwise, if the crate feature is disabled, then this is
    // /// always disabled, regardless of its setting by the caller.
    // pub fn onepass(self, yes: bool) -> Config {
    //     Config { onepass: Some(yes), ..self }
    // }

    /// Toggle whether a bounded backtracking regex engine should be available
    /// for use by the meta regex engine.
    ///
    /// Enabling this does not necessarily mean that a bounded backtracker will
    /// definitely be used. It just means that it will be _available_ for use
    /// if the meta regex engine thinks it will be useful.
    ///
    /// When the `nfa-backtrack` crate feature is enabled, then this is enabled
    /// by default. Otherwise, if the crate feature is disabled, then this is
    /// always disabled, regardless of its setting by the caller.
    pub fn backtrack(self, yes: bool) -> Config {
        Config { backtrack: Some(yes), ..self }
    }

    /// Returns the match kind on this configuration, as set by
    /// [`Config::match_kind`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_match_kind(&self) -> MatchKind {
        self.match_kind.unwrap_or(MatchKind::LeftmostFirst)
    }

    /// Returns whether empty matches must fall on valid UTF-8 boundaries, as
    /// set by [`Config::utf8_empty`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_utf8_empty(&self) -> bool {
        self.utf8_empty.unwrap_or(true)
    }

    /// Returns whether automatic prefilters are enabled, as set by
    /// [`Config::auto_prefilter`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_auto_prefilter(&self) -> bool {
        self.autopre.unwrap_or(true)
    }

    /// Returns a manually set prefilter, if one was set by
    /// [`Config::prefilter`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_prefilter(&self) -> Option<&Prefilter> {
        self.pre.as_ref().unwrap_or(&None).as_ref()
    }

    /// Returns the capture configuration, as set by
    /// [`Config::which_captures`].
    ///
    /// If it was not explicitly set, then a default value is returned.
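    ///
    /// # Example
    ///
    /// A minimal sketch of reading the configured value back (illustrative
    /// only; `matches!` is used here so the example doesn't assume
    /// `WhichCaptures` implements `PartialEq`):
    ///
    /// ```
    /// use regex_automata::{meta::Regex, nfa::thompson::WhichCaptures};
    ///
    /// let config = Regex::config().which_captures(WhichCaptures::Implicit);
    /// assert!(matches!(config.get_which_captures(), WhichCaptures::Implicit));
    /// ```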
    pub fn get_which_captures(&self) -> WhichCaptures {
        self.which_captures.unwrap_or(WhichCaptures::All)
    }

    /// Returns NFA size limit, as set by [`Config::nfa_size_limit`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_nfa_size_limit(&self) -> Option<usize> {
        self.nfa_size_limit.unwrap_or(Some(10 * (1 << 20)))
    }

    // /// Returns one-pass DFA size limit, as set by
    // /// [`Config::onepass_size_limit`].
    // ///
    // /// If it was not explicitly set, then a default value is returned.
    // pub fn get_onepass_size_limit(&self) -> Option<usize> {
    //     self.onepass_size_limit.unwrap_or(Some(1 << 20))
    // }

    /// Returns hybrid NFA/DFA cache capacity, as set by
    /// [`Config::hybrid_cache_capacity`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_hybrid_cache_capacity(&self) -> usize {
        self.hybrid_cache_capacity.unwrap_or(2 * (1 << 20))
    }

    /// Returns DFA size limit, as set by [`Config::dfa_size_limit`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_dfa_size_limit(&self) -> Option<usize> {
        // The default for this is VERY small because building a full DFA is
        // ridiculously costly. But for regexes that are very small, it can be
        // beneficial to use a full DFA. In particular, a full DFA can enable
        // additional optimizations via something called "accelerated" states.
        // Namely, when there's a state with only a few outgoing transitions,
        // we can temporarily suspend walking the transition table and use
        // memchr for just those outgoing transitions to skip ahead very
        // quickly.
        //
        // Generally speaking, if Unicode is enabled in your regex and you're
        // using some kind of Unicode feature, then it's going to blow this
        // size limit. Moreover, Unicode tends to defeat the "accelerated"
        // state optimization too, so it's a double whammy.
        //
        // We also use a limit on the number of NFA states to avoid even
        // starting the DFA construction process. Namely, DFA construction
        // itself could make lots of initial allocs proportional to the size
        // of the NFA, and if the NFA is large, it doesn't make sense to pay
        // that cost if we know it's likely to be blown by a large margin.
        self.dfa_size_limit.unwrap_or(Some(40 * (1 << 10)))
    }

    /// Returns DFA size limit in terms of the number of states in the NFA, as
    /// set by [`Config::dfa_state_limit`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_dfa_state_limit(&self) -> Option<usize> {
        // Again, as with the size limit, we keep this very small.
        self.dfa_state_limit.unwrap_or(Some(30))
    }

    /// Returns whether byte classes are enabled, as set by
    /// [`Config::byte_classes`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_byte_classes(&self) -> bool {
        self.byte_classes.unwrap_or(true)
    }

    /// Returns the line terminator for this configuration, as set by
    /// [`Config::line_terminator`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_line_terminator(&self) -> u8 {
        self.line_terminator.unwrap_or(b'\n')
    }

    /// Returns whether the hybrid NFA/DFA regex engine may be used, as set by
    /// [`Config::hybrid`].
    ///
    /// If it was not explicitly set, then a default value is returned.
    pub fn get_hybrid(&self) -> bool {
        self.hybrid.unwrap_or(true)
    }

    /// Returns whether the DFA regex engine may be used, as set by
    /// [`Config::dfa`].
    ///
    /// If it was not explicitly set, then a default value is returned.
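    ///
    /// # Example
    ///
    /// A tiny illustrative sketch mirroring the setter above:
    ///
    /// ```
    /// use regex_automata::meta::Regex;
    ///
    /// // DFAs are permitted by default, and an explicit setting reads back.
    /// assert!(Regex::config().get_dfa());
    /// assert!(!Regex::config().dfa(false).get_dfa());
    /// ```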
    pub fn get_dfa(&self) -> bool {
        self.dfa.unwrap_or(true)
    }

    // /// Returns whether the one-pass DFA regex engine may be used, as set by
    // /// [`Config::onepass`].
    // ///
    // /// If it was not explicitly set, then a default value is returned.
    // pub fn get_onepass(&self) -> bool {
    //     self.onepass.unwrap_or(true)
    // }

    // /// Returns whether the bounded backtracking regex engine may be used, as
    // /// set by [`Config::backtrack`].
    // ///
    // /// If it was not explicitly set, then a default value is returned.
    // pub fn get_backtrack(&self) -> bool {
    //     #[cfg(feature = "nfa-backtrack")]
    //     {
    //         self.backtrack.unwrap_or(true)
    //     }
    //     #[cfg(not(feature = "nfa-backtrack"))]
    //     {
    //         false
    //     }
    // }

    /// Overwrite the default configuration such that the options in `o` are
    /// always used. If an option in `o` is not set, then the corresponding
    /// option in `self` is used. If it's not set in `self` either, then it
    /// remains not set.
    pub(crate) fn overwrite(&self, o: Config) -> Config {
        Config {
            match_kind: o.match_kind.or(self.match_kind),
            utf8_empty: o.utf8_empty.or(self.utf8_empty),
            autopre: o.autopre.or(self.autopre),
            pre: o.pre.or_else(|| self.pre.clone()),
            which_captures: o.which_captures.or(self.which_captures),
            nfa_size_limit: o.nfa_size_limit.or(self.nfa_size_limit),
            onepass_size_limit: o.onepass_size_limit.or(self.onepass_size_limit),
            hybrid_cache_capacity: o.hybrid_cache_capacity.or(self.hybrid_cache_capacity),
            hybrid: o.hybrid.or(self.hybrid),
            dfa: o.dfa.or(self.dfa),
            dfa_size_limit: o.dfa_size_limit.or(self.dfa_size_limit),
            dfa_state_limit: o.dfa_state_limit.or(self.dfa_state_limit),
            // onepass: o.onepass.or(self.onepass),
            backtrack: o.backtrack.or(self.backtrack),
            byte_classes: o.byte_classes.or(self.byte_classes),
            line_terminator: o.line_terminator.or(self.line_terminator),
        }
    }
}

/// A builder for configuring and constructing a `Regex`.
///
/// The builder permits configuring two different aspects of a `Regex`:
///
/// * [`Builder::configure`] will set high-level configuration options as
/// described by a [`Config`].
/// * [`Builder::syntax`] will set the syntax level configuration options
/// as described by a [`util::syntax::Config`](crate::util::syntax::Config).
/// This only applies when building a `Regex` from pattern strings.
///
/// Once configured, the builder can then be used to construct a `Regex` from
/// one of 4 different inputs:
///
/// * [`Builder::build`] creates a regex from a single pattern string.
/// * [`Builder::build_many`] creates a regex from many pattern strings.
/// * [`Builder::build_from_hir`] creates a regex from a
/// [`regex-syntax::Hir`](Hir) expression.
/// * [`Builder::build_many_from_hir`] creates a regex from many
/// [`regex-syntax::Hir`](Hir) expressions.
///
/// The latter two methods in particular provide a way to construct a fully
/// featured regular expression matcher directly from an `Hir` expression
/// without having to first convert it to a string. (This is in contrast to the
/// top-level `regex` crate which intentionally provides no such API in order
/// to avoid making `regex-syntax` a public dependency.)
///
/// As a convenience, this builder may be created via [`Regex::builder`], which
/// may help avoid an extra import.
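///
/// # Example
///
/// A minimal end-to-end sketch of the builder flow (an illustrative addition;
/// it only exercises APIs shown elsewhere in this module):
///
/// ```
/// use regex_automata::{meta::Regex, Match, MatchKind};
///
/// let re = Regex::builder()
///     .configure(Regex::config().match_kind(MatchKind::LeftmostFirst))
///     .build(r"foo\w+")?;
/// assert_eq!(Some(Match::must(0, 0..6)), re.find("foobar"));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```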
///
/// # Example: change the line terminator
///
/// This example shows how to enable multi-line mode by default and change the
/// line terminator to the NUL byte:
///
/// ```
/// use regex_automata::{meta::Regex, util::syntax, Match};
///
/// let re = Regex::builder()
///     .syntax(syntax::Config::new().multi_line(true))
///     .configure(Regex::config().line_terminator(b'\x00'))
///     .build(r"^foo$")?;
/// let hay = "\x00foo\x00";
/// assert_eq!(Some(Match::must(0, 1..4)), re.find(hay));
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: disable UTF-8 requirement
///
/// By default, regex patterns are required to match UTF-8. This includes
/// regex patterns that can produce matches of length zero. In the case of an
/// empty match, by default, matches will not appear between the code units of
/// a UTF-8 encoded codepoint.
///
/// However, it can be useful to disable this requirement, particularly if
/// you're searching things like `&[u8]` that are not known to be valid UTF-8.
///
/// ```
/// use regex_automata::{meta::Regex, util::syntax, Match};
///
/// let mut builder = Regex::builder();
/// // Disables the requirement that non-empty matches match UTF-8.
/// builder.syntax(syntax::Config::new().utf8(false));
/// // Disables the requirement that empty matches match UTF-8 boundaries.
/// builder.configure(Regex::config().utf8_empty(false));
///
/// // We can match raw bytes via \xZZ syntax, but we need to disable
/// // Unicode mode to do that. We could disable it everywhere, or just
/// // selectively, as shown here.
/// let re = builder.build(r"(?-u:\xFF)foo(?-u:\xFF)")?;
/// let hay = b"\xFFfoo\xFF";
/// assert_eq!(Some(Match::must(0, 0..5)), re.find(hay));
///
/// // We can also match between code units.
/// let re = builder.build(r"")?;
/// let hay = "☃";
/// assert_eq!(re.find_iter(hay).collect::<Vec<_>>(), vec![
///     Match::must(0, 0..0),
///     Match::must(0, 1..1),
///     Match::must(0, 2..2),
///     Match::must(0, 3..3),
/// ]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
#[derive(Clone, Debug)]
pub struct Builder {
    config: Config,
    ast: ast::parse::ParserBuilder,
    hir: hir::translate::TranslatorBuilder,
}

impl Builder {
    /// Creates a new builder for configuring and constructing a [`Regex`].
    pub fn new() -> Builder {
        Builder {
            config: Config::default(),
            ast: ast::parse::ParserBuilder::new(),
            hir: hir::translate::TranslatorBuilder::new(),
        }
    }

    /// Builds a `Regex` from a single pattern string.
    ///
    /// If there was a problem parsing the pattern or a problem turning it into
    /// a regex matcher, then an error is returned.
    ///
    /// # Example
    ///
    /// This example shows how to configure syntax options.
    ///
    /// ```
    /// use regex_automata::{meta::Regex, util::syntax, Match};
    ///
    /// let re = Regex::builder()
    ///     .syntax(syntax::Config::new().crlf(true).multi_line(true))
    ///     .build(r"^foo$")?;
    /// let hay = "\r\nfoo\r\n";
    /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build(&self, pattern: &str) -> Result<Regex, BuildError> {
        self.build_many(&[pattern])
    }

    /// Builds a `Regex` from many pattern strings.
    ///
    /// If there was a problem parsing any of the patterns or a problem turning
    /// them into a regex matcher, then an error is returned.
    ///
    /// # Example: finding the pattern that caused an error
    ///
    /// When a syntax error occurs, it is possible to ask which pattern
    /// caused the syntax error.
    ///
    /// ```
    /// use regex_automata::{meta::Regex, PatternID};
    ///
    /// let err = Regex::builder()
    ///     .build_many(&["a", "b", r"\p{Foo}", "c"])
    ///     .unwrap_err();
    /// assert_eq!(Some(PatternID::must(2)), err.pattern());
    /// ```
    ///
    /// # Example: zero patterns is valid
    ///
    /// Building a regex with zero patterns results in a regex that never
    /// matches anything. Because this routine is generic, passing an empty
    /// slice usually requires a turbo-fish (or something else to help type
    /// inference).
    ///
    /// ```
    /// use regex_automata::{meta::Regex, util::syntax, Match};
    ///
    /// let re = Regex::builder()
    ///     .build_many::<&str>(&[])?;
    /// assert_eq!(None, re.find(""));
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many<P: AsRef<str>>(&self, patterns: &[P]) -> Result<Regex, BuildError> {
        use crate::util::primitives::IteratorIndexExt;
        let (mut asts, mut hirs) = (vec![], vec![]);
        for (pid, p) in patterns.iter().with_pattern_ids() {
            let ast = self.ast.build().parse(p.as_ref()).map_err(|err| BuildError::ast(pid, err))?;
            asts.push(ast);
        }
        for ((pid, p), ast) in patterns.iter().with_pattern_ids().zip(asts.iter()) {
            let hir = self
                .hir
                .build()
                .translate(p.as_ref(), ast)
                .map_err(|err| BuildError::hir(pid, err))?;
            hirs.push(hir);
        }
        self.build_many_from_hir(&hirs)
    }

    /// Builds a `Regex` directly from an `Hir` expression.
    ///
    /// This is useful if you needed to parse a pattern string into an `Hir`
    /// for other reasons (such as analysis or transformations). This routine
    /// permits building a `Regex` directly from the `Hir` expression instead
    /// of first converting the `Hir` back to a pattern string.
    ///
    /// When using this method, any options set via [`Builder::syntax`] are
    /// ignored. Namely, the syntax options only apply when parsing a pattern
    /// string, which isn't relevant here.
    ///
    /// If there was a problem building the underlying regex matcher for the
    /// given `Hir`, then an error is returned.
    ///
    /// # Example
    ///
    /// This example shows how one can hand-construct an `Hir` expression and
    /// build a regex from it without doing any parsing at all.
    ///
    /// ```
    /// use {
    ///     regex_automata::{meta::Regex, Match},
    ///     regex_syntax::hir::{Hir, Look},
    /// };
    ///
    /// // (?Rm)^foo$
    /// let hir = Hir::concat(vec![
    ///     Hir::look(Look::StartCRLF),
    ///     Hir::literal("foo".as_bytes()),
    ///     Hir::look(Look::EndCRLF),
    /// ]);
    /// let re = Regex::builder()
    ///     .build_from_hir(&hir)?;
    /// let hay = "\r\nfoo\r\n";
    /// assert_eq!(Some(Match::must(0, 2..5)), re.find(hay));
    ///
    /// Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_from_hir(&self, hir: &Hir) -> Result<Regex, BuildError> {
        self.build_many_from_hir(&[hir])
    }

    /// Builds a `Regex` directly from many `Hir` expressions.
    ///
    /// This is useful if you needed to parse pattern strings into `Hir`
    /// expressions for other reasons (such as analysis or transformations).
    /// This routine permits building a `Regex` directly from the `Hir`
    /// expressions instead of first converting the `Hir` expressions back to
    /// pattern strings.
    ///
    /// When using this method, any options set via [`Builder::syntax`] are
    /// ignored. Namely, the syntax options only apply when parsing a pattern
    /// string, which isn't relevant here.
    ///
    /// If there was a problem building the underlying regex matcher for the
    /// given `Hir` expressions, then an error is returned.
    ///
    /// Note that unlike [`Builder::build_many`], this can only fail as a
    /// result of building the underlying matcher. In that case, there is
    /// no single `Hir` expression that can be isolated as a reason for the
    /// failure.
    /// So if this routine fails, it's not possible to determine which
    /// `Hir` expression caused the failure.
    ///
    /// # Example
    ///
    /// This example shows how one can hand-construct multiple `Hir`
    /// expressions and build a single regex from them without doing any
    /// parsing at all.
    ///
    /// ```
    /// use {
    ///     regex_automata::{meta::Regex, Match},
    ///     regex_syntax::hir::{Hir, Look},
    /// };
    ///
    /// // (?Rm)^foo$
    /// let hir1 = Hir::concat(vec![
    ///     Hir::look(Look::StartCRLF),
    ///     Hir::literal("foo".as_bytes()),
    ///     Hir::look(Look::EndCRLF),
    /// ]);
    /// // (?Rm)^bar$
    /// let hir2 = Hir::concat(vec![
    ///     Hir::look(Look::StartCRLF),
    ///     Hir::literal("bar".as_bytes()),
    ///     Hir::look(Look::EndCRLF),
    /// ]);
    /// let re = Regex::builder()
    ///     .build_many_from_hir(&[&hir1, &hir2])?;
    /// let hay = "\r\nfoo\r\nbar";
    /// let got: Vec<Match> = re.find_iter(hay).collect();
    /// let expected = vec![
    ///     Match::must(0, 2..5),
    ///     Match::must(1, 7..10),
    /// ];
    /// assert_eq!(expected, got);
    ///
    /// Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn build_many_from_hir<H: Borrow<Hir>>(&self, hirs: &[H]) -> Result<Regex, BuildError> {
        let config = self.config.clone();
        // We collect the HIRs into a vec so we can write internal routines
        // with '&[&Hir]'. i.e., don't use generics everywhere to keep code
        // bloat down.
        let hirs: Vec<&Hir> = hirs.iter().map(|hir| hir.borrow()).collect();
        let info = RegexInfo::new(config, &hirs);
        let strat = Strategy::new(&info, &hirs)?;
        let pool = {
            let strat = Arc::clone(&strat);
            let create: CachePoolFn = Box::new(move || strat.create_cache());
            Pool::new(create)
        };
        Ok(Regex { imp: Arc::new(RegexI { strat, info }), pool })
    }

    /// Configure the behavior of a `Regex`.
    ///
    /// This configuration controls non-syntax options related to the behavior
    /// of a `Regex`. This includes things like whether empty matches can split
    /// a codepoint, prefilters, line terminators and a long list of options
    /// for configuring which regex engines the meta regex engine will be able
    /// to use internally.
    ///
    /// # Example
    ///
    /// This example shows how to disable UTF-8 empty mode. This will permit
    /// empty matches to occur between the UTF-8 encoding of a codepoint.
    ///
    /// ```
    /// use regex_automata::{meta::Regex, Match};
    ///
    /// let re = Regex::new("")?;
    /// let got: Vec<Match> = re.find_iter("☃").collect();
    /// // Matches only occur at the beginning and end of the snowman.
    /// assert_eq!(got, vec![
    ///     Match::must(0, 0..0),
    ///     Match::must(0, 3..3),
    /// ]);
    ///
    /// let re = Regex::builder()
    ///     .configure(Regex::config().utf8_empty(false))
    ///     .build("")?;
    /// let got: Vec<Match> = re.find_iter("☃").collect();
    /// // Matches now occur at every position!
    /// assert_eq!(got, vec![
    ///     Match::must(0, 0..0),
    ///     Match::must(0, 1..1),
    ///     Match::must(0, 2..2),
    ///     Match::must(0, 3..3),
    /// ]);
    ///
    /// Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn configure(&mut self, config: Config) -> &mut Builder {
        self.config = self.config.overwrite(config);
        self
    }

    /// Configure the syntax options when parsing a pattern string while
    /// building a `Regex`.
    ///
    /// These options _only_ apply when [`Builder::build`] or [`Builder::build_many`]
    /// are used. The other build methods accept `Hir` values, which have
    /// already been parsed.
    ///
    /// # Example
    ///
    /// This example shows how to enable case insensitive mode.
    ///
    /// ```
    /// use regex_automata::{meta::Regex, util::syntax, Match};
    ///
    /// let re = Regex::builder()
    ///     .syntax(syntax::Config::new().case_insensitive(true))
    ///     .build(r"δ")?;
    /// assert_eq!(Some(Match::must(0, 0..2)), re.find(r"Δ"));
    ///
    /// Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn syntax(&mut self, config: regex_automata::util::syntax::Config) -> &mut Builder {
        self.ast
            .ignore_whitespace(config.get_ignore_whitespace())
            .nest_limit(config.get_nest_limit())
            .octal(config.get_octal());
        self.hir
            .unicode(config.get_unicode())
            .case_insensitive(config.get_case_insensitive())
            .multi_line(config.get_multi_line())
            .crlf(config.get_crlf())
            .dot_matches_new_line(config.get_dot_matches_new_line())
            .line_terminator(config.get_line_terminator())
            .swap_greed(config.get_swap_greed())
            .utf8(config.get_utf8());
        self
    }
}

// #[cfg(test)]
// mod tests {
//     use super::*;

//     // I found this in the course of building out the benchmark suite for
//     // rebar.
//     #[test]
//     fn regression_suffix_literal_count() {
//         let _ = env_logger::try_init();
//         let re = Regex::new(r"[a-zA-Z]+ing").unwrap();
//         assert_eq!(1, re.find_iter("tingling").count());
//     }
// }
regex-cursor-0.1.4/src/engines/meta/strategy.rs000064400000000000000000002263721046102023000176430ustar 00000000000000
use core::fmt::Debug;
use std::sync::Arc;

use log::{debug, trace};
use regex_syntax::hir::{literal, Hir};

use crate::{
    cursor::Cursor,
    engines::meta::{
        error::{BuildError, RetryFailError},
        regex::{Cache, RegexInfo},
        wrappers,
    },
    Input,
};
use regex_automata::{
    nfa::thompson::{self, WhichCaptures, NFA},
    util::{
        captures::{Captures, GroupInfo},
        look::LookMatcher,
        prefilter::{self, Prefilter},
        primitives::{NonMaxUsize, PatternID},
    },
    Anchored, HalfMatch, Match, MatchKind,
};

/// A trait that represents a single meta strategy. Its main utility is in
/// providing a way to do dynamic dispatch over a few choices.
///
/// Why dynamic dispatch? I actually don't have a super compelling reason, and
/// importantly, I have not benchmarked it with the main alternative: an enum.
/// I went with dynamic dispatch initially because the regex engine search code
/// really can't be inlined into caller code in most cases because it's just
/// too big. In other words, it is already expected that every regex search
/// will entail at least the cost of a function call.
///
/// I do wonder whether using enums would result in better codegen overall
/// though. It's a worthwhile experiment to try. Probably the most interesting
/// benchmark to run in such a case would be one with a high match count. That
/// is, a benchmark to test the overall latency of a search call.
#[derive(Debug)]
pub(super) struct Strategy(StrategyI);

impl Strategy {
    pub(super) fn new(info: &RegexInfo, hirs: &[&Hir]) -> Result<Arc<Strategy>, BuildError> {
        StrategyI::new(info, hirs).map(|res| Arc::new(Self(res)))
    }

    pub(super) fn group_info(&self) -> &GroupInfo {
        self.0.group_info()
    }

    pub(super) fn create_cache(&self) -> Cache {
        self.0.create_cache()
    }

    pub(super) fn reset_cache(&self, cache: &mut Cache) {
        self.0.reset_cache(cache)
    }

    pub(super) fn is_accelerated(&self) -> bool {
        self.0.is_accelerated()
    }

    pub(super) fn memory_usage(&self) -> usize {
        self.0.memory_usage()
    }

    pub(super) fn search(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
    ) -> Option<Match> {
        self.0.search(cache, input)
    }

    pub(super) fn search_half(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
    ) -> Option<HalfMatch> {
        self.0.search_half(cache, input)
    }

    pub(super) fn is_match(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> bool {
        self.0.is_match(cache, input)
    }

    pub(super) fn search_slots(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        self.0.search_slots(cache, input, slots)
    }
}

#[derive(Debug)]
enum StrategyI {
    Core(Core),
    Pre(Pre),
    ReverseAnchored(ReverseAnchored),
}

impl StrategyI {
    pub(super) fn new(info: &RegexInfo, hirs: &[&Hir]) -> Result<Self, BuildError> {
        // At this point, we're committed to a regex engine of some kind. So
        // pull out a prefilter if we can, which will feed to each of the
        // constituent regex engines.
        let pre = if info.is_always_anchored_start() {
            // PERF: I'm not sure we necessarily want to do this... We may
            // want to run a prefilter for quickly rejecting in some cases.
            // The problem is that anchored searches overlap quite a bit with
            // the use case of "run a regex on every line to extract data."
            // In that case, the regex always matches, so running a prefilter
            // doesn't really help us there. The main place where a prefilter
            // helps in an anchored search is if the anchored search is not
            // expected to match frequently. That is, the prefilter gives us
            // a way to possibly reject a haystack very quickly.
            //
            // Maybe we should use a prefilter, but only for longer haystacks?
            // Or maybe we should only use a prefilter when we think it's
            // "fast"?
            //
            // Interestingly, I think we currently lack the infrastructure
            // for disabling a prefilter based on haystack length. That would
            // probably need to be a new 'Input' option. (Interestingly, an
            // 'Input' used to carry a 'Prefilter' with it, but I moved away
            // from that.)
            debug!("skipping literal extraction since regex is anchored");
            None
        } else if let Some(pre) = info.config().get_prefilter() {
            debug!("skipping literal extraction since the caller provided a prefilter");
            Some(pre.clone())
        } else if info.config().get_auto_prefilter() {
            let kind = info.config().get_match_kind();
            let prefixes = crate::util::prefilter::prefixes(kind, hirs);
            // If we can build a full `Strategy` from just the extracted
            // prefixes, then we can short-circuit and avoid building a regex
            // engine at all.
            if let Some(pre) = Pre::from_prefixes(info, &prefixes) {
                debug!(
                    "found that the regex can be broken down to a literal \
                     search, avoiding the regex engine entirely",
                );
                return Ok(Self::Pre(pre));
            }
            // This now attempts another short-circuit of the regex engine:
            // if we have a huge alternation of just plain literals, then we
            // can just use Aho-Corasick for that and avoid the regex engine
            // entirely.
            //
            // You might think this case would just be handled by
            // `Pre::from_prefixes`, but that technique relies on heuristic
            // literal extraction from the corresponding `Hir`.
            // That works, but part of the heuristics limit the size and
            // number of literals returned. This case will specifically handle
            // patterns with very large alternations.
            //
            // One wonders if we should just roll this into our heuristic
            // literal extraction, and then I think this case could disappear
            // entirely.
            if let Some(pre) = Pre::from_alternation_literals(info, hirs) {
                debug!(
                    "found plain alternation of literals, \
                     avoiding regex engine entirely and using Aho-Corasick"
                );
                return Ok(Self::Pre(pre));
            }
            prefixes.literals().and_then(|strings| {
                debug!("creating prefilter from {} literals: {:?}", strings.len(), strings,);
                Prefilter::new(kind, strings)
            })
        } else {
            debug!("skipping literal extraction since prefilters were disabled");
            None
        };
        let mut core = Core::new(info.clone(), pre.clone(), hirs)?;
        // Now that we have our core regex engines built, there are a few
        // cases where we can do a little bit better than just a normal
        // "search forward and maybe use a prefilter when in a start state."
        // However, these cases may not always work or otherwise build on top
        // of the Core searcher. For example, the reverse anchored
        // optimization seems like it might always work, but only the DFAs
        // support reverse searching and the DFAs might give up or quit for
        // reasons. If we had, e.g., a PikeVM that supported reverse
        // searching, then we could avoid building a full Core engine for
        // this case.
        core = match ReverseAnchored::new(core) {
            Err(core) => core,
            Ok(ra) => {
                debug!("using reverse anchored strategy");
                return Ok(Self::ReverseAnchored(ra));
            }
        };
        // core = match ReverseSuffix::new(core, hirs) {
        //     Err(core) => core,
        //     Ok(rs) => {
        //         debug!("using reverse suffix strategy");
        //         return Ok(Arc::new(rs));
        //     }
        // };
        // core = match ReverseInner::new(core, hirs) {
        //     Err(core) => core,
        //     Ok(ri) => {
        //         debug!("using reverse inner strategy");
        //         return Ok(Arc::new(ri));
        //     }
        // };
        debug!("using core strategy");
        Ok(Self::Core(core))
    }

    pub(super) fn group_info(&self) -> &GroupInfo {
        match self {
            Self::Core(core) => core.group_info(),
            Self::Pre(pre) => pre.group_info(),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.group_info(),
        }
    }

    pub(super) fn create_cache(&self) -> Cache {
        match self {
            Self::Core(core) => core.create_cache(),
            Self::Pre(pre) => pre.create_cache(),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.create_cache(),
        }
    }

    pub(super) fn reset_cache(&self, cache: &mut Cache) {
        match self {
            Self::Core(core) => core.reset_cache(cache),
            Self::Pre(pre) => pre.reset_cache(cache),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.reset_cache(cache),
        }
    }

    pub(super) fn is_accelerated(&self) -> bool {
        match self {
            Self::Core(core) => core.is_accelerated(),
            Self::Pre(pre) => pre.is_accelerated(),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.is_accelerated(),
        }
    }

    pub(super) fn memory_usage(&self) -> usize {
        match self {
            Self::Core(core) => core.memory_usage(),
            Self::Pre(pre) => pre.memory_usage(),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.memory_usage(),
        }
    }

    pub(super) fn search(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
    ) -> Option<Match> {
        match self {
            Self::Core(core) => core.search(cache, input),
            Self::Pre(pre) => pre.search(cache, input),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.search(cache, input),
        }
    }

    pub(super) fn search_half(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
    ) -> Option<HalfMatch> {
        match self {
            Self::Core(core) => core.search_half(cache, input),
            Self::Pre(pre) => pre.search_half(cache, input),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.search_half(cache, input),
        }
    }

    pub(super) fn is_match(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> bool {
        match self {
            Self::Core(core) => core.is_match(cache, input),
            Self::Pre(pre) => pre.is_match(cache, input),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.is_match(cache, input),
        }
    }

    pub(super) fn search_slots(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        match self {
            Self::Core(core) => core.search_slots(cache, input, slots),
            Self::Pre(pre) => pre.search_slots(cache, input, slots),
            Self::ReverseAnchored(rev_anchored) => rev_anchored.search_slots(cache, input, slots),
        }
    }
}

#[derive(Clone, Debug)]
struct Pre {
    pre: Prefilter,
    group_info: GroupInfo,
}

impl Pre {
    fn new(pre: Prefilter) -> Self {
        // The only thing we support when we use prefilters directly as a
        // strategy is the start and end of the overall match for a single
        // pattern. In other words, exactly one implicit capturing group. Which
        // is exactly what we use here for a GroupInfo.
        let group_info = GroupInfo::new([[None::<&str>]]).unwrap();
        Pre { pre, group_info }
    }

    /// Given a sequence of prefixes, attempt to return a full `Strategy` using
    /// just the prefixes.
    ///
    /// Basically, this occurs when the prefixes given are not just prefixes,
    /// but an enumeration of the entire language matched by the regular
    /// expression.
    ///
    /// A number of other conditions need to be true too. For example, there
    /// can be only one pattern, the number of explicit capture groups is 0, no
    /// look-around assertions and so on.
    ///
    /// Note that this ignores `Config::get_auto_prefilter` because if this
    /// returns something, then it isn't a prefilter but a matcher itself.
    /// Therefore, it shouldn't suffer from the problems typical to prefilters
    /// (such as a high false positive rate).
    fn from_prefixes(info: &RegexInfo, prefixes: &literal::Seq) -> Option<Pre> {
        let kind = info.config().get_match_kind();
        // Check to see if our prefixes are exact, which means we might be
        // able to bypass the regex engine entirely and just rely on literal
        // searches.
        if !prefixes.is_exact() {
            return None;
        }
        // We also require that we have a single regex pattern. Namely,
        // we reuse the prefilter infrastructure to implement search and
        // prefilters only report spans. Prefilters don't know about pattern
        // IDs. The multi-regex case isn't a lost cause, we might still use
        // Aho-Corasick and we might still just use a regular prefilter, but
        // that's done below.
        if info.pattern_len() != 1 {
            return None;
        }
        // We can't have any capture groups either. The literal engines don't
        // know how to deal with things like '(foo)(bar)'. In that case, a
        // prefilter will just be used and then the regex engine will resolve
        // the capture groups.
        if info.props()[0].explicit_captures_len() != 0 {
            return None;
        }
        // We also require that it has zero look-around assertions. Namely,
        // literal extraction treats look-around assertions as if they match
        // *every* empty string. But of course, that isn't true. So for
        // example, 'foo\bquux' never matches anything, but 'fooquux' is
        // extracted from that as an exact literal. Such cases should just run
        // the regex engine. 'fooquux' will be used as a normal prefilter, and
        // then the regex engine will try to look for an actual match.
        if !info.props()[0].look_set().is_empty() {
            return None;
        }
        // Finally, currently, our prefilters are all oriented around
        // leftmost-first match semantics, so don't try to use them if the
        // caller asked for anything else.
        if kind != MatchKind::LeftmostFirst {
            return None;
        }
        // The above seems like a lot of requirements to meet, but it applies
        // to a lot of cases. 'foo', '[abc][123]' and 'foo|bar|quux' all meet
        // the above criteria, for example.
        //
        // Note that this is effectively a latency optimization. If we didn't
        // do this, then the extracted literals would still get bundled into
        // a prefilter, and every regex engine capable of running unanchored
        // searches supports prefilters. So this optimization merely sidesteps
        // having to run the regex engine at all to confirm the match. Thus, it
        // decreases the latency of a match.

        // OK because we know the set is exact and thus finite.
        let prefixes = prefixes.literals().unwrap();
        debug!(
            "trying to bypass regex engine by creating \
             prefilter from {} literals: {:?}",
            prefixes.len(),
            prefixes,
        );
        let choice = match prefilter::Prefilter::new(kind, prefixes) {
            Some(choice) => choice,
            None => {
                debug!("regex bypass failed because no prefilter could be built");
                return None;
            }
        };
        Some(Pre::new(choice))
    }
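
    // A hedged, standalone sketch (not part of this crate's API) of the
    // exactness property gating the bypass above, via regex-syntax's public
    // literal extractor; the patterns are illustrative only:
    //
    //     use regex_syntax::{hir::literal::Extractor, parse};
    //
    //     // 'foo|bar|quux' extracts an exact, finite literal sequence.
    //     assert!(Extractor::new().extract(&parse("foo|bar|quux").unwrap()).is_exact());
    //     // 'foo.*bar' does not, so the regex engine must confirm candidates.
    //     assert!(!Extractor::new().extract(&parse("foo.*bar").unwrap()).is_exact());
    //     // Careful: extraction treats look-arounds as matching every empty
    //     // string, so r"foo\bbar" also extracts the "exact" literal
    //     // "foobar". That is why the look_set() check above is load-bearing.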

    /// Attempts to extract an alternation of literals, and if it's deemed
    /// worth doing, returns an Aho-Corasick prefilter as a strategy.
    ///
    /// And currently, this only returns something when 'hirs.len() == 1'. This
    /// could in theory do something if there are multiple HIRs where all of
    /// them are alternation of literals, but I haven't had the time to go down
    /// that path yet.
    fn from_alternation_literals(info: &RegexInfo, hirs: &[&Hir]) -> Option<Pre> {
        let lits = crate::engines::meta::literal::alternation_literals(info, hirs)?;
        let ac = Prefilter::new(MatchKind::LeftmostFirst, &lits)?;
        // let ac = AhoCorasick::new(MatchKind::LeftmostFirst, &lits)?;
        Some(Pre::new(ac))
    }
}
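
// A hedged sketch of the machinery `from_alternation_literals` leans on:
// regex-automata's public `Prefilter::new` picks an appropriate literal
// searcher (typically Aho-Corasick once there are many literals) behind the
// same API used above. The literals and haystack here are made up:
//
//     use regex_automata::{util::prefilter::Prefilter, MatchKind, Span};
//
//     let lits: Vec<String> = (0..500).map(|i| format!("lit{i}")).collect();
//     let pre = Prefilter::new(MatchKind::LeftmostFirst, &lits).unwrap();
//     let hay = b"xxx lit42 yyy";
//     let span = Span::from(0..hay.len());
//     assert_eq!(Some(Span::from(4..9)), pre.find(hay, span));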

// This implements Strategy for anything that implements PrefilterI.
//
// Note that this must only be used for regexes of length 1. Multi-regexes
// don't work here. The prefilter interface only provides the span of a match
// and not the pattern ID. (I did consider making it more expressive, but I
// couldn't figure out how to tie everything together elegantly.) Thus, so long
// as the regex only contains one pattern, we can simply assume that a match
// corresponds to PatternID::ZERO. And indeed, that's what we do here.
//
// In practice, since this impl is used to report matches directly and thus
// completely bypasses the regex engine, we only wind up using this under the
// following restrictions:
//
// * There must be only one pattern. As explained above.
// * The literal sequence must be finite and only contain exact literals.
// * There must not be any look-around assertions. If there are, the literals
// extracted might be exact, but a match doesn't necessarily imply an overall
// match. As a trivial example, 'foo\bbar' does not match 'foobar'.
// * The pattern must not have any explicit capturing groups. If it does, the
// caller might expect them to be resolved. e.g., 'foo(bar)'.
//
// So when all of those things are true, we use a prefilter directly as a
// strategy.
//
// In the case where the number of patterns is more than 1, we don't use this
// but do use a special Aho-Corasick strategy if all of the regexes are just
// simple literals or alternations of literals. (We also use the Aho-Corasick
// strategy when len(patterns)==1 if the number of literals is large. In that
// case, literal extraction gives up and will return an infinite set.)
impl Pre {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        &self.group_info
    }

    fn create_cache(&self) -> Cache {
        Cache {
            capmatches: Captures::all(self.group_info().clone()),
            pikevm: wrappers::PikeVMCache::none(),
            // backtrack: wrappers::BoundedBacktrackerCache::none(),
            // onepass: wrappers::OnePassCache::none(),
            hybrid: wrappers::HybridCache::none(),
            // revhybrid: wrappers::ReverseHybridCache::none(),
        }
    }

    fn reset_cache(&self, _cache: &mut Cache) {}

    fn is_accelerated(&self) -> bool {
        self.pre.is_fast()
    }

    fn memory_usage(&self) -> usize {
        self.pre.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, _cache: &mut Cache, input: &mut Input<impl Cursor>) -> Option<Match> {
        if input.is_done() {
            return None;
        }
        if input.get_anchored().is_anchored() {
            return crate::literal::prefix(&self.pre, input)
                .map(|sp| Match::new(PatternID::ZERO, sp));
        }
        crate::literal::find(&self.pre, input).map(|sp| Match::new(PatternID::ZERO, sp))
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> Option<HalfMatch> {
        self.search(cache, input).map(|m| HalfMatch::new(m.pattern(), m.end()))
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> bool {
        self.search(cache, input).is_some()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        let m = self.search(cache, input)?;
        if let Some(slot) = slots.get_mut(0) {
            *slot = NonMaxUsize::new(m.start());
        }
        if let Some(slot) = slots.get_mut(1) {
            *slot = NonMaxUsize::new(m.end());
        }
        Some(m.pattern())
    }
}
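
// A hedged note on the slot layout assumed by `Pre::search_slots` above: for
// a single pattern, slots 0 and 1 hold the implicit group's start and end.
// `NonMaxUsize::new` only returns `None` for `usize::MAX`, which can never be
// a valid haystack offset here. Illustrative values only:
//
//     use regex_automata::{util::primitives::NonMaxUsize, Match};
//
//     let m = Match::must(0, 3..7);
//     let mut slots = [None, None];
//     slots[0] = NonMaxUsize::new(m.start());
//     slots[1] = NonMaxUsize::new(m.end());
//     assert_eq!(Some(3), slots[0].map(|s| s.get()));
//     assert_eq!(Some(7), slots[1].map(|s| s.get()));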

#[derive(Debug)]
struct Core {
    info: RegexInfo,
    pre: Option<Prefilter>,
    nfa: NFA,
    nfarev: Option<NFA>,
    pikevm: wrappers::PikeVM,
    // backtrack: wrappers::BoundedBacktracker,
    // onepass: wrappers::OnePass,
    hybrid: wrappers::Hybrid,
    dfa: wrappers::DFA,
}

impl Core {
    fn new(info: RegexInfo, pre: Option<Prefilter>, hirs: &[&Hir]) -> Result<Core, BuildError> {
        let mut lookm = LookMatcher::new();
        lookm.set_line_terminator(info.config().get_line_terminator());
        let thompson_config = thompson::Config::new()
            .utf8(info.config().get_utf8_empty())
            .nfa_size_limit(info.config().get_nfa_size_limit())
            .shrink(false)
            .which_captures(info.config().get_which_captures())
            .look_matcher(lookm);
        let nfa = thompson::Compiler::new()
            .configure(thompson_config.clone())
            .build_many_from_hir(hirs)
            .map_err(BuildError::nfa)?;
        // It's possible for the PikeVM or the BB to fail to build, even though
        // at this point, we already have a full NFA in hand. They can fail
        // when a Unicode word boundary is used but where Unicode word boundary
        // support is disabled at compile time, thus making it impossible to
        // match. (Construction can also fail if the NFA was compiled without
        // captures, but we always enable that above.)
        let pikevm = wrappers::PikeVM::new(&info, pre.clone(), &nfa)?;
        // let backtrack = wrappers::BoundedBacktracker::new(&info, pre.clone(), &nfa)?;
        // The onepass engine can of course fail to build, but we expect it to
        // fail in many cases because it is an optimization that doesn't apply
        // to all regexes. The 'OnePass' wrapper encapsulates this failure (and
        // logs a message if it occurs).
        // let onepass = wrappers::OnePass::new(&info, &nfa);
        // We try to encapsulate whether a particular regex engine should be
        // used within each respective wrapper, but the DFAs need a reverse NFA
        // to build itself, and we really do not want to build a reverse NFA if
        // we know we aren't going to use the lazy DFA. So we do a config check
        // up front, which is in practice the only way we won't try to use the
        // DFA.
        let (nfarev, hybrid, dfa) = if !info.config().get_hybrid() && !info.config().get_dfa() {
            (None, wrappers::Hybrid::none(), wrappers::DFA::none())
        } else {
            // FIXME: Technically, we don't quite yet KNOW that we need
            // a reverse NFA. It's possible for the DFAs below to both
            // fail to build just based on the forward NFA. In which case,
            // building the reverse NFA was totally wasted work. But...
            // fixing this requires breaking DFA construction apart into
            // two pieces: one for the forward part and another for the
            // reverse part. Quite annoying. Making it worse, when building
            // both DFAs fails, it's quite likely that the NFA is large and
            // that it will take quite some time to build the reverse NFA
            // too. So... it's really probably worth it to do this!
            let nfarev = thompson::Compiler::new()
                // Currently, reverse NFAs don't support capturing groups,
                // so we MUST disable them. But even if we didn't have to,
                // we would, because nothing in this crate does anything
                // useful with capturing groups in reverse. And of course,
                // the lazy DFA ignores capturing groups in all cases.
                .configure(
                    thompson_config.clone().which_captures(WhichCaptures::None).reverse(true),
                )
                .build_many_from_hir(hirs)
                .map_err(BuildError::nfa)?;
            let dfa = if !info.config().get_dfa() {
                wrappers::DFA::none()
            } else {
                wrappers::DFA::new(&info, pre.clone(), &nfa, &nfarev)
            };
            let hybrid = if !info.config().get_hybrid() {
                wrappers::Hybrid::none()
            } else if dfa.is_some() {
                debug!("skipping lazy DFA because we have a full DFA");
                wrappers::Hybrid::none()
            } else {
                wrappers::Hybrid::new(&info, pre.clone(), &nfa, &nfarev)
            };
            (Some(nfarev), hybrid, dfa)
        };
        Ok(Core { info, pre, nfa, nfarev, pikevm, /*backtrack, onepass,*/ hybrid, dfa })
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_mayfail(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
    ) -> Option<Result<Option<Match>, RetryFailError>> {
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for search at {:?}", input.get_span());
            Some(e.try_search(input))
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for search at {:?}", input.get_span());
            Some(e.try_search(&mut cache.hybrid, input))
        } else {
            None
        }
    }
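
    // A hedged reading of the nested return type above (the match arms are
    // illustrative, not an API): the outer `Option` says whether any DFA
    // engine was even available, while the inner `Result` says whether the
    // chosen DFA completed the search or gave up mid-stream.
    //
    //     match core.try_search_mayfail(cache, input) {
    //         None => { /* no full or lazy DFA: use a nofail path */ }
    //         Some(Err(_retry)) => { /* DFA quit: retry via a nofail path */ }
    //         Some(Ok(result)) => { /* authoritative Option<Match> */ }
    //     }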

    fn search_nofail(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> Option<Match> {
        let caps = &mut cache.capmatches;
        caps.set_pattern(None);
        // We manually inline 'try_search_slots_nofail' here because we need to
        // borrow from 'cache.capmatches' in this method, but if we do, then
        // we can't pass 'cache' wholesale to 'try_slots_no_hybrid'. It's a
        // classic example of how the borrow checker inhibits decomposition.
        // There are of course work-arounds (more types and/or interior
        // mutability), but that's more annoying than this IMO.
        let pid = /*if let Some(ref e) = self.onepass.get(input) {
            trace!("using OnePass for search at {:?}", input.get_span());
            e.search_slots(&mut cache.onepass, input, caps.slots_mut())
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!("using BoundedBacktracker for search at {:?}", input.get_span());
            e.search_slots(&mut cache.backtrack, input, caps.slots_mut())
        } else */{
            trace!("using PikeVM for search at {:?}", input.get_span());
            let e = self.pikevm.get();
            e.search_slots(&mut cache.pikevm, input, caps.slots_mut())
        };
        caps.set_pattern(pid);
        caps.get_match()
    }

    fn search_half_nofail(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
    ) -> Option<HalfMatch> {
        // Only the lazy/full DFA returns half-matches, since the DFA requires
        // a reverse scan to find the start position. These fallback regex
        // engines can find the start and end in a single pass, so we just do
        // that and throw away the start offset to conform to the API.
        let m = self.search_nofail(cache, input)?;
        Some(HalfMatch::new(m.pattern(), m.end()))
    }
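
    // A hedged illustration (plain regex-automata, no cursors, pattern made
    // up) of why the DFA engines only produce half matches: a forward DFA
    // scan alone reports just the end offset, and recovering the start
    // requires a second, reverse scan.
    //
    //     use regex_automata::{hybrid::dfa::DFA, HalfMatch, Input};
    //
    //     let dfa = DFA::new(r"[a-z]+")?;
    //     let mut cache = dfa.create_cache();
    //     let hm = dfa.try_search_fwd(&mut cache, &Input::new("123abc456"))?;
    //     assert_eq!(Some(HalfMatch::must(0, 6)), hm); // end offset only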

    fn search_slots_nofail(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        /* if let Some(ref e) = self.onepass.get(input) {
            trace!("using OnePass for capture search at {:?}", input.get_span());
            e.search_slots(&mut cache.onepass, input, slots)
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!("using BoundedBacktracker for capture search at {:?}", input.get_span());
            e.search_slots(&mut cache.backtrack, input, slots)
        } else*/
        {
            trace!("using PikeVM for capture search at {:?}", input.get_span());
            let e = self.pikevm.get();
            e.search_slots(&mut cache.pikevm, input, slots)
        }
    }

    fn is_match_nofail(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> bool {
        /*if let Some(ref e) = self.onepass.get(input) {
            trace!("using OnePass for is-match search at {:?}", input.get_span());
            e.search_slots(&mut cache.onepass, input, &mut []).is_some()
        } else if let Some(ref e) = self.backtrack.get(input) {
            trace!("using BoundedBacktracker for is-match search at {:?}", input.get_span());
            e.is_match(&mut cache.backtrack, input)
        } else*/
        {
            trace!("using PikeVM for is-match search at {:?}", input.get_span());
            let e = self.pikevm.get();
            e.is_match(&mut cache.pikevm, input)
        }
    }

    fn is_capture_search_needed(&self, slots_len: usize) -> bool {
        slots_len > self.nfa.group_info().implicit_slot_len()
    }
}
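
// A hedged worked example for `is_capture_search_needed` above, using
// regex-automata's Thompson NFA directly (the pattern is illustrative): one
// pattern always owns two implicit slots (overall match start/end), and each
// explicit group adds two more, so a caller passing only two slots can skip
// the capture-resolving engines entirely.
//
//     use regex_automata::nfa::thompson::NFA;
//
//     let nfa = NFA::new("a(b)c")?;
//     assert_eq!(2, nfa.group_info().implicit_slot_len());
//     assert_eq!(4, nfa.group_info().slot_len());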

impl Core {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.nfa.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        Cache {
            capmatches: Captures::all(self.group_info().clone()),
            pikevm: self.pikevm.create_cache(),
            // backtrack: self.backtrack.create_cache(),
            // onepass: self.onepass.create_cache(),
            hybrid: self.hybrid.create_cache(),
            // revhybrid: wrappers::ReverseHybridCache::none(),
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        cache.pikevm.reset(&self.pikevm);
        // cache.backtrack.reset(&self.backtrack);
        // cache.onepass.reset(&self.onepass);
        cache.hybrid.reset(&self.hybrid);
    }

    fn is_accelerated(&self) -> bool {
        self.pre.as_ref().map_or(false, |pre| pre.is_fast())
    }

    fn memory_usage(&self) -> usize {
        self.info.memory_usage()
            + self.pre.as_ref().map_or(0, |pre| pre.memory_usage())
            + self.nfa.memory_usage()
            + self.nfarev.as_ref().map_or(0, |nfa| nfa.memory_usage())
            // + self.onepass.memory_usage()
            + self.dfa.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> Option<Match> {
        // We manually inline try_search_mayfail here because letting the
        // compiler do it seems to produce pretty crappy codegen.
        return if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for full search at {:?}", input.get_span());
            match e.try_search(input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("full DFA search failed: {}", _err);
                    self.search_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for full search at {:?}", input.get_span());
            match e.try_search(&mut cache.hybrid, input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("lazy DFA search failed: {}", _err);
                    self.search_nofail(cache, input)
                }
            }
        } else {
            self.search_nofail(cache, input)
        };
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> Option<HalfMatch> {
        // The main difference with 'search' is that if we're using a DFA, we
        // can use a single forward scan without needing to run the reverse
        // DFA.
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for half search at {:?}", input.get_span());
            match e.try_search_half_fwd(input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("full DFA half search failed: {}", _err);
                    self.search_half_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for half search at {:?}", input.get_span());
            match e.try_search_half_fwd(&mut cache.hybrid, input) {
                Ok(x) => x,
                Err(_err) => {
                    trace!("lazy DFA half search failed: {}", _err);
                    self.search_half_nofail(cache, input)
                }
            }
        } else {
            self.search_half_nofail(cache, input)
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> bool {
        if let Some(e) = self.dfa.get(input) {
            trace!("using full DFA for is-match search at {:?}", input.get_span());
            match e.try_search_half_fwd(input) {
                Ok(x) => x.is_some(),
                Err(_err) => {
                    trace!("full DFA half search failed: {}", _err);
                    self.is_match_nofail(cache, input)
                }
            }
        } else if let Some(e) = self.hybrid.get(input) {
            trace!("using lazy DFA for is-match search at {:?}", input.get_span());
            match e.try_search_half_fwd(&mut cache.hybrid, input) {
                Ok(x) => x.is_some(),
                Err(_err) => {
                    trace!("lazy DFA half search failed: {}", _err);
                    self.is_match_nofail(cache, input)
                }
            }
        } else {
            self.is_match_nofail(cache, input)
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        // Even if the regex has explicit capture groups, if the caller didn't
        // provide any explicit slots, then it doesn't make sense to try and do
        // extra work to get offsets for those slots. Ideally the caller should
        // realize this and not call this routine in the first place, but alas,
        // we try to save the caller from themselves if they do.
        if !self.is_capture_search_needed(slots.len()) {
            trace!("asked for slots unnecessarily, trying fast path");
            let m = self.search(cache, input)?;
            copy_match_to_slots(m, slots);
            return Some(m.pattern());
        }
        // If the onepass DFA is available for this search (which only happens
        // when it's anchored), then skip running a fallible DFA. The onepass
        // DFA isn't as fast as a full or lazy DFA, but it is typically quite
        // a bit faster than the backtracker or the PikeVM. So it isn't as
        // advantageous to try and do a full/lazy DFA scan first.
        //
        // We still theorize that it's better to do a full/lazy DFA scan, even
        // when it's anchored, because it's usually much faster and permits us
        // to say "no match" much more quickly. This does hurt the case of,
        // say, parsing each line in a log file into capture groups, because
        // in that case, the line always matches. So the lazy DFA scan is
        // usually just wasted work. But, the lazy DFA is usually quite fast
        // and doesn't cost too much here.
        // if self.onepass.get(&mut input).is_some() {
        //     return self.search_slots_nofail(cache, &mut input, slots);
        // }
        let m = match self.try_search_mayfail(cache, input) {
            Some(Ok(Some(m))) => m,
            Some(Ok(None)) => return None,
            Some(Err(_err)) => {
                trace!("fast capture search failed: {}", _err);
                return self.search_slots_nofail(cache, input, slots);
            }
            None => {
                return self.search_slots_nofail(cache, input, slots);
            }
        };
        // At this point, now that we've found the bounds of the
        // match, we need to re-run something that can resolve
        // capturing groups. But we only need to run on it on the
        // match bounds and not the entire haystack.
        trace!(
            "match found at {}..{} in capture search, \
		  	 using another engine to find captures",
            m.start(),
            m.end(),
        );
        let res = input.with(|input| {
            input.span(m.start()..m.end()).anchored(Anchored::Pattern(m.pattern()));
            self.search_slots_nofail(cache, input, slots).expect("should find a match")
        });
        Some(res)
    }
}
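
// A hedged sketch of the two-phase capture search implemented by
// `Core::search_slots` above, written against plain regex-automata (no
// cursors) so the idea stands alone; the pattern and haystack are made up:
//
//     use regex_automata::{
//         hybrid::regex::Regex, nfa::thompson::pikevm::PikeVM, Anchored, Input,
//     };
//
//     let lazy = Regex::new(r"(\w+)@(\w+)")?;
//     let pike = PikeVM::new(r"(\w+)@(\w+)")?;
//     let (mut lcache, mut pcache) = (lazy.create_cache(), pike.create_cache());
//     let hay = "contact: user@host";
//
//     // Phase 1: a DFA finds the match bounds quickly but knows no groups.
//     let m = lazy.find(&mut lcache, hay).unwrap();
//     // Phase 2: re-run a slots-capable engine anchored to just those bounds.
//     let input = Input::new(hay).span(m.range()).anchored(Anchored::Yes);
//     let mut caps = pike.create_captures();
//     pike.search(&mut pcache, &input, &mut caps);
//     let g1 = caps.get_group(1).unwrap();
//     assert_eq!("user", &hay[g1.start..g1.end]);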

#[derive(Debug)]
struct ReverseAnchored {
    core: Core,
}

impl ReverseAnchored {
    fn new(core: Core) -> Result<ReverseAnchored, Core> {
        if !core.info.is_always_anchored_end() {
            debug!(
                "skipping reverse anchored optimization because \
				 the regex is not always anchored at the end"
            );
            return Err(core);
        }
        // Note that the caller can still request an anchored search even when
        // the regex isn't anchored at the start. We detect that case in the
        // search routines below and just fallback to the core engine. This
        // is fine because both searches are anchored. It's just a matter of
        // picking one. Falling back to the core engine is a little simpler,
        // since if we used the reverse anchored approach, we'd have to add an
        // extra check to ensure the match reported starts at the place where
        // the caller requested the search to start.
        if core.info.is_always_anchored_start() {
            debug!(
                "skipping reverse anchored optimization because \
				 the regex is also anchored at the start"
            );
            return Err(core);
        }
        // Only DFAs can do reverse searches (currently), so we need one of
        // them in order to do this optimization. It's possible (although
        // pretty unlikely) that we have neither and need to give up.
        if !core.hybrid.is_some() && !core.dfa.is_some() {
            debug!(
                "skipping reverse anchored optimization because \
				 we don't have a lazy DFA or a full DFA"
            );
            return Err(core);
        }
        Ok(ReverseAnchored { core })
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn try_search_half_anchored_rev(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        // We of course always want an anchored search. In theory, the
        // underlying regex engines should automatically enable anchored
        // searches since the regex is itself anchored, but this more clearly
        // expresses intent and is always correct.
        input.with(|input| {
            input.anchored(Anchored::Yes);
            if let Some(e) = self.core.dfa.get(input) {
                trace!("using full DFA for reverse anchored search at {:?}", input.get_span());
                e.try_search_half_rev(input)
            } else if let Some(e) = self.core.hybrid.get(input) {
                trace!("using lazy DFA for reverse anchored search at {:?}", input.get_span());
                e.try_search_half_rev(&mut cache.hybrid, input)
            } else {
                unreachable!("ReverseAnchored always has a DFA")
            }
        })
    }
}
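
// A hedged sketch of the "always anchored at the end" property this strategy
// keys on, checked via regex-syntax's public HIR properties (RegexInfo is
// assumed to wrap equivalent data):
//
//     use regex_syntax::{hir::Look, parse};
//
//     let ends_anchored = |pat: &str| {
//         parse(pat).unwrap().properties().look_set_suffix().contains(Look::End)
//     };
//     assert!(ends_anchored(r"[a-z]+\.log$")); // eligible: scan backward from the end
//     assert!(!ends_anchored(r"foo")); // not eligible
//     // r"^foo$" is also rejected above because it is anchored at the start
//     // too, in which case the forward core search is just as good.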

// Note that in this impl, we don't check that 'input.end() ==
// input.haystack().len()'. In particular, when that condition is false, a
// match is always impossible because we know that the regex is always anchored
// at the end (or else 'ReverseAnchored' won't be built). We don't check that
// here because the 'Regex' wrapper actually does that for us in all cases.
// Thus, in this impl, we can actually assume that the end position in 'input'
// is equivalent to the length of the haystack.
impl ReverseAnchored {
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn group_info(&self) -> &GroupInfo {
        self.core.group_info()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn create_cache(&self) -> Cache {
        self.core.create_cache()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn reset_cache(&self, cache: &mut Cache) {
        self.core.reset_cache(cache);
    }

    fn is_accelerated(&self) -> bool {
        // Since this is anchored at the end, a reverse anchored search is
        // almost certainly guaranteed to result in a much faster search than
        // a standard forward search.
        true
    }

    fn memory_usage(&self) -> usize {
        self.core.memory_usage()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> Option<Match> {
        if input.get_anchored().is_anchored() {
            return self.core.search(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm)) => Some(Match::new(hm.pattern(), hm.offset()..input.end())),
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_half(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> Option<HalfMatch> {
        if input.get_anchored().is_anchored() {
            return self.core.search_half(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_half_nofail(cache, input)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                // Careful here! 'try_search_half' is a *forward* search that
                // only cares about the *end* position of a match. But
                // 'hm.offset()' is actually the start of the match. So we
                // actually just throw that away here and, since we know we
                // have a match, return the only possible position at which a
                // match can occur: input.end().
                Some(HalfMatch::new(hm.pattern(), input.end()))
            }
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn is_match(&self, cache: &mut Cache, input: &mut Input<impl Cursor>) -> bool {
        if input.get_anchored().is_anchored() {
            return self.core.is_match(cache, input);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.is_match_nofail(cache, input)
            }
            Ok(None) => false,
            Ok(Some(_)) => true,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn search_slots(
        &self,
        cache: &mut Cache,
        input: &mut Input<impl Cursor>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        if input.get_anchored().is_anchored() {
            return self.core.search_slots(cache, input, slots);
        }
        match self.try_search_half_anchored_rev(cache, input) {
            Err(_err) => {
                trace!("fast reverse anchored search failed: {}", _err);
                self.core.search_slots_nofail(cache, input, slots)
            }
            Ok(None) => None,
            Ok(Some(hm)) => {
                if !self.core.is_capture_search_needed(slots.len()) {
                    trace!("asked for slots unnecessarily, skipping captures");
                    let m = Match::new(hm.pattern(), hm.offset()..input.end());
                    copy_match_to_slots(m, slots);
                    return Some(m.pattern());
                }
                let start = hm.offset();
                input.with(|input| {
                    input.span(start..input.end()).anchored(Anchored::Pattern(hm.pattern()));
                    self.core.search_slots_nofail(cache, input, slots)
                })
            }
        }
    }

    // #[cfg_attr(feature = "perf-inline", inline(always))]
    // fn which_overlapping_matches(
    //     &self,
    //     cache: &mut Cache,
    //     input: &mut Input<impl Cursor>,
    //     patset: &mut PatternSet,
    // ) {
    //     // It seems like this could probably benefit from a reverse anchored
    //     // optimization, perhaps by doing an overlapping reverse search (which
    //     // the DFAs do support). I haven't given it much thought though, and
    //     // I'm currently focus more on the single pattern case.
    //     self.core.which_overlapping_matches(cache, input, patset)
    // }
}

// #[derive(Debug)]
// struct ReverseSuffix {
//     core: Core,
//     pre: Prefilter,
// }

// impl ReverseSuffix {
//     fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseSuffix, Core> {
//         if !core.info.config().get_auto_prefilter() {
//             debug!(
//                 "skipping reverse suffix optimization because \
//                  automatic prefilters are disabled"
//             );
//             return Err(core);
//         }
//         // Like the reverse inner optimization, we don't do this for regexes
//         // that are always anchored. It could lead to scanning too much, but
//         // could say "no match" much more quickly than running the regex
//         // engine if the initial literal scan doesn't match. With that said,
//         // the reverse suffix optimization has lower overhead, since it only
//         // requires a reverse scan after a literal match to confirm or reject
//         // the match. (Although, in the case of confirmation, it then needs to
//         // do another forward scan to find the end position.)
//         //
//         // Note that the caller can still request an anchored search even
//         // when the regex isn't anchored. We detect that case in the search
//         // routines below and just fallback to the core engine. Currently this
//         // optimization assumes all searches are unanchored, so if we do want
//         // to enable this optimization for anchored searches, it will need a
//         // little work to support it.
//         if core.info.is_always_anchored_start() {
//             debug!(
//                 "skipping reverse suffix optimization because \
// 				 the regex is always anchored at the start",
//             );
//             return Err(core);
//         }
//         // Only DFAs can do reverse searches (currently), so we need one of
//         // them in order to do this optimization. It's possible (although
//         // pretty unlikely) that we have neither and need to give up.
//         if !core.hybrid.is_some() && !core.dfa.is_some() {
//             debug!(
//                 "skipping reverse suffix optimization because \
// 				 we don't have a lazy DFA or a full DFA"
//             );
//             return Err(core);
//         }
//         if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
//             debug!(
//                 "skipping reverse suffix optimization because \
// 				 we already have a prefilter that we think is fast"
//             );
//             return Err(core);
//         }
//         let kind = core.info.config().get_match_kind();
//         let suffixseq = crate::util::prefilter::suffixes(kind, hirs);
//         let Some(suffixes) = suffixseq.literals() else {
//             debug!(
//                 "skipping reverse suffix optimization because \
//                  the extract suffix sequence is not finite",
//             );
//             return Err(core);
//         };
//         let Some(pre) = Prefilter::new(kind, suffixes) else {
//             debug!(
//                 "skipping reverse suffix optimization because \
//                      a prefilter could not be constructed from the \
//                      longest common suffix",
//             );
//             return Err(core);
//         };
//         if !pre.is_fast() {
//             debug!(
//                 "skipping reverse suffix optimization because \
// 				 while we have a suffix prefilter, it is not \
// 				 believed to be 'fast'"
//             );
//             return Err(core);
//         }
//         Ok(ReverseSuffix { core, pre })
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn try_search_half_start(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//     ) -> Result<Option<HalfMatch>, RetryError> {
//         let mut span = input.get_span();
//         let mut min_start = 0;
//         loop {
//             let litmatch = match self.pre.find(input.haystack(), span) {
//                 None => return Ok(None),
//                 Some(span) => span,
//             };
//             trace!("reverse suffix scan found suffix match at {:?}", litmatch);
//             let revinput = input.clone().anchored(Anchored::Yes).span(input.start()..litmatch.end);
//             match self.try_search_half_rev_limited(cache, &revinput, min_start)? {
//                 None => {
//                     if span.start >= span.end {
//                         break;
//                     }
//                     span.start = litmatch.start.checked_add(1).unwrap();
//                 }
//                 Some(hm) => return Ok(Some(hm)),
//             }
//             min_start = litmatch.end;
//         }
//         Ok(None)
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn try_search_half_fwd(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//     ) -> Result<Option<HalfMatch>, RetryFailError> {
//         if let Some(e) = self.core.dfa.get(&mut input) {
//             trace!("using full DFA for forward reverse suffix search at {:?}", input.get_span());
//             e.try_search_half_fwd(&mut input)
//         } else if let Some(e) = self.core.hybrid.get(&mut input) {
//             trace!("using lazy DFA for forward reverse suffix search at {:?}", input.get_span());
//             e.try_search_half_fwd(&mut cache.hybrid, &mut input)
//         } else {
//             unreachable!("ReverseSuffix always has a DFA")
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn try_search_half_rev_limited(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//         min_start: usize,
//     ) -> Result<Option<HalfMatch>, RetryError> {
//         if let Some(e) = self.core.dfa.get(&mut input) {
//             trace!(
//                 "using full DFA for reverse suffix search at {:?}, \
//                  but will be stopped at {} to avoid quadratic behavior",
//                 input.get_span(),
//                 min_start,
//             );
//             e.try_search_half_rev_limited(&mut input, min_start)
//         } else if let Some(e) = self.core.hybrid.get(&mut input) {
//             trace!(
//                 "using lazy DFA for reverse inner search at {:?}, \
//                  but will be stopped at {} to avoid quadratic behavior",
//                 input.get_span(),
//                 min_start,
//             );
//             e.try_search_half_rev_limited(&mut cache.hybrid, &mut input, min_start)
//         } else {
//             unreachable!("ReverseSuffix always has a DFA")
//         }
//     }
// }

// impl Strategy for ReverseSuffix {
//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn group_info(&self) -> &GroupInfo {
//         self.core.group_info()
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn create_cache(&self) -> Cache {
//         self.core.create_cache()
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn reset_cache(&self, cache: &mut Cache) {
//         self.core.reset_cache(cache);
//     }

//     fn is_accelerated(&self) -> bool {
//         self.pre.is_fast()
//     }

//     fn memory_usage(&self) -> usize {
//         self.core.memory_usage() + self.pre.memory_usage()
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn search(&self, cache: &mut Cache, input: &mut Input) -> Option<Match> {
//         if input.get_anchored().is_anchored() {
//             return self.core.search(cache, input);
//         }
//         match self.try_search_half_start(cache, input) {
//             Err(RetryError::Quadratic(_err)) => {
//                 trace!("reverse suffix optimization failed: {}", _err);
//                 self.core.search(cache, input)
//             }
//             Err(RetryError::Fail(_err)) => {
//                 trace!("reverse suffix reverse fast search failed: {}", _err);
//                 self.core.search_nofail(cache, input)
//             }
//             Ok(None) => None,
//             Ok(Some(hm_start)) => {
//                 let fwdinput = input
//                     .clone()
//                     .anchored(Anchored::Pattern(hm_start.pattern()))
//                     .span(hm_start.offset()..input.end());
//                 match self.try_search_half_fwd(cache, &fwdinput) {
//                     Err(_err) => {
//                         trace!("reverse suffix forward fast search failed: {}", _err);
//                         self.core.search_nofail(cache, input)
//                     }
//                     Ok(None) => {
//                         unreachable!(
//                             "suffix match plus reverse match implies \
// 						     there must be a match",
//                         )
//                     }
//                     Ok(Some(hm_end)) => {
//                         Some(Match::new(hm_start.pattern(), hm_start.offset()..hm_end.offset()))
//                     }
//                 }
//             }
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn search_half(&self, cache: &mut Cache, input: &mut Input) -> Option<HalfMatch> {
//         if input.get_anchored().is_anchored() {
//             return self.core.search_half(cache, input);
//         }
//         match self.try_search_half_start(cache, input) {
//             Err(RetryError::Quadratic(_err)) => {
//                 trace!("reverse suffix half optimization failed: {}", _err);
//                 self.core.search_half(cache, input)
//             }
//             Err(RetryError::Fail(_err)) => {
//                 trace!("reverse suffix reverse fast half search failed: {}", _err);
//                 self.core.search_half_nofail(cache, input)
//             }
//             Ok(None) => None,
//             Ok(Some(hm_start)) => {
//                 // This is a bit subtle. It is tempting to just stop searching
//                 // at this point and return a half-match with an offset
//                 // corresponding to where the suffix was found. But the suffix
//                 // match does not necessarily correspond to the end of the
//                 // proper leftmost-first match. Consider /[a-z]+ing/ against
//                 // 'tingling'. The first suffix match is the first 'ing', and
//                 // the /[a-z]+/ matches the 't'. So if we stopped here, then
//                 // we'd report 'ting' as the match. But 'tingling' is the
//                 // correct match because of greediness.
//                 let fwdinput = input
//                     .clone()
//                     .anchored(Anchored::Pattern(hm_start.pattern()))
//                     .span(hm_start.offset()..input.end());
//                 match self.try_search_half_fwd(cache, &fwdinput) {
//                     Err(_err) => {
//                         trace!("reverse suffix forward fast search failed: {}", _err);
//                         self.core.search_half_nofail(cache, input)
//                     }
//                     Ok(None) => {
//                         unreachable!(
//                             "suffix match plus reverse match implies \
// 						     there must be a match",
//                         )
//                     }
//                     Ok(Some(hm_end)) => Some(hm_end),
//                 }
//             }
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn is_match(&self, cache: &mut Cache, input: &mut Input) -> bool {
//         if input.get_anchored().is_anchored() {
//             return self.core.is_match(cache, input);
//         }
//         match self.try_search_half_start(cache, input) {
//             Err(RetryError::Quadratic(_err)) => {
//                 trace!("reverse suffix half optimization failed: {}", _err);
//                 self.core.is_match_nofail(cache, input)
//             }
//             Err(RetryError::Fail(_err)) => {
//                 trace!("reverse suffix reverse fast half search failed: {}", _err);
//                 self.core.is_match_nofail(cache, input)
//             }
//             Ok(None) => false,
//             Ok(Some(_)) => true,
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn search_slots(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//         slots: &mut [Option<NonMaxUsize>],
//     ) -> Option<PatternID> {
//         if input.get_anchored().is_anchored() {
//             return self.core.search_slots(cache, input, slots);
//         }
//         if !self.core.is_capture_search_needed(slots.len()) {
//             trace!("asked for slots unnecessarily, trying fast path");
//             let m = self.search(cache, input)?;
//             copy_match_to_slots(m, slots);
//             return Some(m.pattern());
//         }
//         let hm_start = match self.try_search_half_start(cache, input) {
//             Err(RetryError::Quadratic(_err)) => {
//                 trace!("reverse suffix captures optimization failed: {}", _err);
//                 return self.core.search_slots(cache, input, slots);
//             }
//             Err(RetryError::Fail(_err)) => {
//                 trace!("reverse suffix reverse fast captures search failed: {}", _err);
//                 return self.core.search_slots_nofail(cache, input, slots);
//             }
//             Ok(None) => return None,
//             Ok(Some(hm_start)) => hm_start,
//         };
//         trace!(
//             "match found at {}..{} in capture search, \
// 		  	 using another engine to find captures",
//             hm_start.offset(),
//             input.end(),
//         );
//         let start = hm_start.offset();
//         let mut input =
//             input.clone().span(start..input.end()).anchored(Anchored::Pattern(hm_start.pattern()));
//         self.core.search_slots_nofail(cache, &mut input, slots)
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn which_overlapping_matches(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//         patset: &mut PatternSet,
//     ) {
//         self.core.which_overlapping_matches(cache, input, patset)
//     }
// }

// #[derive(Debug)]
// struct ReverseInner {
//     core: Core,
//     preinner: Prefilter,
//     nfarev: NFA,
//     hybrid: wrappers::ReverseHybrid,
//     dfa: wrappers::ReverseDFA,
// }

// impl ReverseInner {
//     fn new(core: Core, hirs: &[&Hir]) -> Result<ReverseInner, Core> {
//         if !core.info.config().get_auto_prefilter() {
//             debug!(
//                 "skipping reverse inner optimization because \
//                  automatic prefilters are disabled"
//             );
//             return Err(core);
//         }
//         // Currently we hard-code the assumption of leftmost-first match
//         // semantics. This isn't a huge deal because 'all' semantics tend to
//         // only be used for forward overlapping searches with multiple regexes,
//         // and this optimization only supports a single pattern at the moment.
//         if core.info.config().get_match_kind() != MatchKind::LeftmostFirst {
//             debug!(
//                 "skipping reverse inner optimization because \
// 				 match kind is {:?} but this only supports leftmost-first",
//                 core.info.config().get_match_kind(),
//             );
//             return Err(core);
//         }
//         // It's likely that a reverse inner scan has too much overhead for it
//         // to be worth it when the regex is anchored at the start. It is
//         // possible for it to be quite a bit faster if the initial literal
//         // scan fails to detect a match, in which case, we can say "no match"
//         // very quickly. But this could be undesirable, e.g., scanning too far
//         // or when the literal scan matches. If it matches, then confirming the
//         // match requires a reverse scan followed by a forward scan to confirm
//         // or reject, which is a fair bit of work.
//         //
//         // Note that the caller can still request an anchored search even
//         // when the regex isn't anchored. We detect that case in the search
//         // routines below and just fallback to the core engine. Currently this
//         // optimization assumes all searches are unanchored, so if we do want
//         // to enable this optimization for anchored searches, it will need a
//         // little work to support it.
//         if core.info.is_always_anchored_start() {
//             debug!(
//                 "skipping reverse inner optimization because \
// 				 the regex is always anchored at the start",
//             );
//             return Err(core);
//         }
//         // Only DFAs can do reverse searches (currently), so we need one of
//         // them in order to do this optimization. It's possible (although
//         // pretty unlikely) that we have neither and need to give up.
//         if !core.hybrid.is_some() && !core.dfa.is_some() {
//             debug!(
//                 "skipping reverse inner optimization because \
// 				 we don't have a lazy DFA or a full DFA"
//             );
//             return Err(core);
//         }
//         if core.pre.as_ref().map_or(false, |p| p.is_fast()) {
//             debug!(
//                 "skipping reverse inner optimization because \
// 				 we already have a prefilter that we think is fast"
//             );
//             return Err(core);
//         } else if core.pre.is_some() {
//             debug!(
//                 "core engine has a prefix prefilter, but it is \
//                  probably not fast, so continuing with attempt to \
//                  use reverse inner prefilter"
//             );
//         }
//         let (concat_prefix, preinner) = match reverse_inner::extract(hirs) {
//             Some(x) => x,
//             // N.B. the 'extract' function emits debug messages explaining
//             // why we bailed out here.
//             None => return Err(core),
//         };
//         debug!("building reverse NFA for prefix before inner literal");
//         let mut lookm = LookMatcher::new();
//         lookm.set_line_terminator(core.info.config().get_line_terminator());
//         let thompson_config = thompson::Config::new()
//             .reverse(true)
//             .utf8(core.info.config().get_utf8_empty())
//             .nfa_size_limit(core.info.config().get_nfa_size_limit())
//             .shrink(false)
//             .which_captures(WhichCaptures::None)
//             .look_matcher(lookm);
//         let result =
//             thompson::Compiler::new().configure(thompson_config).build_from_hir(&concat_prefix);
//         let nfarev = match result {
//             Ok(nfarev) => nfarev,
//             Err(_err) => {
//                 debug!(
//                     "skipping reverse inner optimization because the \
// 					 reverse NFA failed to build: {}",
//                     _err,
//                 );
//                 return Err(core);
//             }
//         };
//         debug!("building reverse DFA for prefix before inner literal");
//         let dfa = if !core.info.config().get_dfa() {
//             wrappers::ReverseDFA::none()
//         } else {
//             wrappers::ReverseDFA::new(&core.info, &nfarev)
//         };
//         let hybrid = if !core.info.config().get_hybrid() {
//             wrappers::ReverseHybrid::none()
//         } else if dfa.is_some() {
//             debug!(
//                 "skipping lazy DFA for reverse inner optimization \
// 				 because we have a full DFA"
//             );
//             wrappers::ReverseHybrid::none()
//         } else {
//             wrappers::ReverseHybrid::new(&core.info, &nfarev)
//         };
//         Ok(ReverseInner { core, preinner, nfarev, hybrid, dfa })
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn try_search_full(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//     ) -> Result<Option<Match>, RetryError> {
//         let mut span = input.get_span();
//         let mut min_match_start = 0;
//         let mut min_pre_start = 0;
//         loop {
//             let litmatch = match self.preinner.find(input.haystack(), span) {
//                 None => return Ok(None),
//                 Some(span) => span,
//             };
//             if litmatch.start < min_pre_start {
//                 trace!(
//                     "found inner prefilter match at {:?}, which starts \
// 					 before the end of the last forward scan at {}, \
// 					 quitting to avoid quadratic behavior",
//                     litmatch,
//                     min_pre_start,
//                 );
//                 return Err(RetryError::Quadratic(RetryQuadraticError::new()));
//             }
//             trace!("reverse inner scan found inner match at {:?}", litmatch);
//             let revinput =
//                 input.clone().anchored(Anchored::Yes).span(input.start()..litmatch.start);
//             // Note that in addition to the literal search above scanning past
//             // our minimum start point, this routine can also return an error
//             // as a result of detecting possible quadratic behavior if the
//             // reverse scan goes past the minimum start point. That is, the
//             // literal search might not, but the reverse regex search for the
//             // prefix might!
//             match self.try_search_half_rev_limited(cache, &revinput, min_match_start)? {
//                 None => {
//                     if span.start >= span.end {
//                         break;
//                     }
//                     span.start = litmatch.start.checked_add(1).unwrap();
//                 }
//                 Some(hm_start) => {
//                     let fwdinput = input
//                         .clone()
//                         .anchored(Anchored::Pattern(hm_start.pattern()))
//                         .span(hm_start.offset()..input.end());
//                     match self.try_search_half_fwd_stopat(cache, &fwdinput)? {
//                         Err(stopat) => {
//                             min_pre_start = stopat;
//                             span.start = litmatch.start.checked_add(1).unwrap();
//                         }
//                         Ok(hm_end) => {
//                             return Ok(Some(Match::new(
//                                 hm_start.pattern(),
//                                 hm_start.offset()..hm_end.offset(),
//                             )))
//                         }
//                     }
//                 }
//             }
//             min_match_start = litmatch.end;
//         }
//         Ok(None)
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn try_search_half_fwd_stopat(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//     ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
//         if let Some(e) = self.core.dfa.get(&mut input) {
//             trace!("using full DFA for forward reverse inner search at {:?}", input.get_span());
//             e.try_search_half_fwd_stopat(&mut input)
//         } else if let Some(e) = self.core.hybrid.get(&mut input) {
//             trace!("using lazy DFA for forward reverse inner search at {:?}", input.get_span());
//             e.try_search_half_fwd_stopat(&mut cache.hybrid, &mut input)
//         } else {
//             unreachable!("ReverseInner always has a DFA")
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn try_search_half_rev_limited(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//         min_start: usize,
//     ) -> Result<Option<HalfMatch>, RetryError> {
//         if let Some(e) = self.dfa.get(&mut input) {
//             trace!(
//                 "using full DFA for reverse inner search at {:?}, \
//                  but will be stopped at {} to avoid quadratic behavior",
//                 input.get_span(),
//                 min_start,
//             );
//             e.try_search_half_rev_limited(&mut input, min_start)
//         } else if let Some(e) = self.hybrid.get(&mut input) {
//             trace!(
//                 "using lazy DFA for reverse inner search at {:?}, \
//                  but will be stopped at {} to avoid quadratic behavior",
//                 input.get_span(),
//                 min_start,
//             );
//             e.try_search_half_rev_limited(&mut cache.revhybrid, &mut input, min_start)
//         } else {
//             unreachable!("ReverseInner always has a DFA")
//         }
//     }
// }

// impl Strategy for ReverseInner {
//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn group_info(&self) -> &GroupInfo {
//         self.core.group_info()
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn create_cache(&self) -> Cache {
//         let mut cache = self.core.create_cache();
//         cache.revhybrid = self.hybrid.create_cache();
//         cache
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn reset_cache(&self, cache: &mut Cache) {
//         self.core.reset_cache(cache);
//         cache.revhybrid.reset(&self.hybrid);
//     }

//     fn is_accelerated(&self) -> bool {
//         self.preinner.is_fast()
//     }

//     fn memory_usage(&self) -> usize {
//         self.core.memory_usage()
//             + self.preinner.memory_usage()
//             + self.nfarev.memory_usage()
//             + self.dfa.memory_usage()
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn search(&self, cache: &mut Cache, input: &mut Input) -> Option<Match> {
//         if input.get_anchored().is_anchored() {
//             return self.core.search(cache, input);
//         }
//         match self.try_search_full(cache, input) {
//             Err(RetryError::Quadratic(_err)) => {
//                 trace!("reverse inner optimization failed: {}", _err);
//                 self.core.search(cache, input)
//             }
//             Err(RetryError::Fail(_err)) => {
//                 trace!("reverse inner fast search failed: {}", _err);
//                 self.core.search_nofail(cache, input)
//             }
//             Ok(matornot) => matornot,
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn search_half(&self, cache: &mut Cache, input: &mut Input) -> Option<HalfMatch> {
//         if input.get_anchored().is_anchored() {
//             return self.core.search_half(cache, input);
//         }
//         match self.try_search_full(cache, input) {
//             Err(RetryError::Quadratic(_err)) => {
//                 trace!("reverse inner half optimization failed: {}", _err);
//                 self.core.search_half(cache, input)
//             }
//             Err(RetryError::Fail(_err)) => {
//                 trace!("reverse inner fast half search failed: {}", _err);
//                 self.core.search_half_nofail(cache, input)
//             }
//             Ok(None) => None,
//             Ok(Some(m)) => Some(HalfMatch::new(m.pattern(), m.end())),
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn is_match(&self, cache: &mut Cache, input: &mut Input) -> bool {
//         if input.get_anchored().is_anchored() {
//             return self.core.is_match(cache, input);
//         }
//         match self.try_search_full(cache, input) {
//             Err(RetryError::Quadratic(_err)) => {
//                 trace!("reverse inner half optimization failed: {}", _err);
//                 self.core.is_match_nofail(cache, input)
//             }
//             Err(RetryError::Fail(_err)) => {
//                 trace!("reverse inner fast half search failed: {}", _err);
//                 self.core.is_match_nofail(cache, input)
//             }
//             Ok(None) => false,
//             Ok(Some(_)) => true,
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn search_slots(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//         slots: &mut [Option<NonMaxUsize>],
//     ) -> Option<PatternID> {
//         if input.get_anchored().is_anchored() {
//             return self.core.search_slots(cache, input, slots);
//         }
//         if !self.core.is_capture_search_needed(slots.len()) {
//             trace!("asked for slots unnecessarily, trying fast path");
//             let m = self.search(cache, input)?;
//             copy_match_to_slots(m, slots);
//             return Some(m.pattern());
//         }
//         let m = match self.try_search_full(cache, input) {
//             Err(RetryError::Quadratic(_err)) => {
//                 trace!("reverse inner captures optimization failed: {}", _err);
//                 return self.core.search_slots(cache, input, slots);
//             }
//             Err(RetryError::Fail(_err)) => {
//                 trace!("reverse inner fast captures search failed: {}", _err);
//                 return self.core.search_slots_nofail(cache, input, slots);
//             }
//             Ok(None) => return None,
//             Ok(Some(m)) => m,
//         };
//         trace!(
//             "match found at {}..{} in capture search, \
// 		  	 using another engine to find captures",
//             m.start(),
//             m.end(),
//         );
//         let mut input = input.clone().span(m.start()..m.end()).anchored(Anchored::Pattern(m.pattern()));
//         self.core.search_slots_nofail(cache, &mut input, slots)
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     fn which_overlapping_matches(
//         &self,
//         cache: &mut Cache,
//         input: &mut Input,
//         patset: &mut PatternSet,
//     ) {
//         self.core.which_overlapping_matches(cache, input, patset)
//     }
// }

/// Copies the offsets in the given match to the corresponding positions in
/// `slots`.
///
/// In effect, this sets the slots corresponding to the implicit group for the
/// pattern in the given match. If the indices for the corresponding slots do
/// not exist, then no slots are set.
///
/// This is useful when the caller provides slots (or captures), but you use a
/// regex engine that doesn't operate on slots (like a lazy DFA). This function
/// lets you map the match you get back to the slots provided by the caller.
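///
/// As a minimal illustrative sketch (this example is hypothetical, assuming
/// a multi-pattern search with at least two patterns): the implicit group of
/// pattern `i` occupies slots `2*i` (start) and `2*i + 1` (end).
///
/// ```ignore
/// let m = Match::new(PatternID::must(1), 5..9);
/// let mut slots = vec![None; 4];
/// copy_match_to_slots(m, &mut slots);
/// // Pattern 1's implicit group lives in slots 2 and 3.
/// assert_eq!(slots[2], NonMaxUsize::new(5));
/// assert_eq!(slots[3], NonMaxUsize::new(9));
/// ```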
#[cfg_attr(feature = "perf-inline", inline(always))]
fn copy_match_to_slots(m: Match, slots: &mut [Option<NonMaxUsize>]) {
    let slot_start = m.pattern().as_usize() * 2;
    let slot_end = slot_start + 1;
    if let Some(slot) = slots.get_mut(slot_start) {
        *slot = NonMaxUsize::new(m.start());
    }
    if let Some(slot) = slots.get_mut(slot_end) {
        *slot = NonMaxUsize::new(m.end());
    }
}
regex-cursor-0.1.4/src/engines/meta/wrappers.rs000064400000000000000000000653131046102023000176410ustar  00000000000000/*!
This module contains a boat load of wrappers around each of our internal regex
engines. They encapsulate a few things:

1. The wrappers manage the conditional existence of the regex engine. Namely,
the PikeVM is the only required regex engine. The rest are optional. These
wrappers present a uniform API regardless of which engines are available. And
availability might be determined by compile time features or by dynamic
configuration via `meta::Config`. Encapsulating the conditional compilation
features is in particular a huge simplification for the higher level code that
composes these engines.
2. The wrappers manage construction of each engine, including skipping it if
the engine is unavailable or configured to not be used.
3. The wrappers manage whether an engine *can* be used for a particular
search configuration. For example, `BoundedBacktracker::get` only returns a
backtracking engine when the haystack is no bigger than the maximum supported
length. The wrappers also sometimes take a position on when an engine *ought*
to be used, but only in cases where the logic is extremely local to the engine
itself. Otherwise, things like "choose between the backtracker and the one-pass
DFA" are managed by the higher level meta strategy code.

There are also corresponding wrappers for the various `Cache` types for each
regex engine that needs them. If an engine is unavailable or not used, then a
cache for it will *not* actually be allocated.
*/
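
// As a rough sketch of the pattern described above (the calling code is
// hypothetical, for illustration only): the strategy layer asks a wrapper
// for its engine via `get`, and falls back to another engine whenever the
// wrapper is empty or unusable for the search at hand.
//
//     if let Some(e) = self.dfa.get(input) {
//         // A fully compiled DFA was built and can service this input.
//         return e.try_search(input);
//     }
//     // ... otherwise fall back to the lazy DFA, or ultimately to the
//     // PikeVM, which is always available ...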

use log::debug;
use regex_automata::nfa::thompson::NFA;
use regex_automata::util::prefilter::Prefilter;
use regex_automata::util::primitives::NonMaxUsize;
use regex_automata::{dfa, hybrid, HalfMatch, Match, MatchKind, PatternID};

use crate::cursor::Cursor;
use crate::engines::meta::error::{BuildError, RetryFailError};
use crate::engines::meta::regex::RegexInfo;
use crate::engines::pikevm;
use crate::Input;

#[derive(Debug)]
pub(crate) struct PikeVM(PikeVMEngine);

impl PikeVM {
    pub(crate) fn new(
        info: &RegexInfo,
        pre: Option<Prefilter>,
        nfa: &NFA,
    ) -> Result<PikeVM, BuildError> {
        PikeVMEngine::new(info, pre, nfa).map(PikeVM)
    }

    pub(crate) fn create_cache(&self) -> PikeVMCache {
        PikeVMCache::new(self)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn get(&self) -> &PikeVMEngine {
        &self.0
    }
}

#[derive(Debug)]
pub(crate) struct PikeVMEngine(pikevm::PikeVM);

impl PikeVMEngine {
    pub(crate) fn new(
        info: &RegexInfo,
        pre: Option<Prefilter>,
        nfa: &NFA,
    ) -> Result<PikeVMEngine, BuildError> {
        let pikevm_config =
            pikevm::Config::new().match_kind(info.config().get_match_kind()).prefilter(pre);
        let engine = pikevm::Builder::new()
            .configure(pikevm_config)
            .build_from_nfa(nfa.clone())
            .map_err(BuildError::nfa)?;
        debug!("PikeVM built");
        Ok(PikeVMEngine(engine))
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn is_match<C: Cursor>(&self, cache: &mut PikeVMCache, input: &mut Input<C>) -> bool {
        crate::engines::pikevm::is_match(&self.0, cache.0.as_mut().unwrap(), input)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn search_slots<C: Cursor>(
        &self,
        cache: &mut PikeVMCache,
        input: &mut Input<C>,
        slots: &mut [Option<NonMaxUsize>],
    ) -> Option<PatternID> {
        crate::engines::pikevm::search_slots(&self.0, cache.0.as_mut().unwrap(), input, slots)
    }

    // #[cfg_attr(feature = "perf-inline", inline(always))]
    // pub(crate) fn which_overlapping_matches(
    //     &self,
    //     cache: &mut PikeVMCache,
    //     input: &mut Input,
    //     patset: &mut PatternSet,
    // ) {
    //     self.0.which_overlapping_matches(cache.0.as_mut().unwrap(), input, patset)
    // }
}

#[derive(Clone, Debug)]
pub(crate) struct PikeVMCache(Option<pikevm::Cache>);

impl PikeVMCache {
    pub(crate) fn none() -> PikeVMCache {
        PikeVMCache(None)
    }

    pub(crate) fn new(builder: &PikeVM) -> PikeVMCache {
        PikeVMCache(Some(pikevm::Cache::new(&builder.get().0)))
    }

    pub(crate) fn reset(&mut self, builder: &PikeVM) {
        self.0.as_mut().unwrap().reset(&builder.get().0);
    }

    pub(crate) fn memory_usage(&self) -> usize {
        self.0.as_ref().map_or(0, |c| c.memory_usage())
    }
}
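
// A minimal sketch of the cache lifecycle described in the module docs (the
// calling code is hypothetical): a cache is created from its engine wrapper,
// threaded through each search, and can be reset to reuse its allocation.
//
//     let vm = PikeVM::new(&info, None, &nfa)?;
//     let mut cache = vm.create_cache();
//     // ... run searches via `vm.get()` with `&mut cache` ...
//     cache.reset(&vm); // clears the search state but keeps the allocation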

#[derive(Debug)]
pub(crate) struct Hybrid(Option<HybridEngine>);

impl Hybrid {
    pub(crate) fn none() -> Hybrid {
        Hybrid(None)
    }

    pub(crate) fn new(info: &RegexInfo, pre: Option<Prefilter>, nfa: &NFA, nfarev: &NFA) -> Hybrid {
        Hybrid(HybridEngine::new(info, pre, nfa, nfarev))
    }

    pub(crate) fn create_cache(&self) -> HybridCache {
        HybridCache::new(self)
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn get<C: Cursor>(&self, _input: &mut Input<C>) -> Option<&HybridEngine> {
        let engine = self.0.as_ref()?;
        Some(engine)
    }

    pub(crate) fn is_some(&self) -> bool {
        self.0.is_some()
    }
}

#[derive(Debug)]
pub(crate) struct HybridEngine(hybrid::regex::Regex);

impl HybridEngine {
    pub(crate) fn new(
        info: &RegexInfo,
        pre: Option<Prefilter>,
        nfa: &NFA,
        nfarev: &NFA,
    ) -> Option<HybridEngine> {
        {
            if !info.config().get_hybrid() {
                return None;
            }
            let dfa_config = hybrid::dfa::Config::new()
                .match_kind(info.config().get_match_kind())
                .prefilter(pre.clone())
                // Enabling this is necessary for ensuring we can service any
                // kind of 'Input' search without error. For the lazy DFA,
                // this is not particularly costly, since the start states are
                // generated lazily.
                .starts_for_each_pattern(true)
                .byte_classes(info.config().get_byte_classes())
                .unicode_word_boundary(true)
                .specialize_start_states(pre.is_some())
                .cache_capacity(info.config().get_hybrid_cache_capacity())
                // This makes it possible for building a lazy DFA to
                // fail even though the NFA has already been built. Namely,
                // if the cache capacity is too small to fit some minimum
                // number of states (which is small, like 4 or 5), then the
                // DFA will refuse to build.
                //
                // We shouldn't enable this to make building always work, since
                // this could cause the allocation of a cache bigger than the
                // provided capacity amount.
                //
                // This is effectively the only reason why building a lazy DFA
                // could fail. If it does, then we simply suppress the error
                // and return None.
                .skip_cache_capacity_check(false)
                // This and enabling heuristic Unicode word boundary support
                // above make it so the lazy DFA can quit at match time.
                .minimum_cache_clear_count(Some(3))
                .minimum_bytes_per_state(Some(10));
            let result = hybrid::dfa::Builder::new()
                .configure(dfa_config.clone())
                .build_from_nfa(nfa.clone());
            let fwd = match result {
                Ok(fwd) => fwd,
                Err(_err) => {
                    debug!("forward lazy DFA failed to build: {}", _err);
                    return None;
                }
            };
            let result = hybrid::dfa::Builder::new()
                .configure(
                    dfa_config
                        .clone()
                        .match_kind(MatchKind::All)
                        .prefilter(None)
                        .specialize_start_states(false),
                )
                .build_from_nfa(nfarev.clone());
            let rev = match result {
                Ok(rev) => rev,
                Err(_err) => {
                    debug!("reverse lazy DFA failed to build: {}", _err);
                    return None;
                }
            };
            let engine = hybrid::regex::Builder::new().build_from_dfas(fwd, rev);
            debug!("lazy DFA built");
            Some(HybridEngine(engine))
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn try_search<C: Cursor>(
        &self,
        cache: &mut HybridCache,
        input: &mut Input<C>,
    ) -> Result<Option<Match>, RetryFailError> {
        let cache = cache.0.as_mut().unwrap();
        crate::engines::hybrid::try_search(&self.0, cache, input).map_err(|e| e.into())
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn try_search_half_fwd<C: Cursor>(
        &self,
        cache: &mut HybridCache,
        input: &mut Input<C>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        let fwd = self.0.forward();
        let fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0;
        crate::engines::hybrid::try_search_fwd(fwd, fwdcache, input).map_err(|e| e.into())
    }

    // #[cfg_attr(feature = "perf-inline", inline(always))]
    // pub(crate) fn try_search_half_fwd_stopat(
    //     &self,
    //     cache: &mut HybridCache,
    //     input: &mut Input,
    // ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
    //     let dfa = self.0.forward();
    //     let mut cache = cache.0.as_mut().unwrap().as_parts_mut().0;
    //     crate::meta::stopat::hybrid_try_search_half_fwd(dfa, &mut cache, input)
    // }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn try_search_half_rev<C: Cursor>(
        &self,
        cache: &mut HybridCache,
        input: &mut Input<C>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        let rev = self.0.reverse();
        let revcache = cache.0.as_mut().unwrap().as_parts_mut().1;
        crate::engines::hybrid::try_search_rev(rev, revcache, input).map_err(|e| e.into())
    }

    // #[cfg_attr(feature = "perf-inline", inline(always))]
    // pub(crate) fn try_search_half_rev_limited(
    //     &self,
    //     cache: &mut HybridCache,
    //     input: &mut Input,
    //     min_start: usize,
    // ) -> Result<Option<HalfMatch>, RetryError> {
    //     let dfa = self.0.reverse();
    //     let mut cache = cache.0.as_mut().unwrap().as_parts_mut().1;
    //     crate::meta::limited::hybrid_try_search_half_rev(dfa, &mut cache, input, min_start)
    // }

    // #[inline]
    // pub(crate) fn try_which_overlapping_matches(
    //     &self,
    //     cache: &mut HybridCache,
    //     input: &mut Input,
    //     patset: &mut PatternSet,
    // ) -> Result<(), RetryFailError> {
    //         let fwd = self.0.forward();
    //         let mut fwdcache = cache.0.as_mut().unwrap().as_parts_mut().0;
    //         fwd.try_which_overlapping_matches(&mut fwdcache, input, patset).map_err(|e| e.into())
    // }
}

#[derive(Clone, Debug)]
pub(crate) struct HybridCache(Option<hybrid::regex::Cache>);

impl HybridCache {
    pub(crate) fn none() -> HybridCache {
        HybridCache(None)
    }

    pub(crate) fn new(builder: &Hybrid) -> HybridCache {
        HybridCache(builder.0.as_ref().map(|e| e.0.create_cache()))
    }

    pub(crate) fn reset(&mut self, builder: &Hybrid) {
        if let Some(ref e) = builder.0 {
            self.0.as_mut().unwrap().reset(&e.0);
        }
    }

    pub(crate) fn memory_usage(&self) -> usize {
        {
            self.0.as_ref().map_or(0, |c| c.memory_usage())
        }
    }
}

#[derive(Debug)]
pub(crate) struct DFA(Option<DFAEngine>);

impl DFA {
    pub(crate) fn none() -> DFA {
        DFA(None)
    }

    pub(crate) fn new(info: &RegexInfo, pre: Option<Prefilter>, nfa: &NFA, nfarev: &NFA) -> DFA {
        DFA(DFAEngine::new(info, pre, nfa, nfarev))
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn get<C: Cursor>(&self, _input: &mut Input<C>) -> Option<&DFAEngine> {
        let engine = self.0.as_ref()?;
        Some(engine)
    }

    pub(crate) fn is_some(&self) -> bool {
        self.0.is_some()
    }

    pub(crate) fn memory_usage(&self) -> usize {
        self.0.as_ref().map_or(0, |e| e.memory_usage())
    }
}

#[derive(Debug)]
pub(crate) struct DFAEngine(dfa::regex::Regex);

impl DFAEngine {
    pub(crate) fn new(
        info: &RegexInfo,
        pre: Option<Prefilter>,
        nfa: &NFA,
        nfarev: &NFA,
    ) -> Option<DFAEngine> {
        {
            if !info.config().get_dfa() {
                return None;
            }
            // If our NFA is anything but small, don't even bother with a DFA.
            if let Some(state_limit) = info.config().get_dfa_state_limit() {
                if nfa.states().len() > state_limit {
                    debug!(
                        "skipping full DFA because NFA has {} states, \
                         which exceeds the heuristic limit of {}",
                        nfa.states().len(),
                        state_limit,
                    );
                    return None;
                }
            }
            // We cut the size limit in four because the total heap used by
            // DFA construction is determinization aux memory and the DFA
            // itself, and those things are configured independently in the
            // lower level DFA builder API. And then split that in two because
            // of forward and reverse DFAs.
            let size_limit = info.config().get_dfa_size_limit().map(|n| n / 4);
            let dfa_config = dfa::dense::Config::new()
                .match_kind(info.config().get_match_kind())
                .prefilter(pre.clone())
                // Enabling this is necessary for ensuring we can service any
                // kind of 'Input' search without error. For the full DFA, this
                // can be quite costly. But since we have such a small bound
                // on the size of the DFA, in practice, any multi-regexes are
                // probably going to blow the limit anyway.
                .starts_for_each_pattern(true)
                .byte_classes(info.config().get_byte_classes())
                .unicode_word_boundary(true)
                .specialize_start_states(pre.is_some())
                .determinize_size_limit(size_limit)
                .dfa_size_limit(size_limit);
            let result =
                dfa::dense::Builder::new().configure(dfa_config.clone()).build_from_nfa(nfa);
            let fwd = match result {
                Ok(fwd) => fwd,
                Err(_err) => {
                    debug!("forward full DFA failed to build: {}", _err);
                    return None;
                }
            };
            let result = dfa::dense::Builder::new()
                .configure(
                    dfa_config
                        .clone()
                        // We never need unanchored reverse searches, so
                        // there's no point in building it into the DFA, which
                        // WILL take more space. (This isn't done for the lazy
                        // DFA because the DFA is, well, lazy. It doesn't pay
                        // the cost for supporting unanchored searches unless
                        // you actually do an unanchored search, which we
                        // don't.)
                        .start_kind(dfa::StartKind::Anchored)
                        .match_kind(MatchKind::All)
                        .prefilter(None)
                        .specialize_start_states(false),
                )
                .build_from_nfa(nfarev);
            let rev = match result {
                Ok(rev) => rev,
                Err(_err) => {
                    debug!("reverse full DFA failed to build: {}", _err);
                    return None;
                }
            };
            let engine = dfa::regex::Builder::new().build_from_dfas(fwd, rev);
            debug!(
                "fully compiled forward and reverse DFAs built, {} bytes",
                engine.forward().memory_usage() + engine.reverse().memory_usage(),
            );
            Some(DFAEngine(engine))
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn try_search<C: Cursor>(
        &self,
        input: &mut Input<C>,
    ) -> Result<Option<Match>, RetryFailError> {
        crate::engines::dfa::try_search(&self.0, input).map_err(|err| err.into())
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn try_search_half_fwd<C: Cursor>(
        &self,
        input: &mut Input<C>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        crate::engines::dfa::try_search_fwd(self.0.forward(), input).map_err(|e| e.into())
    }

    // #[cfg_attr(feature = "perf-inline", inline(always))]
    // pub(crate) fn try_search_half_fwd_stopat(
    //     &self,
    //     input: &mut Input,
    // ) -> Result<Result<HalfMatch, usize>, RetryFailError> {
    //         let dfa = self.0.forward();
    //         crate::meta::stopat::dfa_try_search_half_fwd(dfa, input)
    // }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn try_search_half_rev<C: Cursor>(
        &self,
        input: &mut Input<C>,
    ) -> Result<Option<HalfMatch>, RetryFailError> {
        crate::engines::dfa::try_search_rev(self.0.reverse(), input).map_err(|e| e.into())
    }

    // #[cfg_attr(feature = "perf-inline", inline(always))]
    // pub(crate) fn try_search_half_rev_limited(
    //     &self,
    //     input: &mut Input,
    //     min_start: usize,
    // ) -> Result<Option<HalfMatch>, RetryError> {
    //     let dfa = self.0.reverse();
    //     crate::meta::limited::dfa_try_search_half_rev(dfa, input, min_start)
    // }

    // #[inline]
    // pub(crate) fn try_which_overlapping_matches(
    //     &self,
    //     input: &mut Input,
    //     patset: &mut PatternSet,
    // ) -> Result<(), RetryFailError> {
    //         use crate::dfa::Automaton;
    //         self.0.forward().try_which_overlapping_matches(input, patset).map_err(|e| e.into())
    // }

    pub(crate) fn memory_usage(&self) -> usize {
        self.0.forward().memory_usage() + self.0.reverse().memory_usage()
    }
}

// #[derive(Debug)]
// pub(crate) struct ReverseHybrid(Option<ReverseHybridEngine>);

// impl ReverseHybrid {
//     pub(crate) fn none() -> ReverseHybrid {
//         ReverseHybrid(None)
//     }

//     pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseHybrid {
//         ReverseHybrid(ReverseHybridEngine::new(info, nfarev))
//     }

//     pub(crate) fn create_cache(&self) -> ReverseHybridCache {
//         ReverseHybridCache::new(self)
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     pub(crate) fn get(&self, _input: &mut Input) -> Option<&ReverseHybridEngine> {
//         let engine = self.0.as_ref()?;
//         Some(engine)
//     }
// }

// #[derive(Debug)]
// pub(crate) struct ReverseHybridEngine(hybrid::dfa::DFA);

// impl ReverseHybridEngine {
//     pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> Option<ReverseHybridEngine> {
//         if !info.config().get_hybrid() {
//             return None;
//         }
//         // Since we only use this for reverse searches, we can hard-code
//         // a number of things like match semantics, prefilters, starts
//         // for each pattern and so on.
//         let dfa_config = hybrid::dfa::Config::new()
//             .match_kind(MatchKind::All)
//             .prefilter(None)
//             .starts_for_each_pattern(false)
//             .byte_classes(info.config().get_byte_classes())
//             .unicode_word_boundary(true)
//             .specialize_start_states(false)
//             .cache_capacity(info.config().get_hybrid_cache_capacity())
//             .skip_cache_capacity_check(false)
//             .minimum_cache_clear_count(Some(3))
//             .minimum_bytes_per_state(Some(10));
//         let result =
//             hybrid::dfa::Builder::new().configure(dfa_config).build_from_nfa(nfarev.clone());
//         let rev = match result {
//             Ok(rev) => rev,
//             Err(_err) => {
//                 debug!("lazy reverse DFA failed to build: {}", _err);
//                 return None;
//             }
//         };
//         debug!("lazy reverse DFA built");
//         Some(ReverseHybridEngine(rev))
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     pub(crate) fn try_search_half_rev_limited(
//         &self,
//         cache: &mut ReverseHybridCache,
//         input: &mut Input,
//         min_start: usize,
//     ) -> Result<Option<HalfMatch>, RetryError> {
//         let dfa = &self.0;
//         let mut cache = cache.0.as_mut().unwrap();
//         crate::meta::limited::hybrid_try_search_half_rev(dfa, &mut cache, input, min_start)
//     }
// }

// #[derive(Clone, Debug)]
// pub(crate) struct ReverseHybridCache(
//     #[cfg(feature = "hybrid")] Option,
//     #[cfg(not(feature = "hybrid"))] (),
// );

// impl ReverseHybridCache {
//     pub(crate) fn none() -> ReverseHybridCache {
//         #[cfg(feature = "hybrid")]
//         {
//             ReverseHybridCache(None)
//         }
//         #[cfg(not(feature = "hybrid"))]
//         {
//             ReverseHybridCache(())
//         }
//     }

//     pub(crate) fn new(builder: &ReverseHybrid) -> ReverseHybridCache {
//         #[cfg(feature = "hybrid")]
//         {
//             ReverseHybridCache(builder.0.as_ref().map(|e| e.0.create_cache()))
//         }
//         #[cfg(not(feature = "hybrid"))]
//         {
//             ReverseHybridCache(())
//         }
//     }

//     pub(crate) fn reset(&mut self, builder: &ReverseHybrid) {
//         #[cfg(feature = "hybrid")]
//         if let Some(ref e) = builder.0 {
//             self.0.as_mut().unwrap().reset(&e.0);
//         }
//     }

//     pub(crate) fn memory_usage(&self) -> usize {
//         #[cfg(feature = "hybrid")]
//         {
//             self.0.as_ref().map_or(0, |c| c.memory_usage())
//         }
//         #[cfg(not(feature = "hybrid"))]
//         {
//             0
//         }
//     }
// }

// #[derive(Debug)]
// pub(crate) struct ReverseDFA(Option<ReverseDFAEngine>);

// impl ReverseDFA {
//     pub(crate) fn none() -> ReverseDFA {
//         ReverseDFA(None)
//     }

//     pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> ReverseDFA {
//         ReverseDFA(ReverseDFAEngine::new(info, nfarev))
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     pub(crate) fn get(&self, _input: &mut Input) -> Option<&ReverseDFAEngine> {
//         let engine = self.0.as_ref()?;
//         Some(engine)
//     }

//     pub(crate) fn is_some(&self) -> bool {
//         self.0.is_some()
//     }

//     pub(crate) fn memory_usage(&self) -> usize {
//         self.0.as_ref().map_or(0, |e| e.memory_usage())
//     }
// }

// #[derive(Debug)]
// pub(crate) struct ReverseDFAEngine(
//     #[cfg(feature = "dfa-build")] dfa::dense::DFA>,
//     #[cfg(not(feature = "dfa-build"))] (),
// );

// impl ReverseDFAEngine {
//     pub(crate) fn new(info: &RegexInfo, nfarev: &NFA) -> Option<ReverseDFAEngine> {
//         #[cfg(feature = "dfa-build")]
//         {
//             if !info.config().get_dfa() {
//                 return None;
//             }
//             // If our NFA is anything but small, don't even bother with a DFA.
//             if let Some(state_limit) = info.config().get_dfa_state_limit() {
//                 if nfarev.states().len() > state_limit {
//                     debug!(
//                         "skipping full reverse DFA because NFA has {} states, \
//                          which exceeds the heuristic limit of {}",
//                         nfarev.states().len(),
//                         state_limit,
//                     );
//                     return None;
//                 }
//             }
//             // We cut the size limit in two because the total heap used by DFA
//             // construction is determinization aux memory and the DFA itself,
//             // and those things are configured independently in the lower level
//             // DFA builder API.
//             let size_limit = info.config().get_dfa_size_limit().map(|n| n / 2);
//             // Since we only use this for reverse searches, we can hard-code
//             // a number of things like match semantics, prefilters, starts
//             // for each pattern and so on. We also disable acceleration since
//             // it's incompatible with limited searches (which is the only
//             // operation we support for this kind of engine at the moment).
//             let dfa_config = dfa::dense::Config::new()
//                 .match_kind(MatchKind::All)
//                 .prefilter(None)
//                 .accelerate(false)
//                 .start_kind(dfa::StartKind::Anchored)
//                 .starts_for_each_pattern(false)
//                 .byte_classes(info.config().get_byte_classes())
//                 .unicode_word_boundary(true)
//                 .specialize_start_states(false)
//                 .determinize_size_limit(size_limit)
//                 .dfa_size_limit(size_limit);
//             let result = dfa::dense::Builder::new().configure(dfa_config).build_from_nfa(&nfarev);
//             let rev = match result {
//                 Ok(rev) => rev,
//                 Err(_err) => {
//                     debug!("full reverse DFA failed to build: {}", _err);
//                     return None;
//                 }
//             };
//             debug!("fully compiled reverse DFA built, {} bytes", rev.memory_usage());
//             Some(ReverseDFAEngine(rev))
//         }
//         #[cfg(not(feature = "dfa-build"))]
//         {
//             None
//         }
//     }

//     #[cfg_attr(feature = "perf-inline", inline(always))]
//     pub(crate) fn try_search_half_rev_limited(
//         &self,
//         input: &mut Input,
//         min_start: usize,
//     ) -> Result<Option<HalfMatch>, RetryError> {
//         #[cfg(feature = "dfa-build")]
//         {
//             let dfa = &self.0;
//             crate::meta::limited::dfa_try_search_half_rev(dfa, input, min_start)
//         }
//         #[cfg(not(feature = "dfa-build"))]
//         {
//             // Impossible to reach because this engine is never constructed
//             // if the requisite features aren't enabled.
//             unreachable!()
//         }
//     }

//     pub(crate) fn memory_usage(&self) -> usize {
//         #[cfg(feature = "dfa-build")]
//         {
//             self.0.memory_usage()
//         }
//         #[cfg(not(feature = "dfa-build"))]
//         {
//             // Impossible to reach because this engine is never constructed
//             // if the requisite features aren't enabled.
//             unreachable!()
//         }
//     }
// }
regex-cursor-0.1.4/src/engines/pikevm/error.rs000064400000000000000000000160141046102023000174660ustar  00000000000000use regex_automata::util::primitives::{PatternID, StateID};
use regex_automata::util::{captures, look};

/// An error that can occur during the construction of a Thompson NFA.
///
/// This error does not provide many introspection capabilities. There are
/// generally only two things you can do with it:
///
/// * Obtain a human readable message via its `std::fmt::Display` impl.
/// * Access an underlying [`regex_syntax::Error`] type from its `source`
/// method via the `std::error::Error` trait. This error only occurs when using
/// convenience routines for building an NFA directly from a pattern string.
///
/// Otherwise, errors typically occur when a limit has been breached. For
/// example, if the total heap usage of the compiled NFA exceeds the limit
/// set by [`Config::nfa_size_limit`](crate::nfa::thompson::Config), then
/// building the NFA will fail.
#[derive(Clone, Debug)]
pub struct BuildError {
    kind: BuildErrorKind,
}
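
// A short sketch of the two supported uses described in the docs above. The
// calling code is hypothetical (it assumes a `Builder::build` convenience
// routine that parses a pattern string):
//
//     match crate::engines::pikevm::Builder::new().build(r"(?:a|b") {
//         Ok(_) => unreachable!("pattern has an unclosed group"),
//         Err(err) => {
//             // 1. A human readable message via `Display`.
//             eprintln!("failed to build: {}", err);
//             // 2. The underlying `regex_syntax::Error` via `Error::source`.
//             if let Some(src) = std::error::Error::source(&err) {
//                 eprintln!("caused by: {}", src);
//             }
//         }
//     }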

/// The kind of error that occurred during the construction of a Thompson NFA.
#[derive(Clone, Debug)]
enum BuildErrorKind {
    /// An error that occurred while parsing a regular expression. Note that
    /// this error may be printed over multiple lines, and is generally
    /// intended to be end user readable on its own.
    Syntax(regex_syntax::Error),
    /// An error that occurs if the capturing groups provided to an NFA builder
    /// do not satisfy the documented invariants. For example, things like
    /// too many groups, missing groups, having the first (zeroth) group be
    /// named or duplicate group names within the same pattern.
    Captures(captures::GroupInfoError),
    /// An error that occurs when an NFA contains a Unicode word boundary, but
    /// where the crate was compiled without the necessary data for dealing
    /// with Unicode word boundaries.
    Word(look::UnicodeWordBoundaryError),
    /// An error that occurs if too many patterns were given to the NFA
    /// compiler.
    TooManyPatterns {
        /// The number of patterns given, which exceeds the limit.
        given: usize,
        /// The limit on the number of patterns.
        limit: usize,
    },
    /// An error that occurs if too many states are produced while building an
    /// NFA.
    TooManyStates {
        /// The minimum number of states that are desired, which exceeds the
        /// limit.
        given: usize,
        /// The limit on the number of states.
        limit: usize,
    },
    /// An error that occurs when NFA compilation exceeds a configured heap
    /// limit.
    ExceededSizeLimit {
        /// The configured limit, in bytes.
        limit: usize,
    },
    /// An error that occurs when an invalid capture group index is added to
    /// the NFA. An "invalid" index can be one that would otherwise overflow
    /// a `usize` on the current target.
    InvalidCaptureIndex {
        /// The invalid index that was given.
        index: u32,
    },
    /// An error that occurs when one tries to build an NFA simulation (such as
    /// the PikeVM) without any capturing groups.
    MissingCaptures,
    /// An error that occurs when one tries to build a reverse NFA with
    /// captures enabled. Currently, this isn't supported, but we probably
    /// should support it at some point.
    UnsupportedCaptures,
}

impl BuildError {
    /// If this error occurred because the NFA exceeded the configured size
    /// limit before being built, then this returns the configured size limit.
    ///
    /// The limit returned is what was configured, and corresponds to the
    /// maximum amount of heap usage in bytes.
    pub fn size_limit(&self) -> Option<usize> {
        match self.kind {
            BuildErrorKind::ExceededSizeLimit { limit } => Some(limit),
            _ => None,
        }
    }

    fn kind(&self) -> &BuildErrorKind {
        &self.kind
    }

    pub(crate) fn syntax(err: regex_syntax::Error) -> BuildError {
        BuildError { kind: BuildErrorKind::Syntax(err) }
    }

    pub(crate) fn captures(err: captures::GroupInfoError) -> BuildError {
        BuildError { kind: BuildErrorKind::Captures(err) }
    }

    pub(crate) fn word(err: look::UnicodeWordBoundaryError) -> BuildError {
        BuildError { kind: BuildErrorKind::Word(err) }
    }

    pub(crate) fn too_many_patterns(given: usize) -> BuildError {
        let limit = PatternID::LIMIT;
        BuildError { kind: BuildErrorKind::TooManyPatterns { given, limit } }
    }

    pub(crate) fn too_many_states(given: usize) -> BuildError {
        let limit = StateID::LIMIT;
        BuildError { kind: BuildErrorKind::TooManyStates { given, limit } }
    }

    pub(crate) fn exceeded_size_limit(limit: usize) -> BuildError {
        BuildError { kind: BuildErrorKind::ExceededSizeLimit { limit } }
    }

    pub(crate) fn invalid_capture_index(index: u32) -> BuildError {
        BuildError { kind: BuildErrorKind::InvalidCaptureIndex { index } }
    }

    pub(crate) fn missing_captures() -> BuildError {
        BuildError { kind: BuildErrorKind::MissingCaptures }
    }

    pub(crate) fn unsupported_captures() -> BuildError {
        BuildError { kind: BuildErrorKind::UnsupportedCaptures }
    }
}

impl std::error::Error for BuildError {
    fn source(&self) -> Option<&(dyn std::error::Error + 'static)> {
        match self.kind() {
            BuildErrorKind::Syntax(ref err) => Some(err),
            BuildErrorKind::Captures(ref err) => Some(err),
            _ => None,
        }
    }
}

impl core::fmt::Display for BuildError {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self.kind() {
            BuildErrorKind::Syntax(_) => write!(f, "error parsing regex"),
            BuildErrorKind::Captures(_) => {
                write!(f, "error with capture groups")
            }
            BuildErrorKind::Word(_) => {
                write!(f, "NFA contains Unicode word boundary")
            }
            BuildErrorKind::TooManyPatterns { given, limit } => write!(
                f,
                "attempted to compile {} patterns, \
                 which exceeds the limit of {}",
                given, limit,
            ),
            BuildErrorKind::TooManyStates { given, limit } => write!(
                f,
                "attempted to compile {} NFA states, \
                 which exceeds the limit of {}",
                given, limit,
            ),
            BuildErrorKind::ExceededSizeLimit { limit } => {
                write!(f, "heap usage during NFA compilation exceeded limit of {}", limit,)
            }
            BuildErrorKind::InvalidCaptureIndex { index } => {
                write!(f, "capture group index {} is invalid (too big or discontinuous)", index,)
            }
            BuildErrorKind::MissingCaptures => write!(
                f,
                "operation requires the NFA to have capturing groups, \
                 but the NFA given contains none",
            ),
            BuildErrorKind::UnsupportedCaptures => write!(
                f,
                "currently captures must be disabled when compiling \
                 a reverse NFA",
            ),
        }
    }
}
regex-cursor-0.1.4/src/engines/pikevm/tests.rs
use std::ops::RangeBounds;

use proptest::{prop_assert_eq, proptest};
use regex_automata::nfa::thompson::pikevm::PikeVM;
use regex_automata::nfa::thompson::Config;
use regex_automata::util::syntax::Config as SyntaxConfig;

use crate::engines::pikevm::find_iter;
use crate::input::Input;
use crate::test_rope::SingleByteChunks;

use super::Cache;

fn test(needle: &str, haystack: &[u8]) {
    test_with_bounds(needle, haystack, ..)
}

fn test_with_bounds(needle: &str, haystack: &[u8], bounds: impl RangeBounds<usize> + Clone) {
    for utf8 in [true, false] {
        println!("start");
        let regex = PikeVM::builder()
            .syntax(SyntaxConfig::new().utf8(utf8))
            .thompson(Config::new().utf8(utf8))
            .build(needle)
            .unwrap();
        let mut cache1 = regex.create_cache();
        let mut cache2 = Cache::new(&regex);
        let input = regex_automata::Input::new(haystack).range(bounds.clone());
        let iter1: Vec<_> = regex.find_iter(&mut cache1, input).collect();
        let input = Input::new(SingleByteChunks::new(haystack)).range(bounds.clone());
        let iter2: Vec<_> = find_iter(&regex, &mut cache2, input).collect();
        assert_eq!(iter1, iter2);
    }
}

#[test]
fn smoke_test() {
    let text = std::fs::read_to_string("test_cases/syntax.rs").unwrap();
    let regex =
        PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build("vec").unwrap();
    let mut cache = Cache::new(&regex);
    let rope = ropey::Rope::from_str(&text);
    let matches: Vec<_> = find_iter(&regex, &mut cache, Input::new(&rope))
        .map(|range| rope.byte_slice(range.range()))
        .collect();
    println!("found {matches:#?} in syntax.rs");
    assert_eq!(matches.len(), 68);
}

#[test]
fn any() {
    test(".", b" ");
}

#[test]
fn look_around() {
    test(r"(?m)(?:^|a)+", b"a\naaa\n");
    test_with_bounds(r"\b{end}", "𝛃".as_bytes(), 2..3);
    let haystack: String =
        (0..5 * 4096).map(|i| format!("foöbar  foÖ{0}bar foö{0}bar", " ".repeat(i % 31))).collect();
    let needle = r"\bfoö\b[ ]*\bbar\b";
    test(needle, haystack.as_bytes())
}

#[test]
fn maybe_empty() {
    test(r"x*", b"x");
    test(r"\bx*\b", b"x");
}

proptest! {
  #[test]
  fn matches(haystack: String, needle: String) {
    let Ok(regex) = PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build(&needle) else {
        return Ok(())
    };
    let mut cache1 = regex.create_cache();
    let mut cache2 = Cache::new(&regex);
    let iter1: Vec<_> = regex.find_iter(&mut cache1, &haystack).collect();
    let iter2: Vec<_> = find_iter(&regex, &mut cache2, Input::new(SingleByteChunks::new(haystack.as_bytes()))).collect();
    prop_assert_eq!(iter1, iter2);
  }
  #[test]
  fn matches_word(haystack: String, needle in r"\\b\PC+\\b") {
    let Ok(regex) = PikeVM::builder().syntax(SyntaxConfig::new().case_insensitive(true)).build(&needle) else {
        return Ok(())
    };
    let mut cache1 = regex.create_cache();
    let mut cache2 = Cache::new(&regex);
    let iter1: Vec<_> = regex.find_iter(&mut cache1, &haystack).collect();
    let iter2: Vec<_> = find_iter(&regex, &mut cache2, Input::new(SingleByteChunks::new(haystack.as_bytes()))).collect();
    prop_assert_eq!(iter1, iter2);
  }
}
regex-cursor-0.1.4/src/engines/pikevm.rs
#[cfg(feature = "internal-instrument-pikevm")]
use core::cell::RefCell;

pub use regex_automata::nfa::thompson::pikevm::{Builder, Config, PikeVM};
use regex_automata::nfa::thompson::State;
use regex_automata::util::captures::Captures;
use regex_automata::util::primitives::{NonMaxUsize, SmallIndex, StateID};
use regex_automata::{Anchored, HalfMatch, Match, MatchKind, PatternID};

use crate::cursor::Cursor;
use crate::util::sparse_set::SparseSet;
use crate::util::{empty, iter};
use crate::{literal, Input};

#[cfg(test)]
mod tests;

/// Returns an iterator over all non-overlapping leftmost matches in the
/// given bytes. If no match exists, then the iterator yields no elements.
///
/// # Example
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
///
/// let re = PikeVM::new("foo[0-9]+")?;
/// let mut cache = re.create_cache();
///
/// let text = "foo1 foo12 foo123";
/// let matches: Vec<Match> = re.find_iter(&mut cache, text).collect();
/// assert_eq!(matches, vec![
///     Match::must(0, 0..4),
///     Match::must(0, 5..10),
///     Match::must(0, 11..17),
/// ]);
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
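///
/// # Example: searching a rope (sketch)
///
/// A minimal sketch of driving this routine over a discontiguous haystack,
/// assuming the `ropey` feature is enabled; it mirrors this crate's tests
/// rather than being a verified doc test:
///
/// ```ignore
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
/// use regex_cursor::engines::pikevm::{find_iter, Cache};
/// use regex_cursor::Input;
///
/// let re = PikeVM::new("foo[0-9]+")?;
/// let mut cache = Cache::new(&re);
/// let rope = ropey::Rope::from_str("foo1 foo12 foo123");
/// let matches: Vec<_> = find_iter(&re, &mut cache, Input::new(&rope)).collect();
/// assert_eq!(3, matches.len());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```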
#[inline]
pub fn find_iter<'r, 'c, C: Cursor>(
    vm: &'r PikeVM,
    cache: &'c mut Cache,
    input: Input<C>,
) -> FindMatches<'r, 'c, C> {
    let caps = Captures::matches(vm.get_nfa().group_info().clone());
    let it = iter::Searcher::new(input);
    FindMatches { re: vm, cache, caps, it }
}

/// Executes a leftmost forward search and writes the spans of capturing
/// groups that participated in a match into the provided [`Captures`]
/// value. If no match was found, then [`Captures::is_match`] is guaranteed
/// to return `false`.
///
/// This is like [`PikeVM::captures`], but it accepts a concrete `&Input`
/// instead of an `Into<Input>`.
///
/// # Example: specific pattern search
///
/// This example shows how to build a multi-PikeVM that permits searching
/// for specific patterns.
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     Anchored, Match, PatternID, Input,
/// };
///
/// let re = PikeVM::new_many(&["[a-z0-9]{6}", "[a-z][a-z0-9]{5}"])?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let haystack = "foo123";
///
/// // Since we are using the default leftmost-first match and both
/// // patterns match at the same starting position, only the first pattern
/// // will be returned in this case when doing a search for any of the
/// // patterns.
/// let expected = Some(Match::must(0, 0..6));
/// re.search(&mut cache, &Input::new(haystack), &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// // But if we want to check whether some other pattern matches, then we
/// // can provide its pattern ID.
/// let expected = Some(Match::must(1, 0..6));
/// let input = Input::new(haystack)
///     .anchored(Anchored::Pattern(PatternID::must(1)));
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: specifying the bounds of a search
///
/// This example shows how providing the bounds of a search can produce
/// different results than simply sub-slicing the haystack.
///
/// ```
/// # if cfg!(miri) { return Ok(()); } // miri takes too long
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
///
/// let re = PikeVM::new(r"\b[0-9]{3}\b")?;
/// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
/// let haystack = "foo123bar";
///
/// // Since we sub-slice the haystack, the search doesn't know about
/// // the larger context and assumes that `123` is surrounded by word
/// // boundaries. And of course, the match position is reported relative
/// // to the sub-slice as well, which means we get `0..3` instead of
/// // `3..6`.
/// let expected = Some(Match::must(0, 0..3));
/// re.search(&mut cache, &Input::new(&haystack[3..6]), &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// // But if we provide the bounds of the search within the context of the
/// // entire haystack, then the search can take the surrounding context
/// // into account. (And if we did find a match, it would be reported
/// // as a valid offset into `haystack` instead of its sub-slice.)
/// let expected = None;
/// let input = Input::new(haystack).range(3..6);
/// re.search(&mut cache, &input, &mut caps);
/// assert_eq!(expected, caps.get_match());
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
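///
/// # Example: crate usage (sketch)
///
/// A minimal sketch of calling this crate's free function with a rope-backed
/// input, assuming the `ropey` feature is enabled; not a verified doc test:
///
/// ```ignore
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
/// use regex_cursor::engines::pikevm::{search, Cache};
/// use regex_cursor::Input;
///
/// let vm = PikeVM::new("foo[0-9]+")?;
/// let mut cache = Cache::new(&vm);
/// let mut caps = vm.create_captures();
/// let rope = ropey::Rope::from_str("foo123");
/// search(&vm, &mut cache, &mut Input::new(&rope), &mut caps);
/// assert!(caps.is_match());
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```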
#[inline]
pub fn search<C: Cursor>(
    vm: &PikeVM,
    cache: &mut Cache,
    input: &mut Input<C>,
    caps: &mut Captures,
) {
    caps.set_pattern(None);
    let pid = search_slots(vm, cache, input, caps.slots_mut());
    caps.set_pattern(pid);
}

/// Returns true if and only if this `PikeVM` matches the given haystack.
///
/// This routine may short circuit if it knows that scanning future
/// input will never lead to a different result. In particular, if the
/// underlying NFA enters a match state, then this routine will return
/// `true` immediately without inspecting any future input. (Consider how
/// this might make a difference given the regex `a+` on the haystack
/// `aaaaaaaaaaaaaaa`. This routine can stop after it sees the first `a`,
/// but routines like `find` need to continue searching because `+` is
/// greedy by default.)
///
/// # Example
///
/// This shows basic usage:
///
/// ```
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
///
/// let re = PikeVM::new("foo[0-9]+bar")?;
/// let mut cache = re.create_cache();
///
/// assert!(re.is_match(&mut cache, "foo12345bar"));
/// assert!(!re.is_match(&mut cache, "foobar"));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// # Example: consistency with search APIs
///
/// `is_match` is guaranteed to return `true` whenever `find` returns a
/// match. This includes searches that are executed entirely within a
/// codepoint:
///
/// ```
/// use regex_automata::{nfa::thompson::pikevm::PikeVM, Input};
///
/// let re = PikeVM::new("a*")?;
/// let mut cache = re.create_cache();
///
/// assert!(!re.is_match(&mut cache, Input::new("☃").span(1..2)));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
///
/// Notice that when UTF-8 mode is disabled, then the above reports a
/// match because the restriction against zero-width matches that split a
/// codepoint has been lifted:
///
/// ```
/// use regex_automata::{nfa::thompson::{pikevm::PikeVM, NFA}, Input};
///
/// let re = PikeVM::builder()
///     .thompson(NFA::config().utf8(false))
///     .build("a*")?;
/// let mut cache = re.create_cache();
///
/// assert!(re.is_match(&mut cache, Input::new("☃").span(1..2)));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
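///
/// # Example: discontiguous haystack (sketch)
///
/// A minimal sketch of the same check over a rope, assuming the `ropey`
/// feature is enabled; not a verified doc test:
///
/// ```ignore
/// use regex_automata::nfa::thompson::pikevm::PikeVM;
/// use regex_cursor::engines::pikevm::{is_match, Cache};
/// use regex_cursor::Input;
///
/// let vm = PikeVM::new("foo[0-9]+bar")?;
/// let mut cache = Cache::new(&vm);
/// let rope = ropey::Rope::from_str("foo12345bar");
/// assert!(is_match(&vm, &mut cache, &mut Input::new(&rope)));
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```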
#[inline]
pub fn is_match<C: Cursor>(vm: &PikeVM, cache: &mut Cache, input: &mut Input<C>) -> bool {
    input.with(|input| {
        let input = input.earliest(true);
        search_slots(vm, cache, input, &mut []).is_some()
    })
}
/// A simple macro for conditionally executing instrumentation logic when
/// the 'trace' log level is enabled. This is a compile-time no-op when the
/// 'internal-instrument-pikevm' feature isn't enabled. The intent here is that
/// this makes it easier to avoid doing extra work when instrumentation isn't
/// enabled.
///
/// This macro accepts a closure of type `|&mut Counters|`. The closure can
/// then increment counters (or whatever) in accordance with what one wants
/// to track.
macro_rules! instrument {
    ($fun:expr) => {
        #[cfg(feature = "internal-instrument-pikevm")]
        {
            let fun: &mut dyn FnMut(&mut Counters) = &mut $fun;
            COUNTERS.with(|c: &RefCell<Counters>| fun(&mut *c.borrow_mut()));
        }
    };
}

#[cfg(feature = "internal-instrument-pikevm")]
std::thread_local! {
    /// Effectively global state used to keep track of instrumentation
    /// counters. The "proper" way to do this is to thread it through the
    /// PikeVM, but it makes the code quite icky. Since this is just a
    /// debugging feature, we're content to relegate it to thread local
    /// state. When instrumentation is enabled, the counters are reset at the
    /// beginning of every search and printed (with the 'trace' log level) at
    /// the end of every search.
    static COUNTERS: RefCell<Counters> = RefCell::new(Counters::empty());
}

pub fn search_slots<C: Cursor>(
    vm: &PikeVM,
    cache: &mut Cache,
    input: &mut Input<C>,
    slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
    let utf8empty = vm.get_nfa().has_empty() && vm.get_nfa().is_utf8();
    if !utf8empty {
        let hm = search_slots_imp(vm, cache, input, slots)?;
        return Some(hm.pattern());
    }
    // There is an unfortunate special case where if the regex can
    // match the empty string and UTF-8 mode is enabled, the search
    // implementation requires that the slots have at least as much space
    // to report the bounds of any match. This is so zero-width matches
    // that split a codepoint can be filtered out.
    //
    // Note that if utf8empty is true, we specialize the case for when
    // the number of patterns is 1. In that case, we can just use a stack
    // allocation. Otherwise we resort to a heap allocation, which we
    // convince ourselves we're fine with due to the pathological nature of
    // this case.
    let min = vm.get_nfa().group_info().implicit_slot_len();
    if slots.len() >= min {
        let hm = search_slots_imp(vm, cache, input, slots)?;
        return Some(hm.pattern());
    }
    if vm.get_nfa().pattern_len() == 1 {
        let mut enough = [None, None];
        let got = search_slots_imp(vm, cache, input, &mut enough);
        // This is OK because we know `enough` is strictly bigger than
        // `slots`, otherwise this special case isn't reached.
        slots.copy_from_slice(&enough[..slots.len()]);
        return got.map(|hm| hm.pattern());
    }
    let mut enough = vec![None; min];
    let got = search_slots_imp(vm, cache, input, &mut enough);
    // This is OK because we know `enough` is strictly bigger than `slots`,
    // otherwise this special case isn't reached.
    slots.copy_from_slice(&enough[..slots.len()]);
    got.map(|hm| hm.pattern())
}

/// This is the actual implementation of `search_slots` that
/// doesn't account for the special case when 1) the NFA has UTF-8 mode
/// enabled, 2) the NFA can match the empty string and 3) the caller has
/// provided an insufficient number of slots to record match offsets.
#[inline(never)]
fn search_slots_imp<C: Cursor>(
    vm: &PikeVM,
    cache: &mut Cache,
    input: &mut Input<C>,
    slots: &mut [Option<NonMaxUsize>],
) -> Option<HalfMatch> {
    let utf8empty = vm.get_nfa().has_empty() && vm.get_nfa().is_utf8();
    let hm = match search_imp(vm, cache, input, slots) {
        None => return None,
        Some(hm) if !utf8empty => return Some(hm),
        Some(hm) => hm,
    };
    empty::skip_splits_fwd(input, hm, hm.offset(), |input| {
        Ok(search_imp(vm, cache, input, slots).map(|hm| (hm, hm.offset())))
    })
    // OK because the PikeVM never errors.
    .unwrap()
}

/// Return the starting configuration of a PikeVM search.
///
/// The "start config" is basically whether the search should be anchored
/// or not and the NFA state ID at which to begin the search. The state ID
/// returned always corresponds to an anchored starting state even when the
/// search is unanchored. This is because the PikeVM search loop deals with
/// unanchored searches with an explicit epsilon closure out of the start
/// state.
///
/// This routine accounts for both the caller's `Input` configuration
/// and the pattern itself. For example, even if the caller asks for an
/// unanchored search, if the pattern itself is anchored, then this will
/// always return 'true' because implementing an unanchored search in that
/// case would be incorrect.
///
/// Similarly, if the caller requests an anchored search for a particular
/// pattern, then the starting state ID returned will reflect that.
///
/// If a pattern ID is given in the input configuration that is not in
/// this regex, then `None` is returned.
fn start_config<C: Cursor>(vm: &PikeVM, input: &Input<C>) -> Option<(bool, StateID)> {
    match input.get_anchored() {
        // Only way we're unanchored is if both the caller asked for an
        // unanchored search *and* the pattern is itself not anchored.
        Anchored::No => {
            Some((vm.get_nfa().is_always_start_anchored(), vm.get_nfa().start_anchored()))
        }
        Anchored::Yes => Some((true, vm.get_nfa().start_anchored())),
        Anchored::Pattern(pid) => Some((true, vm.get_nfa().start_pattern(pid)?)),
    }
}
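
// Illustration: `Anchored::Pattern(pid)` yields `Some((true, ..))` only when
// `pid` names a pattern of this NFA; otherwise `start_pattern` returns `None`
// above, and the caller treats that as "no match possible".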

fn search_imp<C: Cursor>(
    vm: &PikeVM,
    cache: &mut Cache,
    input: &mut Input<C>,
    slots: &mut [Option<NonMaxUsize>],
) -> Option<HalfMatch> {
    cache.setup_search(slots.len());
    if input.is_done() {
        return None;
    }
    instrument!(|c| c.reset(vm.get_nfa()));

    // Whether we want to visit all match states instead of emulating the
    // 'leftmost' semantics of typical backtracking regex engines.
    let allmatches = vm.get_config().get_match_kind() == MatchKind::All;
    let (anchored, start_id) = match start_config(vm, input) {
        None => return None,
        Some(config) => config,
    };

    let pre = if anchored { None } else { vm.get_config().get_prefilter() };
    let Cache { ref mut stack, ref mut curr, ref mut next } = *cache;
    let mut hm = None;
    // Yes, our search doesn't end at input.end(), but includes it. This
    // is necessary because matches are delayed by one byte, just like
    // how the DFA engines work. The delay is used to handle look-behind
    // assertions. In the case of the PikeVM, the delay is implemented
    // by not considering a match to exist until it is visited in
    // 'steps'. Technically, we know a match exists in the previous
    // iteration via 'epsilon_closure'. (It's the same thing in NFA-to-DFA
    // determinization. We don't mark a DFA state as a match state if it
    // contains an NFA match state, but rather, whether the DFA state was
    // generated by a transition from a DFA state that contains an NFA
    // match state.)
    input.move_to(input.start());
    input.clear_look_behind();
    input.ensure_look_behind();
    while input.at() <= input.end() {
        // If we have no states left to visit, then there are some cases
        // where we know we can quit early or even skip ahead.
        if curr.set.is_empty() {
            // We have a match and we haven't been instructed to continue
            // on even after finding a match, so we can quit.
            if hm.is_some() && !allmatches {
                break;
            }
            // If we're running an anchored search and we've advanced
            // beyond the start position with no other states to try, then
            // we will never observe a match and thus can stop.
            if anchored && input.at() > input.start() {
                break;
            }
            // If there no states left to explore at this position and we
            // know we can't terminate early, then we are effectively at
            // the starting state of the NFA. If we fell through here,
            // we'd end up adding our '(?s-u:.)*?' prefix and it would be
            // the only thing in 'curr'. So we might as well just skip
            // ahead until we find something that we know might advance us
            // forward.
            if let Some(pre) = pre {
                let chunk_offset = input.chunk_offset();
                match literal::find(pre, input) {
                    None => break,
                    Some(ref span) => {
                        input.move_to(span.start);
                        if chunk_offset != input.chunk_offset() {
                            input.clear_look_behind();
                            input.ensure_look_behind();
                        }
                    }
                }
            }
        }
        // Instead of using the NFA's unanchored start state, we actually
        // always use its anchored starting state. As a result, when doing
        // an unanchored search, we need to simulate our own '(?s-u:.)*?'
        // prefix, to permit a match to appear anywhere.
        //
        // Now, we don't *have* to do things this way. We could use the
        // NFA's unanchored starting state and do one 'epsilon_closure'
        // call from that starting state before the main loop here. And
        // that is just as correct. However, it turns out to be slower
        // than our approach here because it slightly increases the cost
        // of processing each byte by requiring us to visit more NFA
        // states to deal with the additional NFA states in the unanchored
        // prefix. By simulating it explicitly here, we lower those costs
        // substantially. The cost is itself small, but it adds up for
        // large haystacks.
        //
        // In order to simulate the '(?s-u:.)*?' prefix---which is not
        // greedy---we are careful not to perform an epsilon closure on
        // the start state if we already have a match. Namely, if we
        // did otherwise, we would never reach a terminating condition
        // because there would always be additional states to process.
        // In effect, the exclusion of running 'epsilon_closure' when
        // we have a match corresponds to the "dead" states we have in
        // our DFA regex engines. Namely, in a DFA, match states merely
        // instruct the search execution to record the current offset as
        // the most recently seen match. It is the dead state that actually
        // indicates when to stop the search (other than EOF or quit
        // states).
        //
        // However, when 'allmatches' is true, the caller has asked us to
        // leave in every possible match state. This tends not to make a
        // whole lot of sense in unanchored searches, because it means the
        // search really cannot terminate until EOF. And often, in that
        // case, you wind up skipping over a bunch of matches and are left
        // with the "last" match. Arguably, it just doesn't make a lot of
        // sense to run a 'leftmost' search (which is what this routine is)
        // with 'allmatches' set to true. But the DFAs support it and this
        // matches their behavior. (Generally, 'allmatches' is useful for
        // overlapping searches or leftmost anchored searches to find the
        // longest possible match by ignoring match priority.)
        //
        // Additionally, when we're running an anchored search, this
        // epsilon closure should only be computed at the beginning of the
        // search. If we re-computed it at every position, we would be
        // simulating an unanchored search when we were tasked to perform
        // an anchored search.
        if (hm.is_none() || allmatches) && (!anchored || input.at() == input.start()) {
            // Since we are adding to the 'curr' active states and since
            // this is for the start ID, we use a slots slice that is
            // guaranteed to have the right length but where every element
            // is absent. This is exactly what we want, because this
            // epsilon closure is responsible for simulating an unanchored
            // '(?s:.)*?' prefix. It is specifically outside of any
            // capturing groups, and thus, using slots that are always
            // absent is correct.
            //
            // Note though that we can't just use '&mut []' here, since
            // this epsilon closure may traverse through 'Captures' epsilon
            // transitions, and thus must be able to write offsets to the
            // slots given which are later copied to slot values in 'curr'.
            let slots = next.slot_table.all_absent();
            epsilon_closure(vm, stack, slots, curr, input, start_id);
        }
        input.chunk_pos += 1;
        if input.chunk_pos() >= input.chunk().len() {
            input.advance_with_look_behind();
        }
        if let Some(pid) = nexts(vm, stack, curr, next, input, slots) {
            hm = Some(HalfMatch::new(pid, input.at() - 1));
        }
        // Unless the caller asked us to return early, we need to mush on
        // to see if we can extend our match. (But note that 'nexts' will
        // quit right after seeing a match when match_kind==LeftmostFirst,
        // as is consistent with leftmost-first match priority.)
        if input.get_earliest() && hm.is_some() {
            break;
        }
        core::mem::swap(curr, next);
        next.set.clear();
    }
    instrument!(|c| c.eprint(vm.get_nfa()));
    hm
}

/// Process the active states in 'curr' to find the states (written to
/// 'next') we should process for the next byte in the haystack.
///
/// 'stack' is used to perform a depth first traversal of the NFA when
/// computing an epsilon closure.
///
/// When a match is found, the slots for that match state (in 'curr') are
/// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
/// stops (unless the PikeVM was configured with MatchKind::All semantics).
#[cfg_attr(feature = "perf-inline", inline(always))]
fn nexts<C: Cursor>(
    vm: &PikeVM,
    stack: &mut Vec<FollowEpsilon>,
    curr: &mut ActiveStates,
    next_: &mut ActiveStates,
    input: &mut Input<C>,
    slots: &mut [Option<NonMaxUsize>],
) -> Option<PatternID> {
    instrument!(|c| c.record_state_set(&curr.set));
    let mut pid = None;
    let ActiveStates { ref set, ref mut slot_table } = *curr;
    for sid in set.iter() {
        pid = match next(vm, stack, slot_table, next_, input, sid) {
            None => continue,
            Some(pid) => Some(pid),
        };
        slots.copy_from_slice(slot_table.for_state(sid));
        if vm.get_config().get_match_kind() != MatchKind::All {
            break;
        }
    }
    pid
}

/// Starting from 'sid', if the position 'at' in the 'input' haystack has a
/// transition defined out of 'sid', then add the state transitioned to and
/// its epsilon closure to the 'next' set of states to explore.
///
/// 'stack' is used by the epsilon closure computation to perform a depth
/// first traversal of the NFA.
///
/// 'curr_slot_table' should be the table of slots for the current set of
/// states being explored. If there is a transition out of 'sid', then
/// sid's row in the slot table is used to perform the epsilon closure.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn next<C: Cursor>(
    vm: &PikeVM,
    stack: &mut Vec<FollowEpsilon>,
    curr_slot_table: &mut SlotTable,
    next: &mut ActiveStates,
    input: &mut Input<C>,
    sid: StateID,
) -> Option<PatternID> {
    instrument!(|c| c.record_step(sid));
    let state = vm.get_nfa().state(sid);
    match *state {
        State::Fail
        | State::Look { .. }
        | State::Union { .. }
        | State::BinaryUnion { .. }
        | State::Capture { .. } => None,
        State::ByteRange { ref trans } => {
            let (chunk, pos) = input.look_around();
            if trans.matches(chunk, pos - 1) {
                let slots = curr_slot_table.for_state(sid);
                epsilon_closure(vm, stack, slots, next, input, trans.next);
            }
            None
        }
        State::Sparse(ref sparse) => {
            let (chunk, pos) = input.look_around();
            if let Some(next_sid) = sparse.matches(chunk, pos - 1) {
                let slots = curr_slot_table.for_state(sid);
                epsilon_closure(vm, stack, slots, next, input, next_sid);
            }
            None
        }
        State::Dense(ref dense) => {
            let (chunk, pos) = input.look_around();
            if let Some(next_sid) = dense.matches(chunk, pos - 1) {
                let slots = curr_slot_table.for_state(sid);
                epsilon_closure(vm, stack, slots, next, input, next_sid);
            }
            None
        }
        State::Match { pattern_id } => Some(pattern_id),
    }
}

/// Compute the epsilon closure of 'sid', writing the closure into 'next'
/// while copying slot values from 'curr_slots' into corresponding states
/// in 'next'. 'curr_slots' should be the slot values corresponding to
/// 'sid'.
///
/// The given 'stack' is used to perform a depth first traversal of the
/// NFA by recursively following all epsilon transitions out of 'sid'.
/// Conditional epsilon transitions are followed if and only if they are
/// satisfied for the position 'at' in the 'input' haystack.
///
/// While this routine may write to 'curr_slots', once it returns, any
/// writes are undone and the original values (even if absent) are
/// restored.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn epsilon_closure<C: Cursor>(
    vm: &PikeVM,
    stack: &mut Vec<FollowEpsilon>,
    curr_slots: &mut [Option<NonMaxUsize>],
    next: &mut ActiveStates,
    input: &mut Input<C>,
    sid: StateID,
) {
    instrument!(|c| {
        c.record_closure(sid);
        c.record_stack_push(sid);
    });
    stack.push(FollowEpsilon::Explore(sid));
    while let Some(frame) = stack.pop() {
        match frame {
            FollowEpsilon::RestoreCapture { slot, offset: pos } => {
                curr_slots[slot] = pos;
            }
            FollowEpsilon::Explore(sid) => {
                epsilon_closure_explore(vm, stack, curr_slots, next, input, sid);
            }
        }
    }
}

/// Explore all of the epsilon transitions out of 'sid'. This is mostly
/// split out from 'epsilon_closure' in order to clearly delineate
/// the actual work of computing an epsilon closure from the stack
/// book-keeping.
///
/// This will push any additional explorations needed on to 'stack'.
///
/// 'curr_slots' should refer to the slots for the currently active NFA
/// state. That is, the current state we are stepping through. These
/// slots are mutated in place as new 'Captures' states are traversed
/// during epsilon closure, but the slots are restored to their original
/// values once the full epsilon closure is completed. The ultimate use of
/// 'curr_slots' is to copy them to the corresponding 'next_slots', so that
/// the capturing group spans are forwarded from the currently active state
/// to the next.
///
/// 'next' refers to the next set of active states. Computing an epsilon
/// closure may increase the next set of active states.
///
/// 'input' refers to the caller's input configuration and 'at' refers to
/// the current position in the haystack. These are used to check whether
/// conditional epsilon transitions (like look-around) are satisfied at
/// the current position. If they aren't, then the epsilon closure won't
/// include them.
#[cfg_attr(feature = "perf-inline", inline(always))]
fn epsilon_closure_explore<C: Cursor>(
    vm: &PikeVM,
    stack: &mut Vec<FollowEpsilon>,
    curr_slots: &mut [Option<NonMaxUsize>],
    next: &mut ActiveStates,
    input: &mut Input<C>,
    mut sid: StateID,
) {
    // We can avoid pushing some state IDs on to our stack in precisely
    // the cases where a 'push(x)' would be immediately followed by a 'x
    // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
    // to be the next state ID we want to explore once we're done with
    // our initial exploration. In practice, this avoids a lot of stack
    // thrashing.
    loop {
        instrument!(|c| c.record_set_insert(sid));
        // Record this state as part of our next set of active states. If
        // we've already explored it, then no need to do it again.
        if !next.set.insert(sid) {
            return;
        }
        match *vm.get_nfa().state(sid) {
            State::Fail
            | State::Match { .. }
            | State::ByteRange { .. }
            | State::Sparse { .. }
            | State::Dense { .. } => {
                next.slot_table.for_state(sid).copy_from_slice(curr_slots);
                return;
            }
            State::Look { look, next } => {
                // OK because we don't permit building a searcher with a
                // Unicode word boundary if the requisite Unicode data is
                // unavailable.
                let (chunk, at) = input.look_around();
                if !vm.get_nfa().look_matcher().matches(look, chunk, at) {
                    return;
                }
                sid = next;
            }
            State::Union { ref alternates } => {
                sid = match alternates.get(0) {
                    None => return,
                    Some(&sid) => sid,
                };
                instrument!(|c| {
                    for &alt in &alternates[1..] {
                        c.record_stack_push(alt);
                    }
                });
                stack.extend(alternates[1..].iter().copied().rev().map(FollowEpsilon::Explore));
            }
            State::BinaryUnion { alt1, alt2 } => {
                sid = alt1;
                instrument!(|c| c.record_stack_push(sid));
                stack.push(FollowEpsilon::Explore(alt2));
            }
            State::Capture { next, slot, .. } => {
                // There's no need to do anything with slots that
                // ultimately won't be copied into the caller-provided
                // 'Captures' value. So we just skip dealing with them at
                // all.
                if slot.as_usize() < curr_slots.len() {
                    instrument!(|c| c.record_stack_push(sid));
                    stack.push(FollowEpsilon::RestoreCapture { slot, offset: curr_slots[slot] });
                    // OK because length of a slice must fit into an isize.
                    curr_slots[slot] = Some(NonMaxUsize::new(input.at()).unwrap());
                }
                sid = next;
            }
        }
    }
}
/// A cache represents mutable state that a [`PikeVM`] requires during a
/// search.
///
/// For a given [`PikeVM`], its corresponding cache may be created either via
/// [`PikeVM::create_cache`], or via [`Cache::new`]. They are equivalent in
/// every way, except the former does not require explicitly importing `Cache`.
///
/// A particular `Cache` is coupled with the [`PikeVM`] from which it
/// was created. It may only be used with that `PikeVM`. A cache and its
/// allocations may be re-purposed via [`Cache::reset`], in which case, it can
/// only be used with the new `PikeVM` (and not the old one).
#[derive(Clone, Debug)]
pub struct Cache {
    /// Stack used while computing epsilon closure. This effectively lets us
    /// move what is more naturally expressed through recursion to a stack
    /// on the heap.
    stack: Vec<FollowEpsilon>,
    /// The current active states being explored for the current byte in the
    /// haystack.
    curr: ActiveStates,
    /// The next set of states we're building that will be explored for the
    /// next byte in the haystack.
    next: ActiveStates,
}

impl Cache {
    /// Create a new [`PikeVM`] cache.
    ///
    /// A potentially more convenient routine to create a cache is
    /// [`PikeVM::create_cache`], as it does not require also importing the
    /// `Cache` type.
    ///
    /// If you want to reuse the returned `Cache` with some other `PikeVM`,
    /// then you must call [`Cache::reset`] with the desired `PikeVM`.
    pub fn new(re: &PikeVM) -> Cache {
        Cache { stack: vec![], curr: ActiveStates::new(re), next: ActiveStates::new(re) }
    }

    /// Reset this cache such that it can be used for searching with a
    /// different [`PikeVM`].
    ///
    /// A cache reset permits reusing memory already allocated in this cache
    /// with a different `PikeVM`.
    ///
    /// # Example
    ///
    /// This shows how to re-purpose a cache for use with a different `PikeVM`.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match};
    ///
    /// let re1 = PikeVM::new(r"\w")?;
    /// let re2 = PikeVM::new(r"\W")?;
    ///
    /// let mut cache = re1.create_cache();
    /// assert_eq!(
    ///     Some(Match::must(0, 0..2)),
    ///     re1.find_iter(&mut cache, "Δ").next(),
    /// );
    ///
    /// // Using 'cache' with re2 is not allowed. It may result in panics or
    /// // incorrect results. In order to re-purpose the cache, we must reset
    /// // it with the PikeVM we'd like to use it with.
    /// //
    /// // Similarly, after this reset, using the cache with 're1' is also not
    /// // allowed.
    /// cache.reset(&re2);
    /// assert_eq!(
    ///     Some(Match::must(0, 0..3)),
    ///     re2.find_iter(&mut cache, "☃").next(),
    /// );
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    pub fn reset(&mut self, re: &PikeVM) {
        self.curr.reset(re);
        self.next.reset(re);
    }

    /// Returns the heap memory usage, in bytes, of this cache.
    ///
    /// This does **not** include the stack size used up by this cache. To
    /// compute that, use `std::mem::size_of::<Cache>()`.
    pub fn memory_usage(&self) -> usize {
        use core::mem::size_of;
        (self.stack.len() * size_of::<FollowEpsilon>())
            + self.curr.memory_usage()
            + self.next.memory_usage()
    }

    /// Clears this cache. This should be called at the start of every search
    /// to ensure we start with a clean slate.
    ///
    /// This also sets the length of the capturing groups used in the current
    /// search. This permits an optimization where by 'SlotTable::for_state'
    /// only returns the number of slots equivalent to the number of slots
    /// given in the 'Captures' value. This may be less than the total number
    /// of possible slots, e.g., when one only wants to track overall match
    /// offsets. This in turn permits less copying of capturing group spans
    /// in the PikeVM.
    fn setup_search(&mut self, captures_slot_len: usize) {
        self.stack.clear();
        self.curr.setup_search(captures_slot_len);
        self.next.setup_search(captures_slot_len);
    }
}

/// Represents a stack frame for use while computing an epsilon closure.
///
/// (An "epsilon closure" refers to the set of reachable NFA states from a
/// single state without consuming any input. That is, the set of all epsilon
/// transitions not only from that single state, but from every other state
/// reachable by an epsilon transition as well. This is why it's called a
/// "closure." Computing an epsilon closure is also done during DFA
/// determinization! Compare and contrast the epsilon closure here in this
/// PikeVM and the one used for determinization in crate::util::determinize.)
///
/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
/// first traversal over all epsilon transitions from a particular state.
/// (A depth first traversal is important because it emulates the same priority
/// of matches that is typically found in backtracking regex engines.) This
/// depth first traversal is naturally expressed using recursion, but to avoid
/// a call stack size proportional to the size of a regex, we put our stack on
/// the heap instead.
///
/// This stack thus consists of call frames. The typical call frame is
/// `Explore`, which instructs epsilon closure to explore the epsilon
/// transitions from that state. (Subsequent epsilon transitions are then
/// pushed on to the stack as more `Explore` frames.) If the state ID being
/// explored has no epsilon transitions, then the capturing group slots are
/// copied from the original state that sparked the epsilon closure (from the
/// 'step' routine) to the state ID being explored. This way, capturing group
/// slots are forwarded from the previous state to the next.
///
/// The other stack frame, `RestoreCaptures`, instructs the epsilon closure to
/// set the position for a particular slot back to some particular offset. This
/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
/// set the offset of the slot indicated in `Capture` to the current offset,
/// and then push the old offset on to the stack as a `RestoreCapture` frame.
/// Thus, the new offset is only used until the epsilon closure reverts back to
/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
/// transition its "scope" to only states that come "after" it during depth
/// first traversal.
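///
/// As a sketch of that scoping: when the closure explores a `Capture`
/// transition for, say, slot 2 at haystack offset 5, it first pushes
/// `RestoreCapture { slot: 2, offset: <old value> }`, then writes `Some(5)`
/// into slot 2 and keeps exploring. When the stack unwinds back to that
/// frame, slot 2 is restored to its old value, so the write is visible only
/// to states explored "after" the capture in the depth first traversal.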
#[derive(Clone, Debug)]
enum FollowEpsilon {
    /// Explore the epsilon transitions from a state ID.
    Explore(StateID),
    /// Reset the given `slot` to the given `offset` (which might be `None`).
    RestoreCapture { slot: SmallIndex, offset: Option<NonMaxUsize> },
}

/// A set of active states used to "simulate" the execution of an NFA via the
/// PikeVM.
///
/// There are two sets of these used during NFA simulation. One set corresponds
/// to the "current" set of states being traversed for the current position
/// in a haystack. The other set corresponds to the "next" set of states being
/// built, which will become the new "current" set for the next position in the
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper on regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
///
/// In addition to representing a set of NFA states, this also maintains slot
/// values for each state. These slot values are what turn the NFA simulation
/// into the "Pike VM." Namely, they track capturing group values for each
/// state. During the computation of epsilon closure, we copy slot values from
/// states in the "current" set to the "next" set. Eventually, once a match
/// is found, the slot values for that match state are what we write to the
/// caller provided 'Captures' value.
#[derive(Clone, Debug)]
struct ActiveStates {
    /// The set of active NFA states. This set preserves insertion order, which
    /// is critical for simulating the match semantics of backtracking regex
    /// engines.
    set: SparseSet,
    /// The slots for every NFA state, where each slot stores a (possibly
    /// absent) offset. Every capturing group has two slots. One for a start
    /// offset and one for an end offset.
    slot_table: SlotTable,
}

impl ActiveStates {
    /// Create a new set of active states for the given PikeVM. The active
    /// states returned may only be used with the given PikeVM. (Use 'reset'
    /// to re-purpose the allocation for a different PikeVM.)
    fn new(re: &PikeVM) -> ActiveStates {
        let mut active = ActiveStates { set: SparseSet::new(0), slot_table: SlotTable::new() };
        active.reset(re);
        active
    }

    /// Reset this set of active states such that it can be used with the given
    /// PikeVM (and only that PikeVM).
    fn reset(&mut self, re: &PikeVM) {
        self.set.resize(re.get_nfa().states().len());
        self.slot_table.reset(re);
    }

    /// Return the heap memory usage, in bytes, used by this set of active
    /// states.
    ///
    /// This does not include the stack size of this value.
    fn memory_usage(&self) -> usize {
        self.set.memory_usage() + self.slot_table.memory_usage()
    }

    /// Setup this set of active states for a new search. The given slot
    /// length should be the number of slots in a caller provided 'Captures'
    /// (and may be zero).
    fn setup_search(&mut self, captures_slot_len: usize) {
        self.set.clear();
        self.slot_table.setup_search(captures_slot_len);
    }
}

/// A table of slots, where each row represent a state in an NFA. Thus, the
/// table has room for storing slots for every single state in an NFA.
///
/// This table is represented with a single contiguous allocation. In general,
/// the notion of "capturing group" doesn't really exist at this level of
/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
/// maps to a pair of slots, one for the start offset and one for the end
/// offset.) Slots are indexed by the 'Captures' NFA state.
///
/// N.B. Not every state actually needs a row of slots. Namely, states that
/// only have epsilon transitions currently never have anything written to
/// their rows in this table. Thus, the table is somewhat wasteful in its heap
/// usage. However, it is important to maintain fast random access by state
/// ID, which means one giant table tends to work well. RE2 takes a different
/// approach here and allocates each row as its own reference counted thing.
/// I explored such a strategy at one point here, but couldn't get it to work
/// well using entirely safe code. (To the ambitious reader: I encourage you to
/// re-litigate that experiment.) I very much wanted to stick to safe code, but
/// could be convinced otherwise if there was a solid argument and the safety
/// was encapsulated well.
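///
/// As a worked illustration (with a hypothetical `slots_per_state = 4`): the
/// row for state ID 3 lives at `table[12..16]`, and `for_state` hands back
/// only the first `slots_for_captures` entries of that row. The final
/// `slots_for_captures` entries of the table form the scratch row returned
/// by `all_absent`.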
#[derive(Clone, Debug)]
struct SlotTable {
    /// The actual table of offsets.
    table: Vec<Option<NonMaxUsize>>,
    /// The number of slots per state, i.e., the table's stride or the length
    /// of each row.
    slots_per_state: usize,
    /// The number of slots in the caller-provided 'Captures' value for the
    /// current search. Setting this to 'slots_per_state' is always correct,
    /// but may be wasteful.
    slots_for_captures: usize,
}

impl SlotTable {
    /// Create a new slot table.
    ///
    /// One should call 'reset' with the corresponding PikeVM before use.
    fn new() -> SlotTable {
        SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
    }

    /// Reset this slot table such that it can be used with the given PikeVM
    /// (and only that PikeVM).
    fn reset(&mut self, re: &PikeVM) {
        let nfa = re.get_nfa();
        self.slots_per_state = nfa.group_info().slot_len();
        // This is always correct, but may be reduced for a particular search
        // if a 'Captures' has fewer slots, e.g., none at all or only slots
        // for tracking the overall match instead of all slots for every
        // group.
        self.slots_for_captures =
            core::cmp::max(self.slots_per_state, nfa.pattern_len().checked_mul(2).unwrap());
        let len = nfa
            .states()
            .len()
            .checked_mul(self.slots_per_state)
            // Add space to account for scratch space used during a search.
            .and_then(|x| x.checked_add(self.slots_for_captures))
            // It seems like this could actually panic on legitimate inputs on
            // 32-bit targets, and very likely to panic on 16-bit. Should we
            // somehow convert this to an error? What about something similar
            // for the lazy DFA cache? If you're tripping this assert, please
            // file a bug.
            .expect("slot table length doesn't overflow");
        // This happens about as often as a regex is compiled, so it probably
        // should be at debug level, but I found it quite distracting and not
        // particularly useful.
        self.table.resize(len, None);
    }

    /// Return the heap memory usage, in bytes, used by this slot table.
    ///
    /// This does not include the stack size of this value.
    fn memory_usage(&self) -> usize {
        self.table.len() * core::mem::size_of::<Option<NonMaxUsize>>()
    }

    /// Perform any per-search setup for this slot table.
    ///
    /// In particular, this sets the length of the number of slots used in the
    /// 'Captures' given by the caller (if any at all). This number may be
    /// smaller than the total number of slots available, e.g., when the caller
    /// is only interested in tracking the overall match and not the spans of
    /// every matching capturing group. Only tracking the overall match can
    /// save a substantial amount of time copying capturing spans during a
    /// search.
    fn setup_search(&mut self, captures_slot_len: usize) {
        self.slots_for_captures = captures_slot_len;
    }

    /// Return a mutable slice of the slots for the given state.
    ///
    /// Note that the length of the slice returned may be less than the total
    /// number of slots available for this state. In particular, the length
    /// always matches the number of slots indicated via 'setup_search'.
    fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
        let i = sid.as_usize() * self.slots_per_state;
        &mut self.table[i..i + self.slots_for_captures]
    }

    /// Return a slice of slots of appropriate length where every slot offset
    /// is guaranteed to be absent. This is useful in cases where you need to
    /// compute an epsilon closure outside of the user supplied regex, and thus
    /// never want it to have any capturing slots set.
    fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
        let i = self.table.len() - self.slots_for_captures;
        &mut self.table[i..i + self.slots_for_captures]
    }
}

/// An iterator over all non-overlapping matches for a particular search.
///
/// The iterator yields a [`Match`] value until no more matches could be found.
///
/// The lifetime parameters are as follows:
///
/// * `'r` represents the lifetime of the PikeVM.
/// * `'c` represents the lifetime of the PikeVM's cache.
///
/// The `C` type parameter is the [`Cursor`] providing the haystack being
/// searched.
///
/// This iterator can be created with the [`find_iter`] routine.
#[derive(Debug)]
pub struct FindMatches<'r, 'c, C: Cursor> {
    re: &'r PikeVM,
    cache: &'c mut Cache,
    caps: Captures,
    it: iter::Searcher<C>,
}

impl<'r, 'c, C: Cursor> Iterator for FindMatches<'r, 'c, C> {
    type Item = Match;

    #[inline]
    fn next(&mut self) -> Option<Match> {
        // Splitting 'self' apart seems necessary to appease borrowck.
        let FindMatches { re, ref mut cache, ref mut caps, ref mut it } = *self;
        // 'advance' converts errors into panics, which is OK here because
        // the PikeVM can never return an error.
        it.advance(|input| {
            search(re, cache, input, caps);
            Ok(caps.get_match())
        })
    }
}
regex-cursor-0.1.4/src/engines.rs
pub mod dfa;
pub mod hybrid;
pub mod meta;
pub mod pikevm;
regex-cursor-0.1.4/src/input.rs
/*!
Types and routines that support the search APIs of most regex engines.

This sub-module isn't exposed directly, but rather, its contents are exported
at the crate root due to the universality of most of the types and routines in
this module.
*/

use std::ops::RangeBounds;

use regex_automata::{Anchored, Span};

use crate::cursor::{Cursor, IntoCursor};
use crate::util::utf8::is_boundary;

const MAX_CODEPOINT_LEN: usize = 4;

#[derive(Clone)]
pub struct Input<C: Cursor> {
    // span: Span,
    anchored: Anchored,
    earliest: bool,
    /// Position within the current chunk
    pub(crate) chunk_pos: usize,
    span: Span,
    look_behind_len: usize,
    /// the last 4 bytes before the current chunk
    look_around: [u8; MAX_CODEPOINT_LEN * 2],
    cursor: C,
}

impl<C: Cursor> Input<C> {
    /// Create a new search configuration for the given cursor.
    #[inline]
    pub fn new<T: IntoCursor<Cursor = C>>(cursor: T) -> Self {
        let cursor = cursor.into_cursor();
        let end = cursor.total_bytes().unwrap_or(usize::MAX);
        let start = cursor.offset();
        Input {
            anchored: Anchored::No,
            earliest: false,
            chunk_pos: 0,
            cursor: cursor.into_cursor(),
            // init with invalid utf8. We don't need to track
            // which of these have been filled since we only look
            // behind more than one byte in utf8 mode
            look_around: [255; 8],
            span: Span { start, end },
            look_behind_len: 0,
        }
    }

    /// Return a borrow of the current underlying chunk as a slice of bytes.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_cursor::Input;
    ///
    /// let input = Input::new("foobar".into());
    /// assert_eq!(b"foobar", input.chunk());
    /// ```
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn chunk(&self) -> &[u8] {
        self.cursor.chunk()
    }

    /// Return the offset of the start of the current chunk within the
    /// haystack.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_cursor::Input;
    ///
    /// let input = Input::new("foobar".into());
    /// assert_eq!(0, input.chunk_offset());
    /// ```
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn chunk_offset(&self) -> usize {
        self.cursor.offset()
    }

    /// Return the start position of this search.
    ///
    /// This is a convenience routine for `search.get_span().start()`.
    ///
    /// When [`Input::is_done`] is `false`, this is guaranteed to return
    /// an offset that is less than or equal to [`Input::end`]. Otherwise,
    /// the offset is one greater than [`Input::end`].
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let input = Input::new("foobar");
    /// assert_eq!(0, input.start());
    ///
    /// let input = Input::new("foobar").span(2..4);
    /// assert_eq!(2, input.start());
    /// ```
    #[inline]
    pub fn start(&self) -> usize {
        self.get_span().start
    }

    #[inline]
    pub fn clear_look_behind(&mut self) {
        self.look_around = [255; 8];
    }

    /// Return the end position of this search.
    ///
    /// This is a convenience routine for `search.get_span().end()`.
    ///
    /// This is guaranteed to return an offset that is a valid exclusive end
    /// bound for this input's haystack.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let input = Input::new("foobar");
    /// assert_eq!(6, input.end());
    ///
    /// let input = Input::new("foobar").span(2..4);
    /// assert_eq!(4, input.end());
    /// ```
    #[inline]
    pub fn end(&self) -> usize {
        self.span.end
    }

    #[inline(always)]
    pub fn get_chunk_end(&self) -> usize {
        let end = self.span.end - self.cursor.offset();
        end.min(self.chunk().len())
    }

    /// Return the span for this search configuration.
    ///
    /// If one was not explicitly set, then the span corresponds to the entire
    /// range of the haystack.
    ///
    /// When [`Input::is_done`] is `false`, the span returned is guaranteed
    /// to correspond to valid bounds for this input's haystack.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Input, Span};
    ///
    /// let input = Input::new("foobar");
    /// assert_eq!(Span { start: 0, end: 6 }, input.get_span());
    /// ```
    #[inline]
    pub fn get_span(&self) -> Span {
        self.span
    }

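    /// Copies the last (up to) `MAX_CODEPOINT_LEN` bytes of the current
    /// chunk into the look-behind half of `look_around`, so look-behind
    /// assertions keep working after advancing to the next chunk.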
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn set_look_behind(&mut self) {
        #[cold]
        fn copy_partial_look_behind(look_behind: &mut [u8; MAX_CODEPOINT_LEN * 2], chunk: &[u8]) {
            look_behind[..chunk.len()].copy_from_slice(chunk)
        }

        let chunk = self.cursor.chunk();
        let len = chunk.len();
        if len < MAX_CODEPOINT_LEN {
            copy_partial_look_behind(&mut self.look_around, chunk);
            self.look_behind_len = chunk.len();
        } else {
            self.look_behind_len = MAX_CODEPOINT_LEN;
            self.look_around[..MAX_CODEPOINT_LEN].copy_from_slice(&chunk[len - MAX_CODEPOINT_LEN..])
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn advance(&mut self) -> bool {
        let old_len = self.cursor.chunk().len();
        let advanced = self.cursor.advance();
        if advanced {
            self.chunk_pos = 0;
        } else if self.span.end > self.cursor.offset() + old_len {
            self.span.end = self.cursor.offset() + old_len;
        }
        advanced
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn advance_with_look_behind(&mut self) -> bool {
        self.set_look_behind();
        self.advance()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn backtrack(&mut self) -> bool {
        let backtracked = self.cursor.backtrack();
        if backtracked {
            self.chunk_pos = self.chunk().len();
        } else if self.cursor.offset() != 0 {
            unreachable!("cursor does not support backtracking {}", self.cursor.offset())
        }
        backtracked
    }

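    /// Returns the byte immediately before the current position, if any.
    /// At a chunk boundary this temporarily backtracks to the previous
    /// chunk to fill the look-behind buffer and then re-advances.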
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn ensure_look_behind(&mut self) -> Option<u8> {
        if self.chunk_pos == 0 {
            // move back to the last chunk to read the look behind
            if self.backtrack() {
                self.advance_with_look_behind();
                Some(self.look_around[self.look_behind_len - 1])
            } else {
                self.look_behind_len = 0;
                None
            }
        } else {
            self.chunk().get(self.chunk_pos - 1).copied()
        }
    }

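    /// Returns a window of bytes around the current position together with
    /// the index of the current position within that window. At a chunk
    /// boundary the window is assembled from the saved look-behind bytes
    /// plus the first bytes of the current chunk; otherwise it is simply
    /// the current chunk and `chunk_pos`.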
    pub fn look_around(&mut self) -> (&[u8], usize) {
        // TODO: cache look_ahead?
        if self.chunk_pos == 0 {
            #[cold]
            fn copy_partial_look_ahead(look_behind: &mut [u8], chunk: &[u8]) {
                look_behind[..chunk.len()].copy_from_slice(chunk)
            }

            let chunk = self.cursor.chunk();
            let look_around_len;
            if chunk.len() < MAX_CODEPOINT_LEN {
                look_around_len = self.look_behind_len + chunk.len();
                copy_partial_look_ahead(&mut self.look_around[self.look_behind_len..], chunk);
            } else {
                look_around_len = self.look_behind_len + MAX_CODEPOINT_LEN;
                self.look_around[self.look_behind_len..look_around_len]
                    .copy_from_slice(&chunk[..MAX_CODEPOINT_LEN])
            }
            (&self.look_around[..look_around_len], self.look_behind_len)
        } else {
            (self.chunk(), self.chunk_pos)
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn chunk_pos(&self) -> usize {
        self.chunk_pos
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn set_chunk_pos(&mut self, at: usize) {
        self.chunk_pos = at;
    }

    /// Sets the anchor mode of a search.
    ///
    /// When a search is anchored (so that's [`Anchored::Yes`] or
    /// [`Anchored::Pattern`]), a match must begin at the start of a search.
    /// When a search is not anchored (that's [`Anchored::No`]), regex engines
    /// will behave as if the pattern started with a `(?s-u:.)*?`. This prefix
    /// permits a match to appear anywhere.
    ///
    /// By default, the anchored mode is [`Anchored::No`].
    ///
    /// **WARNING:** this is subtly different than using a `^` at the start of
    /// your regex. A `^` forces a regex to match exclusively at the start of
    /// a chunk, regardless of where you begin your search. In contrast,
    /// anchoring a search will allow your regex to match anywhere in your
    /// chunk, but the match must start at the beginning of a search.
    ///
    /// For example, consider the chunk `aba` and the following searches:
    ///
    /// 1. The regex `^a` is compiled with `Anchored::No` and searches `aba`
    ///    starting at position `2`. Since `^` requires the match to start at
    ///    the beginning of the chunk and `2 > 0`, no match is found.
    /// 2. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
    ///    starting at position `2`. This reports a match at `[2, 3]` since
    ///    the match starts where the search started. Since there is no `^`,
    ///    there is no requirement for the match to start at the beginning of
    ///    the chunk.
    /// 3. The regex `a` is compiled with `Anchored::Yes` and searches `aba`
    ///    starting at position `1`. Since `b` corresponds to position `1` and
    ///    since the search is anchored, it finds no match. While the regex
    ///    matches at other positions, configuring the search to be anchored
    ///    requires that it only report a match that begins at the same offset
    ///    as the beginning of the search.
    /// 4. The regex `a` is compiled with `Anchored::No` and searches `aba`
    ///    starting at position `1`. Since the search is not anchored and
    ///    the regex does not start with `^`, the search executes as if there
    ///    is a `(?s:.)*?` prefix that permits it to match anywhere. Thus, it
    ///    reports a match at `[2, 3]`.
    ///
    /// Note that the [`Anchored::Pattern`] mode is like `Anchored::Yes`,
    /// except it only reports matches for a particular pattern.
    ///
    /// # Example
    ///
    /// This demonstrates the differences between an anchored search and
    /// a pattern that begins with `^` (as described in the above warning
    /// message).
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::pikevm::PikeVM,
    ///     Anchored, Match, Input,
    /// };
    ///
    /// let chunk = "aba";
    ///
    /// let re = PikeVM::new(r"^a")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let input = Input::new(chunk).span(2..3).anchored(Anchored::No);
    /// re.search(&mut cache, &input, &mut caps);
    /// // No match is found because 2 is not the beginning of the chunk,
    /// // which is what ^ requires.
    /// assert_eq!(None, caps.get_match());
    ///
    /// let re = PikeVM::new(r"a")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let input = Input::new(chunk).span(2..3).anchored(Anchored::Yes);
    /// re.search(&mut cache, &input, &mut caps);
    /// // An anchored search can still match anywhere in the chunk, it just
    /// // must begin at the start of the search which is '2' in this case.
    /// assert_eq!(Some(Match::must(0, 2..3)), caps.get_match());
    ///
    /// let re = PikeVM::new(r"a")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let input = Input::new(chunk).span(1..3).anchored(Anchored::Yes);
    /// re.search(&mut cache, &input, &mut caps);
    /// // No match is found since we start searching at offset 1 which
    /// // corresponds to 'b'. Since there is no '(?s:.)*?' prefix, no match
    /// // is found.
    /// assert_eq!(None, caps.get_match());
    ///
    /// let re = PikeVM::new(r"a")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    /// let input = Input::new(chunk).span(1..3).anchored(Anchored::No);
    /// re.search(&mut cache, &input, &mut caps);
    /// // Since anchored=no, an implicit '(?s:.)*?' prefix was added to the
    /// // pattern. Even though the search starts at 'b', the 'match anything'
    /// // prefix allows the search to match 'a'.
    /// let expected = Some(Match::must(0, 2..3));
    /// assert_eq!(expected, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn anchored(&mut self, mode: Anchored) -> &mut Self {
        self.set_anchored(mode);
        self
    }

    /// Whether to execute an "earliest" search or not.
    ///
    /// When running a non-overlapping search, an "earliest" search will return
    /// the match location as early as possible. For example, given a pattern
    /// of `foo[0-9]+` and a chunk of `foo12345`, a normal leftmost search
    /// will return `foo12345` as a match. But an "earliest" search for regex
    /// engines that support "earliest" semantics will return `foo1` as a
    /// match, since as soon as the first digit following `foo` is seen, it is
    /// known to have found a match.
    ///
    /// Note that "earliest" semantics generally depend on the regex engine.
    /// Different regex engines may determine there is a match at different
    /// points. So there is no guarantee that "earliest" matches will always
    /// return the same offsets for all regex engines. The "earliest" notion
    /// is really about when the particular regex engine determines there is
    /// a match rather than a consistent semantic unto itself. This is often
    /// useful for implementing "did a match occur or not" predicates, but
    /// sometimes the offset is useful as well.
    ///
    /// This is disabled by default.
    ///
    /// # Example
    ///
    /// This example shows the difference between "earliest" searching and
    /// normal searching.
    ///
    /// ```
    /// use regex_automata::{nfa::thompson::pikevm::PikeVM, Match, Input};
    ///
    /// let re = PikeVM::new(r"foo[0-9]+")?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    ///
    /// // A normal search implements greediness like you expect.
    /// let input = Input::new("foo12345");
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(Some(Match::must(0, 0..8)), caps.get_match());
    ///
    /// // When 'earliest' is enabled and the regex engine supports
    /// // it, the search will bail once it knows a match has been
    /// // found.
    /// let input = Input::new("foo12345").earliest(true);
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(Some(Match::must(0, 0..4)), caps.get_match());
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn earliest(&mut self, yes: bool) -> &mut Self {
        self.set_earliest(yes);
        self
    }

    /// Set the anchor mode of a search.
    ///
    /// This is like [`Input::anchored`], except it mutates the search
    /// configuration in place.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Anchored, Input, PatternID};
    ///
    /// let mut input = Input::new("foobar");
    /// assert_eq!(Anchored::No, input.get_anchored());
    ///
    /// let pid = PatternID::must(5);
    /// input.set_anchored(Anchored::Pattern(pid));
    /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
    /// ```
    #[inline]
    pub fn set_anchored(&mut self, mode: Anchored) {
        self.anchored = mode;
    }

    /// Set whether the search should execute in "earliest" mode or not.
    ///
    /// This is like [`Input::earliest`], except it mutates the search
    /// configuration in place.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let mut input = Input::new("foobar");
    /// assert!(!input.get_earliest());
    /// input.set_earliest(true);
    /// assert!(input.get_earliest());
    /// ```
    #[inline]
    pub fn set_earliest(&mut self, yes: bool) {
        self.earliest = yes;
    }

    /// Set the span for this search.
    ///
    /// This routine does not panic if the span given is not a valid range for
    /// this search's haystack. If this search is run with an invalid range,
    /// then the most likely outcome is that the actual search execution will
    /// panic.
    ///
    /// This routine is generic over how a span is provided. While
    /// a [`Span`] may be given directly, one may also provide a
    /// `std::ops::Range`. To provide anything supported by range
    /// syntax, use the [`Input::range`] method.
    ///
    /// The default span is the entire haystack.
    ///
    /// Note that [`Input::range`] overrides this method and vice versa.
    ///
    /// # Panics
    ///
    /// This panics if the given span does not correspond to valid bounds in
    /// the haystack or the termination of a search.
    ///
    /// # Example
    ///
    /// This example shows how the span of the search can impact whether a
    /// match is reported or not. This is particularly relevant for look-around
    /// operators, which might take things outside of the span into account
    /// when determining whether they match.
    ///
    /// ```
    /// # if cfg!(miri) { return Ok(()); } // miri takes too long
    /// use regex_automata::{
    ///     nfa::thompson::pikevm::PikeVM,
    ///     Match, Input,
    /// };
    ///
    /// // Look for 'at', but as a distinct word.
    /// let re = PikeVM::new(r"\bat\b")?;
    /// let mut cache = re.create_cache();
    /// let mut caps = re.create_captures();
    ///
    /// // Our haystack contains 'at', but not as a distinct word.
    /// let haystack = "batter";
    ///
    /// // A standard search finds nothing, as expected.
    /// let input = Input::new(haystack);
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(None, caps.get_match());
    ///
    /// // But if we wanted to search starting at position '1', we might
    /// // slice the haystack. If we do this, it's impossible for the \b
    /// // anchors to take the surrounding context into account! And thus,
    /// // a match is produced.
    /// let input = Input::new(&haystack[1..3]);
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(Some(Match::must(0, 0..2)), caps.get_match());
    ///
    /// // But if we specify the span of the search instead of slicing the
    /// // haystack, then the regex engine can "see" outside of the span
    /// // and resolve the anchors correctly.
    /// let input = Input::new(haystack).span(1..3);
    /// re.search(&mut cache, &input, &mut caps);
    /// assert_eq!(None, caps.get_match());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// This may seem a little ham-fisted, but this scenario tends to come up
    /// if some other regex engine found the match span and now you need to
    /// re-process that span to look for capturing groups. (e.g., Run a faster
    /// DFA first, find a match, then run the PikeVM on just the match span to
    /// resolve capturing groups.) In order to implement that sort of logic
    /// correctly, you need to set the span on the search instead of slicing
    /// the haystack directly.
    ///
    /// The other advantage of using this routine to specify the bounds of the
    /// search is that the match offsets are still reported in terms of the
    /// original haystack. For example, the second search in the example above
    /// reported a match at position `0`, even though `at` starts at offset
    /// `1` because we sliced the haystack.
    #[inline]
    pub fn span<S: Into<Span>>(&mut self, span: S) -> &mut Input<C> {
        self.set_span(span);
        self
    }

    /// Set the starting offset for the span for this search configuration.
    ///
    /// This is a convenience routine for only mutating the start of a span
    /// without having to set the entire span.
    ///
    /// # Panics
    ///
    /// This panics if the span resulting from the new start position does not
    /// correspond to valid bounds in the haystack or the termination of a
    /// search.
    ///
    #[inline]
    pub fn set_start(&mut self, start: usize) {
        self.set_span(Span { start, ..self.get_span() });
    }

    /// Set the ending offset for the span for this search configuration.
    ///
    /// This is a convenience routine for only mutating the end of a span
    /// without having to set the entire span.
    ///
    /// # Panics
    ///
    /// This panics if the span resulting from the new end position does not
    /// correspond to valid bounds in the haystack or the termination of a
    /// search.
    #[inline]
    pub fn set_end(&mut self, end: usize) {
        self.set_span(Span { end, ..self.get_span() });
    }

    /// Like `Input::span`, but accepts any range instead.
    ///
    /// This routine does not panic if the range given is not a valid range for
    /// this search's haystack. If this search is run with an invalid range,
    /// then the most likely outcome is that the actual search execution will
    /// panic.
    ///
    /// The default range is the entire haystack.
    ///
    /// Note that [`Input::span`] overrides this method and vice versa.
    ///
    /// # Panics
    ///
    /// This routine will panic if the given range could not be converted
    /// to a valid [`Range`]. For example, this would panic when given
    /// `0..=usize::MAX` since it cannot be represented using a half-open
    /// interval in terms of `usize`.
    ///
    /// This also panics if the given range does not correspond to valid bounds
    /// in the haystack or the termination of a search.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let input = Input::new("foobar");
    /// assert_eq!(0..6, input.get_range());
    ///
    /// let input = Input::new("foobar").range(2..=4);
    /// assert_eq!(2..5, input.get_range());
    /// ```
    #[inline]
    pub fn range<R: RangeBounds<usize>>(mut self, range: R) -> Input<C> {
        self.set_range(range);
        self
    }

    /// Set the span for this search configuration.
    ///
    /// This is like the [`Input::span`] method, except this mutates the
    /// span in place.
    ///
    /// This routine is generic over how a span is provided. While
    /// a [`Span`] may be given directly, one may also provide a
    /// `std::ops::Range`.
    ///
    /// # Panics
    ///
    /// This panics if the given span does not correspond to valid bounds in
    /// the haystack or the termination of a search.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let mut input = Input::new("foobar");
    /// assert_eq!(0..6, input.get_range());
    /// input.set_span(2..4);
    /// assert_eq!(2..4, input.get_range());
    /// ```
    #[inline]
    pub fn set_span<S: Into<Span>>(&mut self, span: S) {
        let span = span.into();
        assert!(span.start <= span.end.saturating_add(1), "invalid span {:?}", span,);
        if self.at() < span.start {
            self.move_to(span.start);
        } else if !self.is_done() && self.at() > span.end {
            self.move_to(span.end);
        }
        self.span = span;
    }

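    /// Moves the position of the search to `at`, backtracking or advancing
    /// the cursor chunk by chunk until `at` falls within the current chunk
    /// (or the cursor cannot move any further).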
    #[inline]
    pub(crate) fn move_to(&mut self, at: usize) {
        debug_assert!(at <= self.span.end.saturating_add(1));
        // TODO: fastpath for O(log N) chunk jumping
        while at < self.cursor.offset() {
            self.backtrack();
        }
        if at != self.cursor.offset() {
            while at >= self.cursor.offset() + self.chunk().len() {
                let advanced = self.advance();
                if !advanced {
                    let chunk_pos = (at - self.cursor.offset()).min(self.chunk().len());
                    self.set_chunk_pos(chunk_pos);
                    return;
                }
            }
        }
        self.set_chunk_pos(at - self.cursor.offset());
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn at(&self) -> usize {
        self.cursor.offset() + self.chunk_pos()
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn with<T>(&mut self, f: impl FnOnce(&mut Self) -> T) -> T {
        let anchored = self.anchored;
        let earliest = self.earliest;
        let span = self.span;
        let res = f(self);
        self.set_span(span);
        self.set_earliest(earliest);
        self.set_anchored(anchored);
        res
    }

    // #[cfg_attr(feature = "perf-inline", inline(always))]
    // pub(crate) fn try_clone(&self) -> Option<Input<C>> {
    //     let res = Input {
    //         cursor: self.cursor.try_clone()?,
    //         anchored: self.anchored,
    //         earliest: self.earliest,
    //         offset: self.offset,
    //         chunk_pos: self.chunk_pos,
    //         span: self.span,
    //         look_behind: self.look_behind,
    //     };
    //     Some(res)
    // }

    /// Set the span for this search configuration given any range.
    ///
    /// This is like the [`Input::range`] method, except this mutates the
    /// span in place.
    ///
    /// This routine does not panic if the range given is not a valid range for
    /// this search's haystack. If this search is run with an invalid range,
    /// then the most likely outcome is that the actual search execution will
    /// panic.
    ///
    /// # Panics
    ///
    /// This routine will panic if the given range could not be converted
    /// to a valid [`Range`]. For example, this would panic when given
    /// `0..=usize::MAX` since it cannot be represented using a half-open
    /// interval in terms of `usize`.
    ///
    /// This also panics if the given span does not correspond to valid bounds
    /// in the haystack or the termination of a search.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let mut input = Input::new("foobar");
    /// assert_eq!(0..6, input.get_range());
    /// input.set_range(2..=4);
    /// assert_eq!(2..5, input.get_range());
    /// ```
    #[inline]
    pub fn set_range<R: RangeBounds<usize>>(&mut self, range: R) {
        use core::ops::Bound;

        // It's a little weird to convert ranges into spans, and then spans
        // back into ranges when we actually slice the haystack. Because
        // of that process, we always represent everything as a half-open
        // interval. Therefore, handling things like m..=n is a little awkward.
        let start = match range.start_bound() {
            Bound::Included(&i) => i,
            // Can this case ever happen? Range syntax doesn't support it...
            Bound::Excluded(&i) => i.checked_add(1).unwrap(),
            Bound::Unbounded => 0,
        };
        let end = match range.end_bound() {
            Bound::Included(&i) => i.checked_add(1).unwrap(),
            Bound::Excluded(&i) => i,
            Bound::Unbounded => self.cursor.total_bytes().unwrap_or(usize::MAX),
        };
        self.set_span(Span { start, end });
    }

    /// Return the anchored mode for this search configuration.
    ///
    /// If no anchored mode was set, then it defaults to [`Anchored::No`].
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::{Anchored, Input, PatternID};
    ///
    /// let mut input = Input::new("foobar");
    /// assert_eq!(Anchored::No, input.get_anchored());
    ///
    /// let pid = PatternID::must(5);
    /// input.set_anchored(Anchored::Pattern(pid));
    /// assert_eq!(Anchored::Pattern(pid), input.get_anchored());
    /// ```
    #[inline]
    pub fn get_anchored(&self) -> Anchored {
        self.anchored
    }

    /// Return whether this search should execute in "earliest" mode.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let input = Input::new("foobar");
    /// assert!(!input.get_earliest());
    /// ```
    #[inline]
    pub fn get_earliest(&self) -> bool {
        self.earliest
    }

    /// Return true if and only if this search can never return any other
    /// matches.
    ///
    /// This occurs when the start position of this search is greater than the
    /// end position of the search.
    ///
    /// # Example
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let mut input = Input::new("foobar");
    /// assert!(!input.is_done());
    /// input.set_start(6);
    /// assert!(!input.is_done());
    /// input.set_start(7);
    /// assert!(input.is_done());
    /// ```
    #[inline]
    pub fn is_done(&self) -> bool {
        self.get_span().start > self.get_span().end
    }

    /// Returns true if and only if the current position in this search's
    /// chunk falls on a valid UTF-8 encoded codepoint boundary.
    ///
    /// If the chunk is not valid UTF-8, then the behavior of this routine
    /// is unspecified.
    ///
    /// # Example
    ///
    /// This shows where codepoint boundaries do and don't exist in valid
    /// UTF-8.
    ///
    /// ```
    /// use regex_automata::Input;
    ///
    /// let input = Input::new("☃");
    /// assert!(input.is_char_boundary(0));
    /// assert!(!input.is_char_boundary(1));
    /// assert!(!input.is_char_boundary(2));
    /// assert!(input.is_char_boundary(3));
    /// assert!(!input.is_char_boundary(4));
    /// ```
    #[inline]
    pub fn is_char_boundary(&mut self) -> bool {
        is_boundary(self.chunk(), self.chunk_pos)
    }
}

impl<C: Cursor> core::fmt::Debug for Input<C> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        use regex_automata::util::escape::DebugHaystack;

        f.debug_struct("Input")
            .field("chunk", &DebugHaystack(self.chunk()))
            .field("anchored", &self.anchored)
            .field("earliest", &self.earliest)
            .field("chunk_pos", &self.chunk_pos)
            .field("chunk_offset", &self.cursor.offset())
            .field("span", &self.span)
            .finish()
    }
}
regex-cursor-0.1.4/src/lib.rs000064400000000000000000000050771046102023000141670ustar  00000000000000/*!
This crate provides routines for searching **discontiguous strings** for matches
of a regular expression (aka "regex"). It is based on regex-automata and
most of the code is adapted from the various crates in the
[regex](https://github.com/rust-lang/regex) repository.

It is intended as a prototype for upstream support for "streaming regex". The
cursor based API in this crate is very similar to the API already exposed by
`regex`/`regex-automata`. To that end a generic `Cursor` trait is provided that
collections can implement.

A sketch of the cursor API is shown below. The string is yielded in multiple
byte chunks. Calling advance moves the cursor to the next chunk. Calling
backtrack moves the cursor one chunk back. Backtracking is required by this
crate. That makes it unsuitable for searching fully unbuffered streams like
bytes sent over a TCP connection.

```
pub trait Cursor {
    fn chunk(&self) -> &[u8] { .. }
    fn advance(&mut self) -> bool { .. }
    fn backtrack(&mut self) -> bool { .. }
}
```
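
As an illustration (not part of this crate), a cursor over a haystack
pre-split into `&[&[u8]]` chunks might look like the following minimal
sketch. It assumes the `utf8_aware`, `total_bytes` and `offset` methods of
this crate's `Cursor` trait and a non-empty chunk list:

```
use regex_cursor::Cursor;

struct SliceCursor<'a> {
    chunks: &'a [&'a [u8]],
    idx: usize,
    // byte offset of the start of the current chunk
    offset: usize,
}

impl Cursor for SliceCursor<'_> {
    fn chunk(&self) -> &[u8] {
        self.chunks[self.idx]
    }
    fn utf8_aware(&self) -> bool {
        // chunk boundaries may split UTF-8 codepoints
        false
    }
    fn advance(&mut self) -> bool {
        if self.idx + 1 < self.chunks.len() {
            self.offset += self.chunks[self.idx].len();
            self.idx += 1;
            true
        } else {
            false
        }
    }
    fn backtrack(&mut self) -> bool {
        if self.idx == 0 {
            return false;
        }
        self.idx -= 1;
        self.offset -= self.chunks[self.idx].len();
        true
    }
    fn total_bytes(&self) -> Option<usize> {
        Some(self.chunks.iter().map(|c| c.len()).sum())
    }
    fn offset(&self) -> usize {
        self.offset
    }
}
```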

Working on this crate showed me that regex engines backtrack a lot more than
expected, with most functionality fundamentally requiring backtracking. For
network use cases that do not buffer their input, the primary use case would
likely be detecting a match (without necessarily requiring the matched byte
range).
Such usecases can be covered by manually feeding bytes into the hybrid and DFA
engines from the regex-automata crate. This approach also has the advantage
of allowing the caller to pause the match (async) while waiting for more data
allowing the caller to drive the search instead of the engine itself.

The only part of this crate that could be applied to the fully streaming case is
the streaming PikeVM implementation. However, there are some limitations:
* Only a single search can be run, since the PikeVM may look ahead multiple
  bytes to disambiguate alternative matches.
* Prefilters longer than one byte cannot work.
* UTF-8 mode cannot be supported (empty matches may occur within the UTF-8
  encoding of a codepoint).

Currently, the PikeVM implementation is not written with this use case in mind
and may call backtrack unnecessarily; that could be addressed in the future,
but the first point in particular is very limiting. The PikeVM also does not
allow the user to drive the search, so it would, for example, block on network
calls (no async support).
*/

#[cfg(feature = "ropey")]
pub use cursor::RopeyCursor;
pub use cursor::{Cursor, IntoCursor};
pub use input::Input;
pub use regex_automata;

mod cursor;
pub mod engines;
mod input;
mod literal;
mod util;

#[cfg(test)]
mod test_rope;
#[cfg(test)]
mod tests;
regex-cursor-0.1.4/src/literal/tests.rs000064400000000000000000000036261046102023000162150ustar  00000000000000use std::iter;

use proptest::proptest;
use regex_automata::util::prefilter::Prefilter;
use regex_automata::Span;

proptest! {
    #[test]
    fn matches(mut haystack: String, needle: String) {
        haystack = haystack.repeat(1024);
        let needles = &[needle.as_bytes()];
        let Some(prefilter) = Prefilter::new(regex_automata::MatchKind::All, needles) else {
            return Ok(())
        };
        let mut span = Span{ start: 0, end: haystack.len() };
        let iter1 = iter::from_fn(||{
            let res = prefilter.find(haystack.as_bytes(), span)?;
            span.start = res.end;
            Some(res)
        });
        let rope = ropey::Rope::from_str(&haystack);
        let mut input = crate::Input::new(&rope);
        let iter2 = iter::from_fn(||{
            let res = super::find(&prefilter, &mut input)?;
            input.move_to(res.end);
            Some(res)
        });
        crate::util::iter::prop_assert_eq(iter1, iter2)?;
    }

    #[test]
    fn matches_range(mut haystack: String, needle: String) {
        haystack = haystack.repeat(1024);
        let start = haystack.len() / 3;
        let end = 2*start;
        let needles = &[needle.as_bytes()];
        let Some(prefilter) = Prefilter::new(regex_automata::MatchKind::All, needles) else {
            return Ok(())
        };
        let mut span = Span{ start, end };
        let iter1 = iter::from_fn(||{
            let res = prefilter.find(haystack.as_bytes(), span)?;
            span.start = res.end;
            Some(res)
        });
        let rope = ropey::Rope::from_str(&haystack);
        let mut input = crate::Input::new(&rope).range(start..end);
        let iter2 = iter::from_fn(||{
            let res = super::find(&prefilter, &mut input)?;
            assert!(res.end <= end);
            input.move_to(res.end);
            Some(res)
        });
        crate::util::iter::prop_assert_eq(iter1, iter2)?;
    }
}
regex-cursor-0.1.4/src/literal.rs000064400000000000000000000226311046102023000150500ustar  00000000000000pub use regex_automata::util::prefilter::Prefilter;
pub use regex_automata::MatchKind;
use regex_automata::Span;

use crate::cursor::Cursor;
use crate::Input;

use FindChunkResult::*;

#[cfg(test)]
mod tests;

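/// Searches for the first occurrence of any of the prefilter's needles,
/// accumulating bytes across chunk boundaries so that needles spanning two
/// or more chunks are still found. Returns the match span in terms of
/// offsets into the full haystack.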
pub fn find<C: Cursor>(prefilter: &Prefilter, input: &mut Input<C>) -> Option<Span> {
    // TODO optimize this:
    // * potentially use an array vec
    // * special case max_needle_len==2 (no accumulating necessary)
    // * special case max_needle_len==min_needle_len (no ambiguity)
    if prefilter.max_needle_len() == 1 {
        find_1(prefilter, input)
    } else {
        find_n::<true, C>(prefilter, input)
    }
}

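/// Checks whether one of the prefilter's needles matches at the very start
/// of the search span (a "prefix" match), buffering up to `max_needle_len()`
/// bytes from subsequent chunks when the current chunk is too short to
/// decide.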
pub fn prefix<C: Cursor>(prefilter: &Prefilter, input: &mut Input<C>) -> Option<Span> {
    let mut offset = input.chunk_offset();
    let chunk_pos = input.chunk_pos();
    let chunk_end = input.get_chunk_end();
    let mut res = if prefilter.max_needle_len() <= chunk_end - chunk_pos {
        prefilter
            .prefix(input.chunk(), Span { start: input.chunk_pos(), end: input.get_chunk_end() })?
    } else {
        offset += chunk_pos;
        let mut buf =
            Vec::with_capacity(prefilter.max_needle_len().min(input.end() - input.start()));
        buf.extend_from_slice(&input.chunk()[chunk_pos..chunk_end]);
        while input.advance() && !buf.spare_capacity_mut().is_empty() {
            let mut chunk_len = input.chunk().len().min(buf.spare_capacity_mut().len());
            if input.chunk_offset() + chunk_len <= input.end() {
                buf.extend_from_slice(&input.chunk()[..chunk_len]);
            } else {
                chunk_len = input.end() - input.chunk_offset();
                buf.extend_from_slice(&input.chunk()[..chunk_len]);
                break;
            }
        }
        prefilter.prefix(&buf, Span { start: 0, end: buf.len() })?
    };
    res.start += offset;
    res.end += offset;
    Some(res)
}

fn find_1<C: Cursor>(prefilter: &Prefilter, input: &mut Input<C>) -> Option<Span> {
    debug_assert_eq!(prefilter.max_needle_len(), 1);
    let first_haystack = &input.chunk();
    if let Some(mut res) = prefilter
        .find(first_haystack, Span { start: input.chunk_pos(), end: input.get_chunk_end() })
    {
        res.start += input.chunk_offset();
        res.end += input.chunk_offset();
        return Some(res);
    }
    while input.chunk_offset() + input.chunk().len() < input.end() && input.advance() {
        let haystack = &input.chunk();
        let Some(mut res) = prefilter.find(haystack, Span { start: 0, end: input.get_chunk_end() })
        else {
            continue;
        };

        res.start += input.chunk_offset();
        res.end += input.chunk_offset();
        return Some(res);
    }
    None
}

fn find_n<const AMBIGUITY: bool, C: Cursor>(
    prefilter: &Prefilter,
    input: &mut Input<C>,
) -> Option<Span> {
    // helper macro to make the code more readable
    macro_rules! find_chunk {
        ($chunk:expr, $buf_offset:expr, |$start: ident, $off: ident| $disambiguate: expr) => {
            match find_n_chunk::<AMBIGUITY>(prefilter, $chunk, $buf_offset) {
                FindChunkResult::Match(span) => return Some(span),
                FindChunkResult::AmbiguousMatch { $start, $off } if AMBIGUITY => {
                    return Some($disambiguate);
                }
                _ => {}
            }
        };
    }

    // simple case: the search stays within a single chunk. Special-casing
    // this is nice for performance and keeps the rest of the logic simpler
    let first_chunk_end = input.get_chunk_end();
    let mut first_chunk = input.chunk();
    if first_chunk.len() != first_chunk_end {
        if let Some(mut res) =
            prefilter.find(first_chunk, Span { start: input.chunk_pos(), end: first_chunk_end })
        {
            res.start += input.chunk_offset();
            res.end += input.chunk_offset();
            return Some(res);
        }
        return None;
    }
    first_chunk = &first_chunk[input.chunk_pos()..];

    let max_needle_len = prefilter.max_needle_len();
    let carry_over = max_needle_len - 1;
    let sliding_window = 2 * carry_over;
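
    // A match can straddle a chunk boundary by at most `max_needle_len - 1`
    // bytes, so that many bytes from the end of each chunk are carried over
    // and re-scanned together with the bytes that follow. For example, with
    // the needle "abc" (carry_over == 2) and chunks ["xxab", "cx"], the
    // carried-over "ab" allows the boundary-straddling match "abc" to be
    // found.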

    // again special case the first chunk since that is the hot path
    // and also keeps the logic below simpler
    let mut buf_offset = input.chunk_offset() + input.chunk_pos();
    if first_chunk.len() >= sliding_window {
        find_chunk!(first_chunk, input.chunk_offset() + input.chunk_pos(), |start, off| {
            let mut buf = Vec::with_capacity(max_needle_len);
            buf.extend_from_slice(&first_chunk[start..]);
            disambiguate_match(prefilter, input, buf, off)
        });
        let carry_over_start = first_chunk.len() - carry_over;
        first_chunk = &first_chunk[carry_over_start..];
        buf_offset += carry_over_start;
    }
    let mut buf = Vec::with_capacity(2 * sliding_window);
    buf.extend_from_slice(first_chunk);

    while input.chunk_offset() + input.chunk().len() < input.end() && input.advance() {
        debug_assert!(buf.len() < sliding_window, "{} {sliding_window}", buf.len());
        let mut chunk = &input.chunk()[..input.get_chunk_end()];
        let mut chunk_offset = input.chunk_offset();
        // this condition only triggers until we have filled the buffer for the first time
        if buf.len() < carry_over {
            if buf.len() + chunk.len() <= carry_over {
                buf.extend_from_slice(chunk);
                continue;
            }
            let copied = carry_over - buf.len();
            buf.extend_from_slice(&chunk[..copied]);
            chunk = &chunk[copied..];
            chunk_offset += copied;
        }
        debug_assert!(buf.len() >= carry_over, "{} {carry_over}", buf.len());

        // if the chunk is too small, just continue accumulating; the condition
        // below implies chunk.len() <= sliding_window since buf.len() <=
        // sliding_window
        if buf.len() + chunk.len() <= buf.capacity() {
            buf.extend_from_slice(chunk);
            if buf.len() >= sliding_window {
                find_chunk!(&buf, buf_offset, |start, off| {
                    buf.drain(..start);
                    disambiguate_match(prefilter, input, buf, off)
                });
                let carry_over_start = buf.len() - carry_over;
                buf.drain(..carry_over_start);
                buf_offset += carry_over_start;
            }
            continue;
        }

        buf.extend_from_slice(&chunk[..carry_over]);
        find_chunk!(&buf, buf_offset, |start, off| {
            buf.drain(..start);
            buf.extend_from_slice(&chunk[..max_needle_len - buf.len()]);
            let mut res = prefilter.prefix(&buf, Span { start: 0, end: buf.len() }).unwrap();
            res.start += off;
            res.end += off;
            res
        });
        buf.clear();

        find_chunk!(chunk, chunk_offset, |start, off| {
            buf.extend_from_slice(&chunk[start..]);
            disambiguate_match(prefilter, input, buf, off)
        });
        let carry_over_start = chunk.len() - carry_over;
        buf_offset = chunk_offset + carry_over_start;
        buf.extend_from_slice(&chunk[carry_over_start..]);
    }

    if !buf.is_empty() {
        if let Some(mut res) = prefilter.find(&buf, Span { start: 0, end: buf.len() }) {
            res.start += buf_offset;
            res.end += buf_offset;
            return Some(res);
        }
    }
    None
}

#[must_use]
enum FindChunkResult {
    // the prefilter found no matches in this chunk
    NoMatch,
    // the prefilter found a match at the (offset corrected)
    // span in this chunk
    Match(Span),
    // the prefilter found a match that could be ambiguous
    // depending on what data follows the buffer
    AmbiguousMatch { start: usize, off: usize },
}

fn disambiguate_match<C: Cursor>(
    prefilter: &Prefilter,
    input: &mut Input<C>,
    mut buf: Vec<u8>,
    off: usize,
) -> Span {
    let max_needle_len = prefilter.max_needle_len();
    debug_assert!(buf.len() < max_needle_len);
    while input.advance() {
        let chunk_end = input.get_chunk_end().min(max_needle_len - buf.len());
        let chunk = input.chunk();
        if chunk_end != chunk.len() {
            buf.extend_from_slice(&chunk[..chunk_end]);
            break;
        }
        buf.extend_from_slice(chunk);
    }
    debug_assert!(buf.len() <= max_needle_len);
    let mut res = prefilter.prefix(&buf, Span { start: 0, end: buf.len() }).unwrap();
    res.start += off;
    res.end += off;
    res
}

fn find_n_chunk<const AMBIGUOUS: bool>(
    prefilter: &Prefilter,
    buf: &[u8],
    off: usize,
) -> FindChunkResult {
    debug_assert!(buf.len() >= 2 * prefilter.max_needle_len() - 2);
    if let Some(mut res) = prefilter.find(buf, Span { start: 0, end: buf.len() }) {
        // This condition is needed in case we find a match at the end of the
        // chunk. In that case there may be an even longer match once we
        // continue scanning. For example:
        //
        // pattern: "abc|a"
        // haystack: "xxabc" chunked into ["xxab", "c"]
        // match_kind: leftmost-first
        //
        // In the first chunk we would find a match for "a" but we
        // should be matching "abc" instead (since that is the first
        // alternation).
        if AMBIGUOUS && res.start + prefilter.max_needle_len() > buf.len() {
            AmbiguousMatch { start: res.start, off: res.start + off }
        } else {
            res.start += off;
            res.end += off;
            Match(res)
        }
    } else {
        NoMatch
    }
}
regex-cursor-0.1.4/src/test_rope.rs000064400000000000000000000113451046102023000154200ustar  00000000000000use std::cell::Cell;
use std::collections::hash_map::DefaultHasher;
use std::hash::Hasher;
use std::sync::atomic::{AtomicUsize, Ordering};

use regex_automata::util::escape::DebugHaystack;

use crate::util::utf8;
use crate::Cursor;

#[derive(Debug)]
struct XorShift64Star {
    state: Cell<u64>,
}

impl XorShift64Star {
    fn new() -> Self {
        // Any non-zero seed will do -- this uses the hash of a global counter.
        let mut seed = 0;
        while seed == 0 {
            let mut hasher = DefaultHasher::new();
            static COUNTER: AtomicUsize = AtomicUsize::new(0);
            hasher.write_usize(COUNTER.fetch_add(1, Ordering::Relaxed));
            seed = hasher.finish();
        }

        XorShift64Star { state: Cell::new(seed) }
    }

    fn next(&self) -> u64 {
        let mut x = self.state.get();
        debug_assert_ne!(x, 0);
        x ^= x >> 12;
        x ^= x << 25;
        x ^= x >> 27;
        self.state.set(x);
        x.wrapping_mul(0x2545_f491_4f6c_dd1d)
    }

    /// Return a value from `0..n`.
    fn next_usize(&self, n: usize) -> usize {
        (self.next() % n as u64) as usize
    }
}

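/// A cursor over `haystack` that serves pseudo-randomly sized chunks
/// (1 to 250 bytes), always split at UTF-8 boundaries. Used to exercise
/// the engines with arbitrary chunk layouts.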
#[derive(Debug)]
pub(crate) struct RandomSlices<'a> {
    haystack: &'a [u8],
    pos: usize,
    size: usize,
    ran: XorShift64Star,
}

impl<'a> RandomSlices<'a> {
    pub fn new(haystack: &'a [u8]) -> Self {
        let mut res = RandomSlices { haystack, pos: 0, size: 0, ran: XorShift64Star::new() };
        res.advance();
        res
    }
}

impl Cursor for RandomSlices<'_> {
    fn chunk(&self) -> &[u8] {
        debug_assert_eq!(self.haystack.is_empty(), self.size == 0);
        &self.haystack[self.pos..self.pos + self.size]
    }

    fn utf8_aware(&self) -> bool {
        true
    }

    fn advance(&mut self) -> bool {
        if self.pos + self.size == self.haystack.len() {
            return false;
        }
        let new_start = self.pos + self.size;
        let mut tries = u16::MAX;
        loop {
            let next_size = self.ran.next_usize(250) + 1;
            let new_end = (new_start + next_size).min(self.haystack.len());
            if utf8::is_boundary(self.haystack, new_end) {
                self.pos = new_start;
                self.size = new_end - new_start;
                break;
            }
            if tries == 0 {
                panic!("faild to advance at {} {:?}", self.pos, DebugHaystack(self.haystack))
            }
            tries -= 1;
        }
        true
    }

    fn backtrack(&mut self) -> bool {
        if self.pos == 0 {
            return false;
        }
        let mut tries = u16::MAX;
        let new_end = self.pos;
        loop {
            let next_size = self.ran.next_usize(250) + 1;
            let new_start = new_end.saturating_sub(next_size);
            if utf8::is_boundary(self.haystack, new_start) {
                self.pos = new_start;
                self.size = new_end - new_start;
                break;
            }
            if tries == 0 {
                panic!("faild to backtrack at {} {:?}", self.pos, DebugHaystack(self.haystack))
            }
            tries -= 1;
        }
        true
    }

    fn total_bytes(&self) -> Option<usize> {
        Some(self.haystack.len())
    }

    fn offset(&self) -> usize {
        self.pos
    }
}

#[derive(Debug)]
pub(crate) struct SingleByteChunks<'a> {
    haystack: &'a [u8],
    pos: usize,
    end: usize,
}

impl<'a> SingleByteChunks<'a> {
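    /// Creates a cursor that yields one chunk per UTF-8 codepoint
    /// (one byte per chunk for ASCII haystacks).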
    pub fn new(haystack: &'a [u8]) -> Self {
        Self {
            haystack,
            pos: 0,
            end: (1..haystack.len())
                .find(|&i| utf8::is_boundary(haystack, i))
                .unwrap_or(haystack.len()),
        }
    }
}

impl Cursor for SingleByteChunks<'_> {
    fn chunk(&self) -> &[u8] {
        debug_assert!(utf8::is_boundary(self.haystack, self.pos) || self.pos == 0);
        debug_assert!(utf8::is_boundary(self.haystack, self.end) || self.end == 0);
        &self.haystack[self.pos..self.end]
    }

    fn utf8_aware(&self) -> bool {
        true
    }

    fn advance(&mut self) -> bool {
        if self.end < self.haystack.len() {
            self.pos = self.end;
            self.end = (self.end + 1..self.haystack.len())
                .find(|&i| utf8::is_boundary(self.haystack, i))
                .unwrap_or(self.haystack.len());
            true
        } else {
            false
        }
    }

    fn backtrack(&mut self) -> bool {
        if self.pos != 0 {
            self.end = self.pos;
            self.pos =
                (0..self.pos).rev().find(|&i| utf8::is_boundary(self.haystack, i)).unwrap_or(0);
            true
        } else {
            false
        }
    }

    fn total_bytes(&self) -> Option<usize> {
        Some(self.haystack.len())
    }

    fn offset(&self) -> usize {
        self.pos
    }
}
regex-cursor-0.1.4/src/tests.rs000064400000000000000000000207011046102023000145520ustar  00000000000000use crate::{test_rope::SingleByteChunks, Input};

use {
    crate::engines::meta::{self, Regex},
    anyhow::Result,
    regex_automata::util::syntax,
    regex_automata::MatchKind,
    regex_test::{CompiledRegex, Match, RegexTest, SearchKind, Span, TestResult, TestRunner},
};

fn suite() -> anyhow::Result<regex_test::RegexTests> {
    let mut tests = regex_test::RegexTests::new();
    macro_rules! load {
        ($name:expr) => {{
            const DATA: &[u8] = include_bytes!(concat!("../../regex/testdata/", $name, ".toml"));
            tests.load_slice($name, DATA)?;
        }};
    }

    load!("anchored");
    load!("bytes");
    load!("crazy");
    load!("crlf");
    load!("earliest");
    load!("empty");
    load!("expensive");
    load!("flags");
    load!("iter");
    load!("leftmost-all");
    load!("line-terminator");
    load!("misc");
    load!("multiline");
    load!("no-unicode");
    load!("overlapping");
    load!("regression");
    load!("set");
    load!("substring");
    load!("unicode");
    load!("utf8");
    load!("word-boundary");
    load!("word-boundary-special");
    load!("fowler/basic");
    load!("fowler/nullsubexpr");
    load!("fowler/repetition");

    Ok(tests)
}

/// Configure a regex_automata::Input with the given test configuration.
fn create_input(test: &regex_test::RegexTest) -> crate::Input<SingleByteChunks<'_>> {
    use regex_automata::Anchored;

    let bounds = test.bounds();
    let anchored = if test.anchored() { Anchored::Yes } else { Anchored::No };
    let mut input = crate::Input::new(crate::test_rope::SingleByteChunks::new(test.haystack()))
        .range(bounds.start..bounds.end);
    input.anchored(anchored);
    input
}

/// Convert capture matches into the test suite's capture values.
///
/// The given captures must represent a valid match, where the first capturing
/// group has a non-None span. Otherwise this panics.
fn testify_captures(caps: &regex_automata::util::captures::Captures) -> regex_test::Captures {
    assert!(caps.is_match(), "expected captures to represent a match");
    let spans =
        caps.iter().map(|group| group.map(|m| regex_test::Span { start: m.start, end: m.end }));
    // These unwraps are OK because we assume our 'caps' represents a match,
    // and a match always gives a non-zero number of groups with the first
    // group being non-None.
    regex_test::Captures::new(caps.pattern().unwrap().as_usize(), spans).unwrap()
}

const BLACKLIST: &[&str] = &[
    // These 'earliest' tests are blacklisted because the meta searcher doesn't
    // give the same offsets that the test expects. This is legal because the
    // 'earliest' routines don't guarantee a particular match offset other
    // than "the earliest the regex engine can report a match." Some regex
    // engines will quit earlier than others. The backtracker, for example,
    // can't really quit before finding the full leftmost-first match. Many of
    // the literal searchers also don't have the ability to quit fully or it's
    // otherwise not worth doing. (A literal searcher not quitting as early as
    // possible usually means looking at a few more bytes. That's no biggie.)
    "earliest/",
];

const RUNS: usize = 1;
/// Tests the default configuration of the meta regex engine.
#[test]
fn default() -> Result<()> {
    let builder = Regex::builder();
    let mut runner = TestRunner::new()?;
    runner
        .expand(&["is_match", "find", "captures"], |test| test.compiles())
        .blacklist_iter(BLACKLIST);
    for _ in 0..RUNS {
        runner.test_iter(suite()?.iter(), compiler(builder.clone()));
    }
    runner.assert();
    Ok(())
}

#[cfg(feature = "ropey")]
#[test]
fn rope_one_past_end() -> Result<()> {
    use crate::RopeyCursor;

    let builder = Regex::builder()
        .syntax(syntax::Config::new().case_insensitive(true).multi_line(true))
        .build("git nix");
    let rope = ropey::Rope::from_str("x");
    builder.unwrap().find(Input::new(RopeyCursor::at(rope.slice(..), 1)).range(1..));
    Ok(())
}

/// Tests the default configuration minus the full DFA.
#[test]
fn no_dfa() -> Result<()> {
    let mut builder = Regex::builder();
    builder.configure(Regex::config().dfa(false));
    let mut runner = TestRunner::new()?;
    runner
        .expand(&["is_match", "find", "captures"], |test| test.compiles())
        .blacklist_iter(BLACKLIST);
    for _ in 0..RUNS {
        runner.test_iter(suite()?.iter(), compiler(builder.clone()));
    }
    runner.assert();
    Ok(())
}

/// Tests the default configuration minus the full DFA and lazy DFA.
#[test]
fn no_dfa_hybrid() -> Result<()> {
    let mut builder = Regex::builder();
    builder.configure(Regex::config().dfa(false).hybrid(false));
    let mut runner = TestRunner::new()?;
    runner
        .expand(&["is_match", "find", "captures"], |test| test.compiles())
        .blacklist_iter(BLACKLIST);
    for _ in 0..RUNS {
        runner.test_iter(suite()?.iter(), compiler(builder.clone()));
    }
    runner.assert();
    Ok(())
}

fn compiler(
    mut builder: meta::Builder,
) -> impl FnMut(&RegexTest, &[String]) -> Result<CompiledRegex> {
    move |test, regexes| {
        if !configure_meta_builder(test, &mut builder) {
            return Ok(CompiledRegex::skip());
        }
        // println!("{} {builder:?}", test.full_name());
        let re = builder.build_many(regexes)?;
        Ok(CompiledRegex::compiled(move |test| -> TestResult { run_test(&re, test) }))
    }
}

fn run_test(re: &Regex, test: &RegexTest) -> TestResult {
    let mut input = create_input(test);
    match test.additional_name() {
        "is_match" => TestResult::matched(re.is_match(input)),
        "find" => match test.search_kind() {
            SearchKind::Earliest => {
                input.earliest(true);
                TestResult::matches(
                    re.find_iter(input).take(test.match_limit().unwrap_or(std::usize::MAX)).map(
                        |m| Match {
                            id: m.pattern().as_usize(),
                            span: Span { start: m.start(), end: m.end() },
                        },
                    ),
                )
            }
            SearchKind::Leftmost => TestResult::matches(
                re.find_iter(input).take(test.match_limit().unwrap_or(std::usize::MAX)).map(|m| {
                    Match {
                        id: m.pattern().as_usize(),
                        span: Span { start: m.start(), end: m.end() },
                    }
                }),
            ),
            SearchKind::Overlapping => TestResult::skip(),
        },
        "captures" => match test.search_kind() {
            SearchKind::Earliest => {
                input.earliest(true);
                let it = re
                    .captures_iter(input)
                    .take(test.match_limit().unwrap_or(std::usize::MAX))
                    .map(|caps| testify_captures(&caps));
                TestResult::captures(it)
            }
            SearchKind::Leftmost => {
                let it = re
                    .captures_iter(input)
                    .take(test.match_limit().unwrap_or(std::usize::MAX))
                    .map(|caps| testify_captures(&caps));
                TestResult::captures(it)
            }
            SearchKind::Overlapping => {
                // There is no overlapping regex API that supports captures.
                TestResult::skip()
            }
        },
        name => TestResult::fail(&format!("unrecognized test name: {}", name)),
    }
}

/// Configures the given regex builder with all relevant settings on the given
/// regex test.
///
/// If the regex test has a setting that is unsupported, then this returns
/// false (implying the test should be skipped).
fn configure_meta_builder(test: &RegexTest, builder: &mut meta::Builder) -> bool {
    let match_kind = match test.match_kind() {
        regex_test::MatchKind::All => MatchKind::All,
        regex_test::MatchKind::LeftmostFirst => MatchKind::LeftmostFirst,
        regex_test::MatchKind::LeftmostLongest => return false,
    };
    let meta_config = Regex::config()
        .match_kind(match_kind)
        .utf8_empty(test.utf8())
        .line_terminator(test.line_terminator());
    builder.configure(meta_config).syntax(config_syntax(test));
    true
}

/// Configuration of the regex parser from a regex test.
fn config_syntax(test: &RegexTest) -> syntax::Config {
    syntax::Config::new()
        .case_insensitive(test.case_insensitive())
        .unicode(test.unicode())
        .utf8(test.utf8())
        .line_terminator(test.line_terminator())
}
regex-cursor-0.1.4/src/util/empty.rs000064400000000000000000000323161046102023000155300ustar  00000000000000/*!
This module provides helper routines for dealing with zero-width matches.

The main problem being solved here is this:

1. The caller wants to search something that they know is valid UTF-8, such
as a Rust `&str`.
2. The regex used by the caller can match the empty string. For example, `a*`.
3. The caller should never get match offsets returned that occur within the
encoding of a UTF-8 codepoint. It is logically incorrect, and also means that,
e.g., slicing the `&str` at those offsets will lead to a panic.

So the question here is, how do we prevent the caller from getting match
offsets that split a codepoint? For example, strictly speaking, the regex `a*`
matches `☃` at the positions `[0, 0]`, `[1, 1]`, `[2, 2]` and `[3, 3]` since
the UTF-8 encoding of `☃` is `\xE2\x98\x83`. In particular, the `NFA` that
underlies all of the matching engines in this crate doesn't have anything in
its state graph that prevents matching between UTF-8 code units. Indeed, any
engine derived from the `NFA` will match at those positions by virtue of the
fact that the `NFA` is byte oriented. That is, its transitions are defined over
bytes and the matching engines work by proceeding one byte at a time.

(An alternative architecture would be to define the transitions in an `NFA`
over codepoints, or `char`. And then make the matching engines proceed by
decoding one codepoint at a time. This is a viable strategy, but it doesn't
work for DFA matching engines because designing a fast and memory efficient
transition table for an alphabet as large as Unicode is quite difficult. More
to the point, the top-level `regex` crate supports matching on arbitrary bytes
when Unicode mode is disabled and one is searching a `&[u8]`. So in that case,
you can't just limit yourself to decoding codepoints and matching those. You
really do need to be able to follow byte oriented transitions on the `NFA`.)

In an older version of the regex crate, we handled this case not in the regex
engine, but in the iterators over matches. Namely, since this case only arises
when the match is empty, we "just" incremented the next starting position
of the search by `N`, where `N` is the length of the codepoint encoded at
the current position. The alternative or more "natural" solution of just
incrementing by `1` would result in executing a search of `a*` on `☃` like
this:

* Start search at `0`.
* Found match at `[0, 0]`.
* Next start position is `0`.
* To avoid an infinite loop, since it's an empty match, increment by `1`.
* Start search at `1`.
* Found match at `[1, 1]`. Oops.

But if we instead incremented by `3` (the length in bytes of `☃`), then we get
the following:

* Start search at `0`.
* Found match at `[0, 0]`.
* Next start position is `0`.
* To avoid an infinite loop, since it's an empty match, increment by `3`.
* Start search at `3`.
* Found match at `[3, 3]`.

And we get the correct result. But does this technique work in all cases?
Crucially, it requires that a zero-width match that splits a codepoint never
occurs beyond the starting position of the search. Because if it did, merely
incrementing the start position by the number of bytes in the codepoint at
the current position wouldn't be enough. A zero-width match could just occur
anywhere. It turns out that it is _almost_ true. We can convince ourselves by
looking at all possible patterns that can match the empty string:

* Patterns like `a*`, `a{0}`, `(?:)`, `a|` and `|a` all unconditionally match
the empty string. That is, assuming there isn't an `a` at the current position,
they will all match the empty string at the start of a search. There is no way
to move past it because any other match would not be "leftmost."
* `^` only matches at the beginning of the haystack, where the start position
is `0`. Since we know we're searching valid UTF-8 (if it isn't valid UTF-8,
then this entire problem goes away because it implies your string type supports
invalid UTF-8 and thus must deal with offsets that not only split a codepoint
but occur in entirely invalid UTF-8 somehow), it follows that `^` never matches
between the code units of a codepoint because the start of a valid UTF-8 string
is never within the encoding of a codepoint.
* `$` basically the same logic as `^`, but for the end of a string. A valid
UTF-8 string can't have an incomplete codepoint at the end of it.
* `(?m:^)` follows similarly to `^`, but it can match immediately following
a `\n`. However, since a `\n` is always a codepoint itself and can never
appear within a codepoint, it follows that the position immediately following
a `\n` in a string that is valid UTF-8 is guaranteed to not be between the
code units of another codepoint. (One caveat here is that the line terminator
for multi-line anchors can now be changed to any arbitrary byte, including
things like `\x98` which might occur within a codepoint. However, this wasn't
supported by the old regex crate. If it were, it would pose the same problems as
`(?-u:\B)`, as we'll discuss below.)
* `(?m:$)` a similar argument as for `(?m:^)`. The only difference is that a
`(?m:$)` matches just before a `\n`. But the same argument applies.
* `(?Rm:^)` and `(?Rm:$)` weren't supported by the old regex crate, but the
CRLF aware line anchors follow a similar argument as for `(?m:^)` and `(?m:$)`.
Namely, they only ever match at a boundary where one side is either a
`\r` or a `\n`, neither of which can occur within a codepoint.
* `\b` only matches at positions where both sides are valid codepoints, so
this cannot split a codepoint.
* `\B`, like `\b`, also only matches at positions where both sides are valid
codepoints. So this cannot split a codepoint either.
* `(?-u:\b)` matches only at positions where at least one side of it is an ASCII
word byte. Since ASCII bytes cannot appear as code units in non-ASCII codepoints
(one of the many amazing qualities of UTF-8), it follows that this too cannot
split a codepoint.
* `(?-u:\B)` finally represents a problem. It can match between *any* two
bytes that are either both word bytes or non-word bytes. Since code units like
`\xE2` and `\x98` (from the UTF-8 encoding of `☃`) are both non-word bytes,
`(?-u:\B)` will match at the position between them.

Thus, our approach of incrementing one codepoint at a time after seeing an
empty match is flawed because `(?-u:\B)` can result in an empty match that
splits a codepoint at a position past the starting point of a search. For
example, searching `(?-u:\B)` on `a☃` would produce the following matches: `[2,
2]`, `[3, 3]` and `[4, 4]`. The positions at `0` and `1` don't match because
they correspond to word boundaries since `a` is an ASCII word byte.

So what did the old regex crate do to avoid this? It banned `(?-u:\B)` from
regexes that could match `&str`. That might sound extreme, but a lot of other
things were banned too. For example, all of `(?-u:.)`, `(?-u:[^a])` and
`(?-u:\W)` can match invalid UTF-8 too, including individual code units within a
codepoint. The key difference is that those expressions could never produce an
empty match. That ban happens when translating an `Ast` to an `Hir`, because
that process can reason about whether an `Hir` can produce *non-empty* matches
at invalid UTF-8 boundaries. Bottom line though is that we side-stepped the
`(?-u:\B)` issue by banning it.

If banning `(?-u:\B)` were the only issue with the old regex crate's approach,
then I probably would have kept it. `\B` is rarely used, so it's not such a big
deal to have to work-around it. However, the problem with the above approach
is that it doesn't compose. The logic for avoiding splitting a codepoint only
lived in the iterator, which means if anyone wants to implement their own
iterator over regex matches, they have to deal with this extremely subtle edge
case to get full correctness.

Instead, in this crate, we take the approach of pushing this complexity down
to the lowest layers of each regex engine. The approach is pretty simple:

* If this corner case doesn't apply, don't do anything. (For example, if UTF-8
mode isn't enabled or if the regex cannot match the empty string.)
* If an empty match is reported, explicitly check if it splits a codepoint.
* If it doesn't, we're done, return the match.
* If it does, then ignore the match and re-run the search.
* Repeat the above process until the end of the haystack is reached or a match
is found that doesn't split a codepoint or isn't zero width.
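
In rough pseudocode, that loop looks like this (a sketch of the idea only; the
actual routines below also handle anchored searches and reverse iteration):

```text
result = find(input)
while result is an empty match that splits a codepoint:
    if the haystack is exhausted: return no match
    advance the search start by one byte
    result = find(input)
return result
```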

And that's pretty much what this module provides. Every regex engine uses these
methods in their lowest level public APIs, but just above the layer where
their internal engine is used. That way, all regex engines can be arbitrarily
composed without worrying about handling this case, and iterators don't need to
handle it explicitly.

(It turns out that a new feature I added, support for changing the line
terminator in a regex to any arbitrary byte, also provokes the above problem.
Namely, the byte could be invalid UTF-8 or a UTF-8 continuation byte. So that
support would need to be limited or banned when UTF-8 mode is enabled, just
like we did for `(?-u:\B)`. But thankfully our more robust approach in this
crate handles that case just fine too.)
*/

use regex_automata::MatchError;

use crate::cursor::Cursor;
use crate::input::Input;

#[cold]
#[inline(never)]
pub(crate) fn skip_splits_fwd<T, C: Cursor, F>(
    input: &mut Input<C>,
    init_value: T,
    match_offset: usize,
    find: F,
) -> Result<Option<T>, MatchError>
where
    F: FnMut(&mut Input<C>) -> Result<Option<(T, usize)>, MatchError>,
{
    skip_splits(true, input, match_offset, init_value, find)
}

#[cold]
#[inline(never)]
pub(crate) fn skip_splits_rev<T, C: Cursor, F>(
    input: &mut Input<C>,
    init_value: T,
    match_offset: usize,
    find: F,
) -> Result<Option<T>, MatchError>
where
    F: FnMut(&mut Input<C>) -> Result<Option<(T, usize)>, MatchError>,
{
    skip_splits(false, input, match_offset, init_value, find)
}

fn skip_splits<T, C: Cursor, F>(
    forward: bool,
    input: &mut Input<C>,
    match_offset: usize,
    init_value: T,
    mut find: F,
) -> Result<Option<T>, MatchError>
where
    F: FnMut(&mut Input<C>) -> Result<Option<(T, usize)>, MatchError>,
{
    input.move_to(match_offset);
    // If our config says to do an anchored search, then we're definitely
    // done. We just need to determine whether we have a valid match or
    // not. If we don't, then we're not allowed to continue, so we report
    // no match.
    //
    // This is actually quite a subtle correctness thing. The key here is
    // that if we got an empty match that splits a codepoint after doing an
    // anchored search in UTF-8 mode, then that implies that we must have
    // *started* the search at a location that splits a codepoint. This
    // follows from the fact that if a match is reported from an anchored
    // search, then the start offset of the match *must* match the start
    // offset of the search.
    //
    // It also follows that no other non-empty match is possible. For
    // example, you might write a regex like '(?:)|SOMETHING' and start its
    // search in the middle of a codepoint. The first branch is an empty
    // regex that will bubble up a match at the first position, and then
    // get rejected here and report no match. But what if 'SOMETHING' could
    // have matched? We reason that such a thing is impossible, because
    // if it does, it must report a match that starts in the middle of a
    // codepoint. This in turn implies that a match is reported whose span
    // does not correspond to valid UTF-8, and this breaks the promise
    // made when UTF-8 mode is enabled. (That promise *can* be broken, for
    // example, by enabling UTF-8 mode but building an NFA by hand that
    // produces non-empty matches that span invalid UTF-8. This is an unchecked
    // but documented precondition violation of UTF-8 mode, and is documented
    // to have unspecified behavior.)
    //
    // I believe this actually means that if an anchored search is run, and
    // UTF-8 mode is enabled and the start position splits a codepoint,
    // then it is correct to immediately report no match without even
    // executing the regex engine. But it doesn't really seem worth writing
    // out that case in every regex engine to save a tiny bit of work in an
    // extremely pathological case, so we just handle it here.
    if input.get_anchored().is_anchored() {
        return Ok(input.is_char_boundary().then_some(init_value));
    }
    // Otherwise, we have an unanchored search, so just keep looking for
    // matches until we have one that does not split a codepoint or we hit
    // EOI.
    let mut value = init_value;
    while !input.is_char_boundary() {
        if forward {
            // The unwrap is OK here because overflowing usize while
            // iterating over a slice is impossible, as it would require
            // a slice of length greater than isize::MAX, which is itself
            // impossible.
            input.set_start(input.start().checked_add(1).unwrap());
        } else {
            input.set_end(match input.end().checked_sub(1) {
                None => return Ok(None),
                Some(end) => end,
            });
        }
        match find(input)? {
            None => return Ok(None),
            Some((new_value, new_match_end)) => {
                value = new_value;
                input.move_to(new_match_end)
            }
        }
    }
    Ok(Some(value))
}
regex-cursor-0.1.4/src/util/iter.rs000064400000000000000000000662051046102023000153410ustar  00000000000000/*!
Generic helpers for iteration of matches from a regex engine in a haystack.

The principle type in this module is a [`Searcher`]. A `Searcher` provides
its own lower level iterator-like API in addition to methods for constructing
types that implement `Iterator`. The documentation for `Searcher` explains a
bit more about why these different APIs exist.

Currently, this module supports iteration over any regex engine that works
with the [`HalfMatch`], [`Match`] or [`Captures`] types.
*/

use std::fmt::Debug;

use regex_automata::{HalfMatch, Match, MatchError};

use crate::cursor::Cursor;
use crate::input::Input;

/// A searcher for creating iterators and performing lower level iteration.
///
/// This searcher encapsulates the logic required for finding all successive
/// non-overlapping matches in a haystack. In theory, iteration would look
/// something like this:
///
/// 1. Setting the start position to `0`.
/// 2. Execute a regex search. If no match, end iteration.
/// 3. Report the match and set the start position to the end of the match.
/// 4. Go back to (2).
///
/// And if this were indeed the case, it's likely that `Searcher` wouldn't
/// exist. Unfortunately, because a regex may match the empty string, the above
/// logic won't work for all possible regexes. Namely, if an empty match is
/// found, then step (3) would set the start position of the search to the
/// position it was at. Thus, iteration would never end.
///
/// Instead, a `Searcher` knows how to detect these cases and forcefully
/// advance iteration in the case of an empty match that overlaps with a
/// previous match.
///
/// If you know that your regex cannot match any empty string, then the simple
/// algorithm described above will work correctly.
///
/// When possible, prefer the iterators defined on the regex engine you're
/// using. This tries to abstract over the regex engine and is thus a bit more
/// unwieldy to use.
///
/// In particular, a `Searcher` is not itself an iterator. Instead, it provides
/// `advance` routines that permit moving the search along explicitly. It also
/// provides various routines, like [`Searcher::into_matches_iter`], that
/// accept a closure (representing how a regex engine executes a search) and
/// returns a conventional iterator.
///
/// The type parameter comes from the [`Input`] type passed to
/// [`Searcher::new`]:
///
/// * `C` is the cursor type over the underlying haystack.
///
/// # Searcher vs Iterator
///
/// Why does a search type with "advance" APIs exist at all when we also have
/// iterators? Unfortunately, the reasoning behind this split is a complex
/// combination of the following things:
///
/// 1. While many of the regex engines expose their own iterators, it is also
/// nice to expose this lower level iteration helper because it permits callers
/// to provide their own `Input` configuration. Moreover, a `Searcher` can work
/// with _any_ regex engine instead of only the ones defined in this crate.
/// This way, everyone benefits from a shared iteration implementation.
/// 2. There are many different regex engines that, while they have the same
/// match semantics, they have slightly different APIs. Iteration is just
/// complex enough to want to share code, and so we need a way of abstracting
/// over those different regex engines. While we could define a new trait that
/// describes any regex engine search API, it would wind up looking very close
/// to a closure. While there may still be reasons for the more generic trait
/// to exist, for now and for the purposes of iteration, we use a closure.
/// Closures also provide a lot of easy flexibility at the call site, in that
/// they permit the caller to borrow any kind of state they want for use during
/// each search call.
/// 3. As a result of using closures, and because closures are anonymous types
/// that cannot be named, it is difficult to encapsulate them without both
/// costs to speed and added complexity to the public API. For example, in
/// defining an iterator type like
/// [`dfa::regex::FindMatches`](crate::dfa::regex::FindMatches),
/// if we use a closure internally, it's not possible to name this type in the
/// return type of the iterator constructor. Thus, the only way around it is
/// to erase the type by boxing it and turning it into a `Box<dyn FnMut(...)>`.
/// This boxed closure is unlikely to be inlined _and_ it infects the public
/// API in subtle ways. Namely, unless you declare the closure as implementing
/// `Send` and `Sync`, then the resulting iterator type won't implement it
/// either. But there are practical issues with requiring the closure to
/// implement `Send` and `Sync` that result in other API complexities that
/// are beyond the scope of this already long exposition.
/// 4. Some regex engines expose more complex match information than just
/// "which pattern matched" and "at what offsets." For example, the PikeVM
/// exposes match spans for each capturing group that participated in the
/// match. In such cases, it can be quite beneficial to reuse the capturing
/// group allocation on subsequent searches. A proper iterator doesn't permit
/// this API due to its interface, so it's useful to have something a bit lower
/// level that permits callers to amortize allocations while also reusing a
/// shared implementation of iteration. (See the documentation for
/// [`Searcher::advance`] for an example of using the "advance" API with the
/// PikeVM.)
///
/// What this boils down to is that there are "advance" APIs which require
/// handing a closure to it for every call, and there are also APIs to create
/// iterators from a closure. The former are useful for _implementing_
/// iterators or when you need more flexibility, while the latter are useful
/// for conveniently writing custom iterators on-the-fly.
///
/// # Example: iterating with captures
///
/// Several regex engines in this crate offer convenient iterator APIs over
/// [`Captures`] values. To do so, this requires allocating a new `Captures`
/// value for each iteration step. This can perhaps be more costly than you
/// might want. Instead of implementing your own iterator to avoid that
/// cost (which can be a little subtle if you want to handle empty matches
/// correctly), you can use this `Searcher` to do it for you:
///
/// ```
/// use regex_automata::{
///     nfa::thompson::pikevm::PikeVM,
///     util::iter::Searcher,
///     Input, Span,
/// };
///
/// let re = PikeVM::new("foo(?P<numbers>[0-9]+)")?;
/// let haystack = "foo1 foo12 foo123";
///
/// let mut caps = re.create_captures();
/// let mut cache = re.create_cache();
/// let mut matches = vec![];
/// let mut searcher = Searcher::new(Input::new(haystack));
/// while let Some(_) = searcher.advance(|input| {
///     re.search(&mut cache, input, &mut caps);
///     Ok(caps.get_match())
/// }) {
///     // The unwrap is OK since 'numbers' matches if the pattern matches.
///     matches.push(caps.get_group_by_name("numbers").unwrap());
/// }
/// assert_eq!(matches, vec![
///     Span::from(3..4),
///     Span::from(8..10),
///     Span::from(14..17),
/// ]);
///
/// # Ok::<(), Box<dyn std::error::Error>>(())
/// ```
pub struct Searcher<C: Cursor> {
    /// The input parameters to give to each regex engine call.
    ///
    /// The start position of the search is mutated during iteration.
    input: Input<C>,
    /// Records the end offset of the most recent match. This is necessary to
    /// handle a corner case for preventing empty matches from overlapping with
    /// the ending bounds of a prior match.
    last_match_end: Option<usize>,
}

impl<C: Cursor> Searcher<C> {
    /// Create a new fallible non-overlapping matches iterator.
    ///
    /// The given `input` provides the parameters (including the haystack),
    /// while the `finder` represents a closure that calls the underlying regex
    /// engine. The closure may borrow any additional state that is needed,
    /// such as a prefilter scanner.
    pub fn new(input: Input<C>) -> Searcher<C> {
        Searcher { input, last_match_end: None }
    }

    /// Returns the current `Input` used by this searcher.
    ///
    /// The `Input` returned is generally equivalent to the one given to
    /// [`Searcher::new`], but its start position may be different to reflect
    /// the start of the next search to be executed.
    pub fn input(&mut self) -> &mut Input<C> {
        &mut self.input
    }

    // /// Return the next half match for an infallible search if one exists, and
    // /// advance to the next position.
    // ///
    // /// This is like `try_advance_half`, except errors are converted into
    // /// panics.
    // ///
    // /// # Panics
    // ///
    // /// If the given closure returns an error, then this panics. This is useful
    // /// when you know your underlying regex engine has been configured to not
    // /// return an error.
    // ///
    // /// # Example
    // ///
    // /// This example shows how to use a `Searcher` to iterate over all matches
    // /// when using a DFA, which only provides "half" matches.
    // ///
    // /// ```
    // /// use regex_automata::{
    // ///     hybrid::dfa::DFA,
    // ///     util::iter::Searcher,
    // ///     HalfMatch, Input,
    // /// };
    // ///
    // /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
    // /// let mut cache = re.create_cache();
    // ///
    // /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
    // /// let mut it = Searcher::new(input);
    // ///
    // /// let expected = Some(HalfMatch::must(0, 10));
    // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
    // /// assert_eq!(expected, got);
    // ///
    // /// let expected = Some(HalfMatch::must(0, 21));
    // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
    // /// assert_eq!(expected, got);
    // ///
    // /// let expected = Some(HalfMatch::must(0, 32));
    // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
    // /// assert_eq!(expected, got);
    // ///
    // /// let expected = None;
    // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
    // /// assert_eq!(expected, got);
    // ///
    // /// # Ok::<(), Box<dyn std::error::Error>>(())
    // /// ```
    // ///
    // /// This correctly moves iteration forward even when an empty match occurs:
    // ///
    // /// ```
    // /// use regex_automata::{
    // ///     hybrid::dfa::DFA,
    // ///     util::iter::Searcher,
    // ///     HalfMatch, Input,
    // /// };
    // ///
    // /// let re = DFA::new(r"a|")?;
    // /// let mut cache = re.create_cache();
    // ///
    // /// let input = Input::new("abba");
    // /// let mut it = Searcher::new(input);
    // ///
    // /// let expected = Some(HalfMatch::must(0, 1));
    // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
    // /// assert_eq!(expected, got);
    // ///
    // /// let expected = Some(HalfMatch::must(0, 2));
    // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
    // /// assert_eq!(expected, got);
    // ///
    // /// let expected = Some(HalfMatch::must(0, 4));
    // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
    // /// assert_eq!(expected, got);
    // ///
    // /// let expected = None;
    // /// let got = it.advance_half(|input| re.try_search_fwd(&mut cache, input));
    // /// assert_eq!(expected, got);
    // ///
    // /// # Ok::<(), Box<dyn std::error::Error>>(())
    // /// ```
    // #[inline]
    // pub fn advance_half<F>(&mut self, finder: F) -> Option<HalfMatch>
    // where
    //     F: FnMut(&mut Input<C>) -> Result<Option<HalfMatch>, MatchError>,
    // {
    //     match self.try_advance_half(finder) {
    //         Ok(m) => m,
    //         Err(err) => panic!(
    //             "unexpected regex half find error: {}\n\
    //              to handle find errors, use 'try' or 'search' methods",
    //             err,
    //         ),
    //     }
    // }

    /// Return the next match for an infallible search if one exists, and
    /// advance to the next position.
    ///
    /// The search is advanced even in the presence of empty matches by
    /// forbidding empty matches from overlapping with any other match.
    ///
    /// This is like `try_advance`, except errors are converted into panics.
    ///
    /// # Panics
    ///
    /// If the given closure returns an error, then this panics. This is useful
    /// when you know your underlying regex engine has been configured to not
    /// return an error.
    ///
    /// # Example
    ///
    /// This example shows how to use a `Searcher` to iterate over all matches
    /// when using a regex based on lazy DFAs:
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::regex::Regex,
    ///     util::iter::Searcher,
    ///     Match, Input,
    /// };
    ///
    /// let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
    /// let mut cache = re.create_cache();
    ///
    /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
    /// let mut it = Searcher::new(input);
    ///
    /// let expected = Some(Match::must(0, 0..10));
    /// let got = it.advance(|input| re.try_search(&mut cache, input));
    /// assert_eq!(expected, got);
    ///
    /// let expected = Some(Match::must(0, 11..21));
    /// let got = it.advance(|input| re.try_search(&mut cache, input));
    /// assert_eq!(expected, got);
    ///
    /// let expected = Some(Match::must(0, 22..32));
    /// let got = it.advance(|input| re.try_search(&mut cache, input));
    /// assert_eq!(expected, got);
    ///
    /// let expected = None;
    /// let got = it.advance(|input| re.try_search(&mut cache, input));
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    ///
    /// This example shows the same as above, but with the PikeVM. This example
    /// is useful because it shows how to use this API even when the regex
    /// engine doesn't directly return a `Match`.
    ///
    /// ```
    /// use regex_automata::{
    ///     nfa::thompson::pikevm::PikeVM,
    ///     util::iter::Searcher,
    ///     Match, Input,
    /// };
    ///
    /// let re = PikeVM::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
    /// let (mut cache, mut caps) = (re.create_cache(), re.create_captures());
    ///
    /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
    /// let mut it = Searcher::new(input);
    ///
    /// let expected = Some(Match::must(0, 0..10));
    /// let got = it.advance(|input| {
    ///     re.search(&mut cache, input, &mut caps);
    ///     Ok(caps.get_match())
    /// });
    /// // Note that if we wanted to extract capturing group spans, we could
    /// // do that here with 'caps'.
    /// assert_eq!(expected, got);
    ///
    /// let expected = Some(Match::must(0, 11..21));
    /// let got = it.advance(|input| {
    ///     re.search(&mut cache, input, &mut caps);
    ///     Ok(caps.get_match())
    /// });
    /// assert_eq!(expected, got);
    ///
    /// let expected = Some(Match::must(0, 22..32));
    /// let got = it.advance(|input| {
    ///     re.search(&mut cache, input, &mut caps);
    ///     Ok(caps.get_match())
    /// });
    /// assert_eq!(expected, got);
    ///
    /// let expected = None;
    /// let got = it.advance(|input| {
    ///     re.search(&mut cache, input, &mut caps);
    ///     Ok(caps.get_match())
    /// });
    /// assert_eq!(expected, got);
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn advance<F>(&mut self, finder: F) -> Option<Match>
    where
        F: FnMut(&mut Input<C>) -> Result<Option<Match>, MatchError>,
    {
        match self.try_advance(finder) {
            Ok(m) => m,
            Err(err) => panic!(
                "unexpected regex find error: {}\n\
                 to handle find errors, use 'try' or 'search' methods",
                err,
            ),
        }
    }

    /// Return the next half match for a fallible search if one exists, and
    /// advance to the next position.
    ///
    /// This is like `advance_half`, except it permits callers to handle errors
    /// during iteration.
    #[inline]
    pub fn try_advance_half<F>(&mut self, mut finder: F) -> Result<Option<HalfMatch>, MatchError>
    where
        F: FnMut(&mut Input<C>) -> Result<Option<HalfMatch>, MatchError>,
    {
        let mut m = match finder(&mut self.input)? {
            None => return Ok(None),
            Some(m) => m,
        };
        if Some(m.offset()) == self.last_match_end {
            m = match self.handle_overlapping_empty_half_match(m, finder)? {
                None => return Ok(None),
                Some(m) => m,
            };
        }
        self.input.set_start(m.offset());
        self.last_match_end = Some(m.offset());
        Ok(Some(m))
    }

    /// Return the next match for a fallible search if one exists, and advance
    /// to the next position.
    ///
    /// This is like `advance`, except it permits callers to handle errors
    /// during iteration.
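    ///
    /// A sketch of fallible iteration (illustrative only; it mirrors the
    /// `advance` example above but surfaces errors to the caller):
    ///
    /// ```text
    /// while let Some(m) = searcher.try_advance(|input| re.try_search(&mut cache, input))? {
    ///     matches.push(m);
    /// }
    /// ```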
    #[inline]
    pub fn try_advance<F>(&mut self, mut finder: F) -> Result<Option<Match>, MatchError>
    where
        F: FnMut(&mut Input<C>) -> Result<Option<Match>, MatchError>,
    {
        let end = self.input.end();
        let mut m = match finder(&mut self.input)? {
            None => return Ok(None),
            Some(m) => m,
        };
        assert!(m.end() <= end);
        if m.is_empty() && Some(m.end()) == self.last_match_end {
            m = match self.handle_overlapping_empty_match(m, finder)? {
                None => return Ok(None),
                Some(m) => m,
            };
        }
        self.input.set_start(m.end());
        self.last_match_end = Some(m.end());
        Ok(Some(m))
    }

    /// Given a closure that executes a single search, return an iterator over
    /// all successive non-overlapping half matches.
    ///
    /// The iterator returned yields result values. If the underlying regex
    /// engine is configured to never return an error, consider calling
    /// [`TryHalfMatchesIter::infallible`] to convert errors into panics.
    ///
    /// # Example
    ///
    /// This example shows how to use a `Searcher` to create a proper
    /// iterator over half matches.
    ///
    /// ```
    /// use regex_automata::{
    ///     hybrid::dfa::DFA,
    ///     util::iter::Searcher,
    ///     HalfMatch, Input,
    /// };
    ///
    /// let re = DFA::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}")?;
    /// let mut cache = re.create_cache();
    ///
    /// let input = Input::new("2010-03-14 2016-10-08 2020-10-22");
    /// let mut it = Searcher::new(input).into_half_matches_iter(|input| {
    ///     re.try_search_fwd(&mut cache, input)
    /// });
    ///
    /// let expected = Some(Ok(HalfMatch::must(0, 10)));
    /// assert_eq!(expected, it.next());
    ///
    /// let expected = Some(Ok(HalfMatch::must(0, 21)));
    /// assert_eq!(expected, it.next());
    ///
    /// let expected = Some(Ok(HalfMatch::must(0, 32)));
    /// assert_eq!(expected, it.next());
    ///
    /// let expected = None;
    /// assert_eq!(expected, it.next());
    ///
    /// # Ok::<(), Box<dyn std::error::Error>>(())
    /// ```
    #[inline]
    pub fn into_half_matches_iter<F>(self, finder: F) -> TryHalfMatchesIter<C, F>
    where
        F: FnMut(&mut Input<C>) -> Result<Option<HalfMatch>, MatchError>,
    {
        TryHalfMatchesIter { it: self, finder }
    }

    /// Handles the special case of a match that begins where the previous
    /// match ended. Without this special handling, it'd be possible to get
    /// stuck where an empty match never results in forward progress. This
    /// also makes it more consistent with how presiding general purpose regex
    /// engines work.
    #[cold]
    #[inline(never)]
    fn handle_overlapping_empty_half_match<F>(
        &mut self,
        _: HalfMatch,
        mut finder: F,
    ) -> Result<Option<HalfMatch>, MatchError>
    where
        F: FnMut(&mut Input<C>) -> Result<Option<HalfMatch>, MatchError>,
    {
        // Since we are only here when 'm.offset()' matches the offset of the
        // last match, it follows that this must have been an empty match.
        // Since we both need to make progress *and* prevent overlapping
        // matches, we discard this match and advance the search by 1.
        //
        // Note that this may start a search in the middle of a codepoint. The
        // regex engines themselves are expected to deal with that and not
        // report any matches within a codepoint if they are configured in
        // UTF-8 mode.
        self.input.set_start(self.input.start().checked_add(1).unwrap());
        finder(&mut self.input)
    }

    /// Handles the special case of an empty match by ensuring that 1) the
    /// iterator always advances and 2) empty matches never overlap with other
    /// matches.
    ///
    /// (1) is necessary because we principally make progress by setting the
    /// starting location of the next search to the ending location of the last
    /// match. But if a match is empty, then this results in a search that does
    /// not advance and thus does not terminate.
    ///
    /// (2) is not strictly necessary, but makes intuitive sense and matches
    /// the presiding behavior of most general purpose regex engines. The
    /// "intuitive sense" here is that we want to report NON-overlapping
    /// matches. So for example, given the regex 'a|(?:)' against the haystack
    /// 'a', without the special handling, you'd get the matches [0, 1) and [1,
    /// 1), where the latter overlaps with the end bounds of the former.
    ///
    /// Note that we mark this cold and forcefully prevent inlining because
    /// handling empty matches like this is extremely rare and does require
    /// quite a bit of code, comparatively. Keeping this code out of the main
    /// iterator function keeps it smaller and more amenable to inlining
    /// itself.
    #[cold]
    #[inline(never)]
    fn handle_overlapping_empty_match<F>(
        &mut self,
        m: Match,
        mut finder: F,
    ) -> Result<Option<Match>, MatchError>
    where
        F: FnMut(&mut Input<C>) -> Result<Option<Match>, MatchError>,
    {
        assert!(m.is_empty());
        self.input.set_start(self.input.start().checked_add(1).unwrap());
        finder(&mut self.input)
    }
}

impl<C: Cursor> Debug for Searcher<C> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("Searcher")
            .field("input", &self.input)
            .field("last_match_end", &self.last_match_end)
            .finish()
    }
}

/// An iterator over all non-overlapping half matches for a fallible search.
///
/// The iterator yields a `Result<HalfMatch, MatchError>` value until no more
/// matches could be found.
///
/// The type parameters are as follows:
///
/// * `F` represents the type of a closure that executes the search.
///
/// The cursor type parameter comes from the [`Input`] type:
///
/// * `C` is the cursor over the underlying haystack.
///
/// When possible, prefer the iterators defined on the regex engine you're
/// using. This tries to abstract over the regex engine and is thus a bit more
/// unwieldy to use.
///
/// This iterator is created by [`Searcher::into_half_matches_iter`].
pub struct TryHalfMatchesIter<C: Cursor, F> {
    it: Searcher<C>,
    finder: F,
}

// impl<C: Cursor, F> TryHalfMatchesIter<C, F> {
//     /// Return an infallible version of this iterator.
//     ///
//     /// Any item yielded that corresponds to an error results in a panic. This
//     /// is useful if your underlying regex engine is configured in a way that
//     /// it is guaranteed to never return an error.
//     pub fn infallible(self) -> HalfMatchesIter<C, F> {
//         HalfMatchesIter(self)
//     }

//     /// Returns the current `Input` used by this iterator.
//     ///
//     /// The `Input` returned is generally equivalent to the one used to
//     /// construct this iterator, but its start position may be different to
//     /// reflect the start of the next search to be executed.
//     pub fn input(&mut self) -> &mut Input<C> {
//         self.it.input()
//     }
// }

impl<C: Cursor, F> Iterator for TryHalfMatchesIter<C, F>
where
    F: FnMut(&mut Input<C>) -> Result<Option<HalfMatch>, MatchError>,
{
    type Item = Result<HalfMatch, MatchError>;

    #[inline]
    fn next(&mut self) -> Option<Result<HalfMatch, MatchError>> {
        self.it.try_advance_half(&mut self.finder).transpose()
    }
}

impl<C: Cursor, F> core::fmt::Debug for TryHalfMatchesIter<C, F> {
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        f.debug_struct("TryHalfMatchesIter")
            .field("it", &self.it)
            .field("finder", &"")
            .finish()
    }
}

/// An iterator over all non-overlapping half matches for an infallible search.
///
/// The iterator yields a [`HalfMatch`] value until no more matches could be
/// found.
///
/// The type parameters are as follows:
///
/// * `F` represents the type of a closure that executes the search.
///
/// The cursor type parameter comes from the [`Input`] type:
///
/// * `C` is the cursor over the underlying haystack.
///
/// When possible, prefer the iterators defined on the regex engine you're
/// using. This tries to abstract over the regex engine and is thus a bit more
/// unwieldy to use.
///
/// This iterator is created by [`Searcher::into_half_matches_iter`] and
/// then calling [`TryHalfMatchesIter::infallible`].
#[derive(Debug)]
pub struct HalfMatchesIter<C: Cursor, F>(TryHalfMatchesIter<C, F>);

// impl<C: Cursor, F> HalfMatchesIter<C, F> {
//     /// Returns the current `Input` used by this iterator.
//     ///
//     /// The `Input` returned is generally equivalent to the one used to
//     /// construct this iterator, but its start position may be different to
//     /// reflect the start of the next search to be executed.
//     pub fn input(&mut self) -> &mut Input<C> {
//         self.0.it.input()
//     }
// }

// impl<C: Cursor, F> Iterator for HalfMatchesIter<C, F>
// where
//     F: FnMut(&mut Input<C>) -> Result<Option<HalfMatch>, MatchError>,
// {
//     type Item = HalfMatch;

//     #[inline]
//     fn next(&mut self) -> Option<HalfMatch> {
//         match self.0.next()? {
//             Ok(m) => Some(m),
//             Err(err) => panic!(
//                 "unexpected regex half find error: {}\n\
//                  to handle find errors, use 'try' or 'search' methods",
//                 err,
//             ),
//         }
//     }
// }

// #[cfg(test)]
// pub fn assert_eq<T: PartialEq + Debug>(
//     mut iter1: impl Iterator<Item = T>,
//     mut iter2: impl Iterator<Item = T>,
// ) {
//     let mut i = 0;
//     loop {
//         match (iter1.next(), iter2.next()) {
//             (None, None) => break,
//             (iter1, iter2) => assert_eq!(iter1, iter2, "{i}"),
//         }
//         i += 1;
//     }
// }

#[cfg(test)]
pub fn prop_assert_eq<T: PartialEq + Debug>(
    mut iter1: impl Iterator<Item = T>,
    mut iter2: impl Iterator<Item = T>,
) -> proptest::test_runner::TestCaseResult {
    let mut i = 0;
    let mut prev = None;
    loop {
        match (iter1.next(), iter2.next()) {
            (None, None) => break,
            (iter1, iter2) => {
                proptest::prop_assert_eq!(&iter1, &iter2, "i={}, prev={:?}", i, prev);
                prev = iter1;
            }
        }
        i += 1;
    }
    Ok(())
}
regex-cursor-0.1.4/src/util/prefilter.rs000064400000000000000000000070611046102023000163650ustar  00000000000000/*!
Defines a prefilter for accelerating regex searches.

A prefilter can be created by building a [`Prefilter`] value.

A prefilter represents one of the most important optimizations available for
accelerating regex searches. The idea of a prefilter is to very quickly find
candidate locations in a haystack where a regex _could_ match. Once a candidate
is found, it is then intended for the regex engine to run at that position to
determine whether the candidate is a match or a false positive.

In the aforementioned description of the prefilter optimization also lies its
demise. Namely, if a prefilter has a high false positive rate and it produces
lots of candidates, then a prefilter can overall make a regex search slower.
It can run more slowly because more time is spent ping-ponging between the
prefilter search and the regex engine attempting to confirm each candidate as
a match. This ping-ponging has overhead that adds up, and is exacerbated by
a high false positive rate.

Nevertheless, the optimization is still generally worth performing in most
cases. Particularly given just how much throughput can be improved. (It is not
uncommon for prefilter optimizations to improve throughput by one or two orders
of magnitude.)

Typically a prefilter is used to find occurrences of literal prefixes from a
regex pattern, but this isn't required. A prefilter can be used to look for
suffixes or even inner literals.

Note that as of now, prefilters throw away information about which pattern
each literal comes from. In other words, when a prefilter finds a match,
there's no way to know which pattern (or patterns) it came from. Therefore,
in order to confirm a match, you'll have to check all of the patterns by
running the full regex engine.
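
The candidate/confirm loop described above looks roughly like this (a sketch
of the idea only, not an API provided by this module):

```text
pos = start
while pos < haystack.len():
    candidate = prefilter.find(haystack, pos)    # fast literal scan
    if no candidate: report no match
    if regex matches at candidate: report that match
    pos = candidate + 1                          # false positive, keep going
```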
*/

use log::debug;
use regex_automata::MatchKind;
use regex_syntax::hir::{literal, Hir};

/// Extracts all of the prefix literals from the given HIR expressions into a
/// single `Seq`. The literals in the sequence are ordered with respect to the
/// order of the given HIR expressions and consistent with the match semantics
/// given.
///
/// The sequence returned is "optimized." That is, they may be shrunk or even
/// truncated according to heuristics with the intent of making them more
/// useful as a prefilter. (Which translates to both using faster algorithms
/// and minimizing the false positive rate.)
///
/// Note that this erases any connection between the literals and which pattern
/// (or patterns) they came from.
///
/// The match kind given must correspond to the match semantics of the regex
/// that is represented by the HIRs given. The match semantics may change the
/// literal sequence returned.
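///
/// For example (illustrative only): for the patterns `foo[0-9]+` and `bar`,
/// the returned sequence might contain an inexact `foo` and an exact `bar`,
/// depending on which optimization heuristics fire.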
pub(crate) fn prefixes<H>(kind: MatchKind, hirs: &[H]) -> literal::Seq
where
    H: core::borrow::Borrow<Hir>,
{
    let mut extractor = literal::Extractor::new();
    extractor.kind(literal::ExtractKind::Prefix);

    let mut prefixes = literal::Seq::empty();
    for hir in hirs {
        prefixes.union(&mut extractor.extract(hir.borrow()));
    }
    debug!(
        "prefixes (len={:?}, exact={:?}) extracted before optimization: {:?}",
        prefixes.len(),
        prefixes.is_exact(),
        prefixes
    );
    match kind {
        MatchKind::All => {
            prefixes.sort();
            prefixes.dedup();
        }
        MatchKind::LeftmostFirst => {
            prefixes.optimize_for_prefix_by_preference();
        }
        _ => unreachable!(),
    }
    debug!(
        "prefixes (len={:?}, exact={:?}) extracted after optimization: {:?}",
        prefixes.len(),
        prefixes.is_exact(),
        prefixes
    );
    prefixes
}
regex-cursor-0.1.4/src/util/primitives.rs000064400000000000000000000066161046102023000165710ustar  00000000000000use regex_automata::util::primitives::SmallIndex;
use regex_automata::PatternID;

#[derive(Clone, Debug)]
pub(crate) struct SmallIndexIter {
    rng: core::ops::Range<usize>,
}

impl Iterator for SmallIndexIter {
    type Item = SmallIndex;

    fn next(&mut self) -> Option<SmallIndex> {
        if self.rng.start >= self.rng.end {
            return None;
        }
        let next_id = self.rng.start + 1;
        let id = core::mem::replace(&mut self.rng.start, next_id);
        // new_unchecked is OK since we asserted that the number of
        // elements in this iterator will fit in an ID at construction.
        Some(SmallIndex::new_unchecked(id))
    }
}

macro_rules! index_type_impls {
    ($name:ident, $err:ident, $iter:ident, $withiter:ident) => {
        #[derive(Clone, Debug)]
        pub(crate) struct $iter(SmallIndexIter);

        impl $iter {
            fn new(len: usize) -> $iter {
                assert!(
                    len <= $name::LIMIT,
                    "cannot create iterator for {} when number of \
                     elements exceed {:?}",
                    stringify!($name),
                    $name::LIMIT,
                );
                $iter(SmallIndexIter { rng: 0..len })
            }
        }

        impl Iterator for $iter {
            type Item = $name;

            fn next(&mut self) -> Option<$name> {
                self.0.next().map(|id| $name::new_unchecked(id.as_usize()))
            }
        }

        /// An iterator adapter that is like std::iter::Enumerate, but attaches
        /// small index values instead. It requires `ExactSizeIterator`. At
        /// construction, it ensures that the index of each element in the
        /// iterator is representable in the corresponding small index type.
        #[derive(Clone, Debug)]
        pub(crate) struct $withiter<I> {
            it: I,
            ids: $iter,
        }

        impl<I: Iterator + ExactSizeIterator> $withiter<I> {
            fn new(it: I) -> $withiter<I> {
                let ids = $iter::new(it.len());
                $withiter { it, ids }
            }
        }

        impl<I: Iterator + ExactSizeIterator> Iterator for $withiter<I> {
            type Item = ($name, I::Item);

            fn next(&mut self) -> Option<($name, I::Item)> {
                let item = self.it.next()?;
                // Number of elements in this iterator must match, according
                // to contract of ExactSizeIterator.
                let id = self.ids.next().unwrap();
                Some((id, item))
            }
        }
    };
}

index_type_impls!(PatternID, PatternIDError, PatternIDIter, WithPatternIDIter);
// index_type_impls!(StateID, StateIDError, StateIDIter, WithStateIDIter);

/// A utility trait that defines a couple of adapters for making it convenient
/// to access indices as "small index" types. We require ExactSizeIterator so
/// that iterator construction can do a single check to make sure the index of
/// each element is representable by its small index type.
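///
/// For example (a sketch; these types are crate-internal, so this is
/// illustrative only):
///
/// ```text
/// let pairs: Vec<(PatternID, &Hir)> = hirs.iter().with_pattern_ids().collect();
/// // pairs[i].0 is the PatternID for index i
/// ```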
pub(crate) trait IteratorIndexExt: Iterator {
    fn with_pattern_ids(self) -> WithPatternIDIter<Self>
    where
        Self: Sized + ExactSizeIterator,
    {
        WithPatternIDIter::new(self)
    }

    // fn with_state_ids(self) -> WithStateIDIter<Self>
    // where
    //     Self: Sized + ExactSizeIterator,
    // {
    //     WithStateIDIter::new(self)
    // }
}

impl<I: Iterator> IteratorIndexExt for I {}
regex-cursor-0.1.4/src/util/sparse_set.rs000064400000000000000000000137661046102023000165520ustar  00000000000000/*!
This module defines a sparse set data structure. Its most interesting
properties are:

* They preserve insertion order.
* Set membership testing is done in constant time.
* Set insertion is done in constant time.
* Clearing the set is done in constant time.

The cost for doing this is that the capacity of the set needs to be known up
front, and the elements in the set are limited to state identifiers.

These sets are principally used when traversing an NFA state graph. This
happens at search time, for example, in the PikeVM. It also happens during DFA
determinization.
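
For example (a sketch; `SparseSet` is crate-internal and its capacity is fixed
at construction):

```text
let mut set = SparseSet::new(nfa_state_count);
set.insert(some_state_id);             // O(1)
set.insert(some_state_id);             // idempotent no-op
assert!(set.contains(some_state_id));  // O(1)
set.clear();                           // O(1), regardless of occupancy
```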
*/

use std::vec;
use std::vec::Vec;

use regex_automata::util::primitives::StateID;

/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: https://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
/// other properties listed above are extremely useful.
#[derive(Clone)]
pub(crate) struct SparseSet {
    /// The number of elements currently in this set.
    len: usize,
    /// Dense contains the ids in the order in which they were inserted.
    dense: Vec<StateID>,
    /// Sparse maps ids to their location in dense.
    ///
    /// A state ID is in the set if and only if
    /// sparse[id] < len && id == dense[sparse[id]].
    ///
    /// Note that these are indices into 'dense'. It's a little weird to use
    /// StateID here, but we know our length can never exceed the bounds of
    /// StateID (enforced by 'resize') and StateID will be at most 4 bytes
    /// whereas a usize is likely double that in most cases.
    sparse: Vec<StateID>,
}

impl SparseSet {
    /// Create a new sparse set with the given capacity.
    ///
    /// Sparse sets have a fixed size and they cannot grow. Attempting to
    /// insert more distinct elements than the total capacity of the set will
    /// result in a panic.
    ///
    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
    #[inline]
    pub(crate) fn new(capacity: usize) -> SparseSet {
        let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
        set.resize(capacity);
        set
    }

    /// Resizes this sparse set to have the new capacity given.
    ///
    /// This set is automatically cleared.
    ///
    /// This panics if the capacity given is bigger than `StateID::LIMIT`.
    #[inline]
    pub(crate) fn resize(&mut self, new_capacity: usize) {
        assert!(
            new_capacity <= StateID::LIMIT,
            "sparse set capacity cannot excced {:?}",
            StateID::LIMIT
        );
        self.clear();
        self.dense.resize(new_capacity, StateID::ZERO);
        self.sparse.resize(new_capacity, StateID::ZERO);
    }

    /// Returns the capacity of this set.
    ///
    /// The capacity represents a fixed limit on the number of distinct
    /// elements that are allowed in this set. The capacity cannot be changed.
    #[inline]
    pub(crate) fn capacity(&self) -> usize {
        self.dense.len()
    }

    /// Returns the number of elements in this set.
    #[inline]
    pub(crate) fn len(&self) -> usize {
        self.len
    }

    /// Returns true if and only if this set is empty.
    #[inline]
    pub(crate) fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Insert the state ID value into this set and return true if the given
    /// state ID was not previously in this set.
    ///
    /// This operation is idempotent. If the given value is already in this
    /// set, then this is a no-op.
    ///
    /// If more than `capacity` ids are inserted, then this panics.
    ///
    /// This is marked as inline(always) since the compiler won't inline it
    /// otherwise, and it's a fairly hot piece of code in DFA determinization.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub(crate) fn insert(&mut self, id: StateID) -> bool {
        if self.contains(id) {
            return false;
        }

        let i = self.len();
        assert!(
            i < self.capacity(),
            "{:?} exceeds capacity of {:?} when inserting {:?}",
            i,
            self.capacity(),
            id,
        );
        // OK since i < self.capacity() and self.capacity() is guaranteed to
        // be <= StateID::LIMIT.
        let index = StateID::new_unchecked(i);
        self.dense[index] = id;
        self.sparse[id] = index;
        self.len += 1;
        true
    }

    /// Returns true if and only if this set contains the given value.
    #[inline]
    pub(crate) fn contains(&self, id: StateID) -> bool {
        let index = self.sparse[id];
        index.as_usize() < self.len() && self.dense[index] == id
    }

    /// Clear this set such that it has no members.
    #[inline]
    pub(crate) fn clear(&mut self) {
        self.len = 0;
    }

    #[inline]
    pub(crate) fn iter(&self) -> SparseSetIter<'_> {
        SparseSetIter(self.dense[..self.len()].iter())
    }

    /// Returns the heap memory usage, in bytes, used by this sparse set.
    #[inline]
    pub(crate) fn memory_usage(&self) -> usize {
        self.dense.len() * StateID::SIZE + self.sparse.len() * StateID::SIZE
    }
}

impl core::fmt::Debug for SparseSet {
    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
        let elements: Vec<StateID> = self.iter().collect();
        f.debug_tuple("SparseSet").field(&elements).finish()
    }
}

/// An iterator over all elements in a sparse set.
///
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
#[derive(Debug)]
pub(crate) struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);

impl<'a> Iterator for SparseSetIter<'a> {
    type Item = StateID;

    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn next(&mut self) -> Option<StateID> {
        self.0.next().copied()
    }
}
regex-cursor-0.1.4/src/util/tests.rs000064400000000000000000000043111046102023000155260ustar  00000000000000use crate::util::{decode, decode_last};
use crate::Input;
use proptest::{prop_assert_eq, proptest};
use std::iter::successors;

proptest! {
    #[test]
    fn test_decode(haystack: String) {
        let foo = ropey::Rope::from_str(&haystack);
        let mut input = Input::new(foo.slice(..));
        let first_char = decode(&mut input, 0).transpose().unwrap();
        let res: Vec<_> = successors(first_char.map(|c| (0, c)), |(i, c)| {
            decode(&mut input, i + c.len_utf8())
                .transpose()
                .unwrap()
                .map(|c2| (i + c.len_utf8(), c2))
        })
        .collect();
        let ref_chars: Vec<_> = haystack.char_indices().collect();
        prop_assert_eq!(res, ref_chars);

        // let last_char = decode_last(&[], &mut input, 0).transpose().unwrap();
        // let chars_rev = std::iter::successors(first_char.map(|c| (0, c)), |(i, c)| {
        //     decode(&mut input, i + c.len_utf8())
        //         .transpose()
        //         .unwrap()
        //         .map(|c2| (i + c.len_utf8(), c2))
        // });
    }
    #[test]
    fn test_decode_last(haystack: String) {
        let foo = ropey::Rope::from_str(&haystack);
        let mut input = Input::new(foo.slice(..));
        let end = haystack.len();
        input.move_to(end);
        let first_char = decode_last(haystack[..input.haystack_off()].as_bytes(), &mut input, end).transpose().unwrap();
        let res: Vec<_> = successors(first_char.map(|c| (end - c.len_utf8(), c)), |&(i, _)| {
            input.move_to(i);
            decode_last(haystack[..input.haystack_off()].as_bytes(), &mut input, i)
                .transpose()
                .unwrap()
                .map(|c2| (i - c2.len_utf8(), c2))
        })
        .collect();
        let ref_chars: Vec<_> = haystack.char_indices().rev().collect();
        prop_assert_eq!(res, ref_chars);

        // let last_char = decode_last(&[], &mut input, 0).transpose().unwrap();
        // let chars_rev = std::iter::successors(first_char.map(|c| (0, c)), |(i, c)| {
        //     decode(&mut input, i + c.len_utf8())
        //         .transpose()
        //         .unwrap()
        //         .map(|c2| (i + c.len_utf8(), c2))
        // });
    }
}
regex-cursor-0.1.4/src/util/utf8.rs000064400000000000000000000022641046102023000152570ustar  00000000000000/*!
Utilities for dealing with UTF-8.

This module provides some UTF-8 related helper routines, including an
incremental decoder.
*/

/// Returns true if and only if the given offset in the given bytes falls on a
/// valid UTF-8 encoded codepoint boundary.
///
/// If `bytes` is not valid UTF-8, then the behavior of this routine is
/// unspecified.
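///
/// For example, for `☃` (encoded as `\xE2\x98\x83`), offsets `0` and `3` are
/// boundaries, while offsets `1` and `2` are not.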
#[cfg_attr(feature = "perf-inline", inline(always))]
pub(crate) fn is_boundary(bytes: &[u8], i: usize) -> bool {
    match bytes.get(i) {
        // The position at the end of the bytes always represents an empty
        // string, which is a valid boundary. But anything after that doesn't
        // make much sense to call a valid boundary.
        None => i == bytes.len(),
        // Other than ASCII (where the most significant bit is never set),
        // valid starting bytes always have their most significant two bits
        // set, whereas continuation bytes never have their second most
        // significant bit set. Therefore, this only returns true when bytes[i]
        // corresponds to a byte that begins a valid UTF-8 encoding of a
        // Unicode scalar value.
        Some(&b) => b <= 0b0111_1111 || b >= 0b1100_0000,
    }
}
regex-cursor-0.1.4/src/util.rs000064400000000000000000000002141046102023000143620ustar  00000000000000pub(crate) mod empty;
pub mod iter;
pub mod prefilter;
pub mod primitives;
pub mod sparse_set;
pub mod utf8;

// #[cfg(test)]
// mod tests;
regex-cursor-0.1.4/test_cases/syntax.rs000064400000000000000000003114241046102023000163110ustar  00000000000000use crate::{
    auto_pairs::AutoPairs,
    chars::char_is_line_ending,
    diagnostic::Severity,
    regex::Regex,
    transaction::{ChangeSet, Operation},
    Rope, RopeSlice, Tendril,
};

use ahash::RandomState;
use arc_swap::{ArcSwap, Guard};
use bitflags::bitflags;
use hashbrown::raw::RawTable;
use slotmap::{DefaultKey as LayerId, HopSlotMap};

use std::{
    borrow::Cow,
    cell::RefCell,
    collections::{HashMap, HashSet, VecDeque},
    fmt::{self, Display},
    hash::{Hash, Hasher},
    mem::{replace, transmute},
    path::{Path, PathBuf},
    str::FromStr,
    sync::Arc,
};

use once_cell::sync::{Lazy, OnceCell};
use serde::{ser::SerializeSeq, Deserialize, Serialize};

use helix_loader::grammar::{get_language, load_runtime_file};

fn deserialize_regex<'de, D>(deserializer: D) -> Result<Option<Regex>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    Option::<String>::deserialize(deserializer)?
        .map(|buf| Regex::new(&buf).map_err(serde::de::Error::custom))
        .transpose()
}

fn deserialize_lsp_config<'de, D>(deserializer: D) -> Result<Option<serde_json::Value>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    Option::<toml::Value>::deserialize(deserializer)?
        .map(|toml| toml.try_into().map_err(serde::de::Error::custom))
        .transpose()
}

fn deserialize_tab_width<'de, D>(deserializer: D) -> Result<usize, D::Error>
where
    D: serde::Deserializer<'de>,
{
    usize::deserialize(deserializer).and_then(|n| {
        if n > 0 && n <= 16 {
            Ok(n)
        } else {
            Err(serde::de::Error::custom(
                "tab width must be a value from 1 to 16 inclusive",
            ))
        }
    })
}

pub fn deserialize_auto_pairs<'de, D>(deserializer: D) -> Result<Option<AutoPairs>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    Ok(Option::<AutoPairConfig>::deserialize(deserializer)?.and_then(AutoPairConfig::into))
}

fn default_timeout() -> u64 {
    20
}

#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct Configuration {
    pub language: Vec<LanguageConfiguration>,
    #[serde(default)]
    pub language_server: HashMap<String, LanguageServerConfiguration>,
}

impl Default for Configuration {
    fn default() -> Self {
        crate::config::default_syntax_loader()
    }
}

// largely based on tree-sitter/cli/src/loader.rs
#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case", deny_unknown_fields)]
pub struct LanguageConfiguration {
    #[serde(rename = "name")]
    pub language_id: String, // c-sharp, rust, tsx
    #[serde(rename = "language-id")]
    // see the table under https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/#textDocumentItem
    pub language_server_language_id: Option<String>, // csharp, rust, typescriptreact, for the language-server
    pub scope: String,             // source.rust
    pub file_types: Vec<FileType>, // filename extension or ends_with? <Gemfile, rb, etc>
    #[serde(default)]
    pub shebangs: Vec<String>, // interpreter(s) associated with language
    pub roots: Vec<String>,        // these indicate project roots <.git, Cargo.toml>
    pub comment_token: Option<String>,
    pub text_width: Option<usize>,
    pub soft_wrap: Option<SoftWrap>,

    #[serde(default)]
    pub auto_format: bool,

    #[serde(skip_serializing_if = "Option::is_none")]
    pub formatter: Option<FormatterConfiguration>,

    #[serde(default)]
    pub diagnostic_severity: Severity,

    pub grammar: Option<String>, // tree-sitter grammar name, defaults to language_id

    // content_regex
    #[serde(default, skip_serializing, deserialize_with = "deserialize_regex")]
    pub injection_regex: Option<Regex>,
    // first_line_regex
    //
    #[serde(skip)]
    pub(crate) highlight_config: OnceCell<Option<Arc<HighlightConfiguration>>>,
    // tags_config OnceCell<> https://github.com/tree-sitter/tree-sitter/pull/583
    #[serde(
        default,
        skip_serializing_if = "Vec::is_empty",
        serialize_with = "serialize_lang_features",
        deserialize_with = "deserialize_lang_features"
    )]
    pub language_servers: Vec<LanguageServerFeatures>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub indent: Option<IndentationConfiguration>,

    #[serde(skip)]
    pub(crate) indent_query: OnceCell<Option<Query>>,
    #[serde(skip)]
    pub(crate) textobject_query: OnceCell<Option<TextObjectQuery>>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub debugger: Option<DebugAdapterConfig>,

    /// Automatic insertion of pairs to parentheses, brackets,
    /// etc. Defaults to true. Optionally, this can be a list of 2-tuples
    /// to specify a list of characters to pair. This overrides the
    /// global setting.
    #[serde(default, skip_serializing, deserialize_with = "deserialize_auto_pairs")]
    pub auto_pairs: Option<AutoPairConfig>,

    pub rulers: Option<Vec<u16>>, // if set, override editor's rulers

    /// Hardcoded LSP root directories relative to the workspace root, like `examples` or `tools/fuzz`.
    /// Falls back to the current working directory if none are configured.
    pub workspace_lsp_roots: Option<Vec<PathBuf>>,
}

#[derive(Debug, PartialEq, Eq, Hash)]
pub enum FileType {
    /// The extension of the file, either the `Path::extension` or the full
    /// filename if the file does not have an extension.
    Extension(String),
    /// The suffix of a file. This is compared to a given file's absolute
    /// path, so it can be used to detect files based on their directories.
    Suffix(String),
}

impl Serialize for FileType {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        use serde::ser::SerializeMap;

        match self {
            FileType::Extension(extension) => serializer.serialize_str(extension),
            FileType::Suffix(suffix) => {
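                // Serialize with `/` as the separator so the emitted config is
                // platform-independent; e.g. (illustrative) a Windows suffix
                // `src\query` round-trips as `{ "suffix": "src/query" }`.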
                let mut map = serializer.serialize_map(Some(1))?;
                map.serialize_entry("suffix", &suffix.replace(std::path::MAIN_SEPARATOR, "/"))?;
                map.end()
            }
        }
    }
}

impl<'de> Deserialize<'de> for FileType {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::de::Deserializer<'de>,
    {
        struct FileTypeVisitor;

        impl<'de> serde::de::Visitor<'de> for FileTypeVisitor {
            type Value = FileType;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                formatter.write_str("string or table")
            }

            fn visit_str<E>(self, value: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                Ok(FileType::Extension(value.to_string()))
            }

            fn visit_map<M>(self, mut map: M) -> Result<Self::Value, M::Error>
            where
                M: serde::de::MapAccess<'de>,
            {
                match map.next_entry::<String, String>()? {
                    Some((key, suffix)) if key == "suffix" => Ok(FileType::Suffix({
                        // FIXME: use `suffix.replace('/', std::path::MAIN_SEPARATOR_STR)`
                        //        if MSRV is updated to 1.68
                        let mut separator = [0; 1];
                        suffix.replace('/', std::path::MAIN_SEPARATOR.encode_utf8(&mut separator))
                    })),
                    Some((key, _value)) => Err(serde::de::Error::custom(format!(
                        "unknown key in `file-types` list: {}",
                        key
                    ))),
                    None => Err(serde::de::Error::custom(
                        "expected a `suffix` key in the `file-types` entry",
                    )),
                }
            }
        }

        deserializer.deserialize_any(FileTypeVisitor)
    }
}

#[derive(Clone, Copy, Debug, Serialize, Deserialize, PartialEq, Eq, Hash)]
#[serde(rename_all = "kebab-case")]
pub enum LanguageServerFeature {
    Format,
    GotoDeclaration,
    GotoDefinition,
    GotoTypeDefinition,
    GotoReference,
    GotoImplementation,
    // Goto, use bitflags, combining previous Goto members?
    SignatureHelp,
    Hover,
    DocumentHighlight,
    Completion,
    CodeAction,
    WorkspaceCommand,
    DocumentSymbols,
    WorkspaceSymbols,
    // Symbols, use bitflags, see above?
    Diagnostics,
    RenameSymbol,
    InlayHints,
}

impl Display for LanguageServerFeature {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        use LanguageServerFeature::*;
        let feature = match self {
            Format => "format",
            GotoDeclaration => "goto-declaration",
            GotoDefinition => "goto-definition",
            GotoTypeDefinition => "goto-type-definition",
            GotoReference => "goto-type-definition",
            GotoImplementation => "goto-implementation",
            SignatureHelp => "signature-help",
            Hover => "hover",
            DocumentHighlight => "document-highlight",
            Completion => "completion",
            CodeAction => "code-action",
            WorkspaceCommand => "workspace-command",
            DocumentSymbols => "document-symbols",
            WorkspaceSymbols => "workspace-symbols",
            Diagnostics => "diagnostics",
            RenameSymbol => "rename-symbol",
            InlayHints => "inlay-hints",
        };
        write!(f, "{feature}",)
    }
}

#[derive(Debug, Serialize, Deserialize)]
#[serde(untagged, rename_all = "kebab-case", deny_unknown_fields)]
enum LanguageServerFeatureConfiguration {
    #[serde(rename_all = "kebab-case")]
    Features {
        #[serde(default, skip_serializing_if = "HashSet::is_empty")]
        only_features: HashSet<LanguageServerFeature>,
        #[serde(default, skip_serializing_if = "HashSet::is_empty")]
        except_features: HashSet<LanguageServerFeature>,
        name: String,
    },
    Simple(String),
}

#[derive(Debug, Default)]
pub struct LanguageServerFeatures {
    pub name: String,
    pub only: HashSet<LanguageServerFeature>,
    pub excluded: HashSet<LanguageServerFeature>,
}

impl LanguageServerFeatures {
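    /// A feature is enabled when it is not explicitly excluded and either no
    /// allow-list (`only`) is configured or the allow-list contains it.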
    pub fn has_feature(&self, feature: LanguageServerFeature) -> bool {
        (self.only.is_empty() || self.only.contains(&feature)) && !self.excluded.contains(&feature)
    }
}

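// Accepts both shapes of the untagged enum above, e.g. (illustrative
// languages.toml snippet; server names assumed):
//
//     language-servers = [
//         "rust-analyzer",
//         { name = "efm", except-features = ["format"] },
//     ]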
fn deserialize_lang_features<'de, D>(
    deserializer: D,
) -> Result<Vec<LanguageServerFeatures>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    let raw: Vec<LanguageServerFeatureConfiguration> = Deserialize::deserialize(deserializer)?;
    let res = raw
        .into_iter()
        .map(|config| match config {
            LanguageServerFeatureConfiguration::Simple(name) => LanguageServerFeatures {
                name,
                ..Default::default()
            },
            LanguageServerFeatureConfiguration::Features {
                only_features,
                except_features,
                name,
            } => LanguageServerFeatures {
                name,
                only: only_features,
                excluded: except_features,
            },
        })
        .collect();
    Ok(res)
}
fn serialize_lang_features<S>(
    map: &Vec<LanguageServerFeatures>,
    serializer: S,
) -> Result<S::Ok, S::Error>
where
    S: serde::Serializer,
{
    let mut serializer = serializer.serialize_seq(Some(map.len()))?;
    for features in map {
        let features = if features.only.is_empty() && features.excluded.is_empty() {
            LanguageServerFeatureConfiguration::Simple(features.name.to_owned())
        } else {
            LanguageServerFeatureConfiguration::Features {
                only_features: features.only.clone(),
                except_features: features.excluded.clone(),
                name: features.name.to_owned(),
            }
        };
        serializer.serialize_element(&features)?;
    }
    serializer.end()
}

#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct LanguageServerConfiguration {
    pub command: String,
    #[serde(default)]
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub args: Vec<String>,
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub environment: HashMap<String, String>,
    #[serde(default, skip_serializing, deserialize_with = "deserialize_lsp_config")]
    pub config: Option<serde_json::Value>,
    #[serde(default = "default_timeout")]
    pub timeout: u64,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct FormatterConfiguration {
    pub command: String,
    #[serde(default)]
    #[serde(skip_serializing_if = "Vec::is_empty")]
    pub args: Vec<String>,
}

#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub struct AdvancedCompletion {
    pub name: Option<String>,
    pub completion: Option<String>,
    pub default: Option<String>,
}

#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case", untagged)]
pub enum DebugConfigCompletion {
    Named(String),
    Advanced(AdvancedCompletion),
}

#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[serde(untagged)]
pub enum DebugArgumentValue {
    String(String),
    Array(Vec<String>),
    Boolean(bool),
}

#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub struct DebugTemplate {
    pub name: String,
    pub request: String,
    pub completion: Vec<DebugConfigCompletion>,
    pub args: HashMap<String, DebugArgumentValue>,
}

#[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
#[serde(rename_all = "kebab-case")]
pub struct DebugAdapterConfig {
    pub name: String,
    pub transport: String,
    #[serde(default)]
    pub command: String,
    #[serde(default)]
    pub args: Vec<String>,
    pub port_arg: Option<String>,
    pub templates: Vec<DebugTemplate>,
    #[serde(default)]
    pub quirks: DebuggerQuirks,
}

// Different workarounds for adapters' differences
#[derive(Debug, Default, PartialEq, Eq, Clone, Serialize, Deserialize)]
pub struct DebuggerQuirks {
    #[serde(default)]
    pub absolute_paths: bool,
}

#[derive(Debug, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case")]
pub struct IndentationConfiguration {
    #[serde(deserialize_with = "deserialize_tab_width")]
    pub tab_width: usize,
    pub unit: String,
}

/// Configuration for auto pairs
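///
/// Because this enum is untagged, both config shapes below deserialize into it
/// (illustrative TOML, not taken from this crate):
///
/// ```text
/// auto-pairs = false # AutoPairConfig::Enable(false)
///
/// [auto-pairs]       # AutoPairConfig::Pairs
/// '(' = ')'
/// ```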
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "kebab-case", deny_unknown_fields, untagged)]
pub enum AutoPairConfig {
    /// Enables or disables auto pairing. False means disabled. True means to use the default pairs.
    Enable(bool),

    /// The mappings of pairs.
    Pairs(HashMap<char, char>),
}

impl Default for AutoPairConfig {
    fn default() -> Self {
        AutoPairConfig::Enable(true)
    }
}

impl From<&AutoPairConfig> for Option<AutoPairs> {
    fn from(auto_pair_config: &AutoPairConfig) -> Self {
        match auto_pair_config {
            AutoPairConfig::Enable(false) => None,
            AutoPairConfig::Enable(true) => Some(AutoPairs::default()),
            AutoPairConfig::Pairs(pairs) => Some(AutoPairs::new(pairs.iter())),
        }
    }
}

impl From<AutoPairConfig> for Option<AutoPairs> {
    fn from(auto_pairs_config: AutoPairConfig) -> Self {
        (&auto_pairs_config).into()
    }
}

impl FromStr for AutoPairConfig {
    type Err = std::str::ParseBoolError;

    // only do bool parsing for runtime setting
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let enable: bool = s.parse()?;
        Ok(AutoPairConfig::Enable(enable))
    }
}

#[derive(Debug)]
pub struct TextObjectQuery {
    pub query: Query,
}

#[derive(Debug)]
pub enum CapturedNode<'a> {
    Single(Node<'a>),
    /// Guaranteed to be not empty
    Grouped(Vec<Node<'a>>),
}

impl<'a> CapturedNode<'a> {
    pub fn start_byte(&self) -> usize {
        match self {
            Self::Single(n) => n.start_byte(),
            Self::Grouped(ns) => ns[0].start_byte(),
        }
    }

    pub fn end_byte(&self) -> usize {
        match self {
            Self::Single(n) => n.end_byte(),
            Self::Grouped(ns) => ns.last().unwrap().end_byte(),
        }
    }

    pub fn byte_range(&self) -> std::ops::Range<usize> {
        self.start_byte()..self.end_byte()
    }
}

/// The maximum number of in-progress matches a TS cursor can consider at once.
/// This is set to a constant in order to avoid performance problems for medium to large files. Set with `set_match_limit`.
/// Using such a limit means that we lose valid captures, so there is fundamentally a tradeoff here.
///
/// Old tree-sitter versions used a limit of 32 by default until this limit was removed in version `0.19.5` (it must now be set manually).
/// However, removing the limit causes performance issues for medium to large files.
/// In helix, this problem caused treesitter motions to take multiple seconds to complete in medium-sized rust files (3k loc).
///
/// Neovim also encountered this problem and reintroduced this limit after it was removed upstream
/// (see the corresponding Neovim issue and pull request).
/// The number used here is fundamentally a tradeoff between breaking some obscure edge cases and performance.
///
/// Neovim chose 64 for this value somewhat arbitrarily.
/// 64 is too low for some languages though. In particular, it breaks some highlighting for record fields in Erlang record definitions.
/// This number can be increased if new syntax highlight breakages are found, as long as the performance penalty is not too high.
const TREE_SITTER_MATCH_LIMIT: u32 = 256;

impl TextObjectQuery {
    /// Run the query on the given node and return sub nodes which match given
    /// capture ("function.inside", "class.around", etc).
    ///
    /// Captures may contain multiple nodes by using quantifiers (+, *, etc),
    /// and support for this is partial and could use improvement.
    ///
    /// ```query
    /// (comment)+ @capture
    ///
    /// ; OR
    /// (
    ///   (comment)*
    ///   .
    ///   (function)
    /// ) @capture
    /// ```
    pub fn capture_nodes<'a>(
        &'a self,
        capture_name: &str,
        node: Node<'a>,
        slice: RopeSlice<'a>,
        cursor: &'a mut QueryCursor,
    ) -> Option<impl Iterator<Item = CapturedNode<'a>>> {
        self.capture_nodes_any(&[capture_name], node, slice, cursor)
    }

    /// Find the first capture that exists out of all given `capture_names`
    /// and return sub nodes that match this capture.
    pub fn capture_nodes_any<'a>(
        &'a self,
        capture_names: &[&str],
        node: Node<'a>,
        slice: RopeSlice<'a>,
        cursor: &'a mut QueryCursor,
    ) -> Option<impl Iterator<Item = CapturedNode<'a>>> {
        let capture_idx = capture_names
            .iter()
            .find_map(|cap| self.query.capture_index_for_name(cap))?;

        cursor.set_match_limit(TREE_SITTER_MATCH_LIMIT);

        let nodes = cursor
            .captures(&self.query, node, RopeProvider(slice))
            .filter_map(move |(mat, _)| {
                let nodes: Vec<_> = mat
                    .captures
                    .iter()
                    .filter_map(|cap| (cap.index == capture_idx).then_some(cap.node))
                    .collect();

                if nodes.len() > 1 {
                    Some(CapturedNode::Grouped(nodes))
                } else {
                    nodes.into_iter().map(CapturedNode::Single).next()
                }
            });

        Some(nodes)
    }
}

pub fn read_query(language: &str, filename: &str) -> String {
    static INHERITS_REGEX: Lazy<Regex> =
        Lazy::new(|| Regex::new(r";+\s*inherits\s*:?\s*([a-z_,()-]+)\s*").unwrap());

    let query = load_runtime_file(language, filename).unwrap_or_default();

    // replaces all "; inherits (,)*" with the queries of the given language(s)
    INHERITS_REGEX
        .replace_all(&query, |captures: &regex::Captures| {
            captures[1]
                .split(',')
                .map(|language| format!("\n{}\n", read_query(language, filename)))
                .collect::<String>()
        })
        .to_string()
}

impl LanguageConfiguration {
    fn initialize_highlight(&self, scopes: &[String]) -> Option<Arc<HighlightConfiguration>> {
        let highlights_query = read_query(&self.language_id, "highlights.scm");
        // always highlight syntax errors
        // highlights_query += "\n(ERROR) @error";

        let injections_query = read_query(&self.language_id, "injections.scm");
        let locals_query = read_query(&self.language_id, "locals.scm");

        if highlights_query.is_empty() {
            None
        } else {
            let language = get_language(self.grammar.as_deref().unwrap_or(&self.language_id))
                .map_err(|err| {
                    log::error!(
                        "Failed to load tree-sitter parser for language {:?}: {}",
                        self.language_id,
                        err
                    )
                })
                .ok()?;
            let config = HighlightConfiguration::new(
                language,
                &highlights_query,
                &injections_query,
                &locals_query,
            )
            .map_err(|err| log::error!("Could not parse queries for language {:?}. Are your grammars out of sync? Try running 'hx --grammar fetch' and 'hx --grammar build'. This query could not be parsed: {:?}", self.language_id, err))
            .ok()?;

            config.configure(scopes);
            Some(Arc::new(config))
        }
    }

    pub fn reconfigure(&self, scopes: &[String]) {
        if let Some(Some(config)) = self.highlight_config.get() {
            config.configure(scopes);
        }
    }

    pub fn highlight_config(&self, scopes: &[String]) -> Option<Arc<HighlightConfiguration>> {
        self.highlight_config
            .get_or_init(|| self.initialize_highlight(scopes))
            .clone()
    }

    pub fn is_highlight_initialized(&self) -> bool {
        self.highlight_config.get().is_some()
    }

    pub fn indent_query(&self) -> Option<&Query> {
        self.indent_query
            .get_or_init(|| self.load_query("indents.scm"))
            .as_ref()
    }

    pub fn textobject_query(&self) -> Option<&TextObjectQuery> {
        self.textobject_query
            .get_or_init(|| {
                self.load_query("textobjects.scm")
                    .map(|query| TextObjectQuery { query })
            })
            .as_ref()
    }

    pub fn scope(&self) -> &str {
        &self.scope
    }

    fn load_query(&self, kind: &str) -> Option<Query> {
        let query_text = read_query(&self.language_id, kind);
        if query_text.is_empty() {
            return None;
        }
        let lang = self.highlight_config.get()?.as_ref()?.language;
        Query::new(lang, &query_text)
            .map_err(|e| {
                log::error!(
                    "Failed to parse {} queries for {}: {}",
                    kind,
                    self.language_id,
                    e
                )
            })
            .ok()
    }
}
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
#[serde(default, rename_all = "kebab-case", deny_unknown_fields)]
pub struct SoftWrap {
    /// Soft wrap lines that exceed viewport width. Defaults to off.
    // NOTE: Option on purpose because the struct is shared between language config and global config.
    // By default the option is None so that the language config falls back to the global config unless explicitly set.
    pub enable: Option<bool>,
    /// Maximum space left free at the end of the line.
    /// This space is used to wrap text at word boundaries. If that is not possible within this limit
    /// the word is simply split at the end of the line.
    ///
    /// This is automatically hard-limited to a quarter of the viewport to ensure correct display on small views.
    ///
    /// Defaults to 20.
    pub max_wrap: Option<u16>,
    /// Maximum amount of indentation that can be carried over from the previous line when softwrapping.
    /// If a line is indented further than this limit it is rendered at the start of the viewport instead.
    ///
    /// This is automatically hard-limited to a quarter of the viewport to ensure correct display on small views.
    ///
    /// Defaults to 40.
    pub max_indent_retain: Option<u16>,
    /// Indicator placed at the beginning of softwrapped lines
    ///
    /// Defaults to ↪
    pub wrap_indicator: Option<String>,
    /// Softwrap at `text_width` instead of viewport width if it is shorter.
    pub wrap_at_text_width: Option<bool>,
}

// Expose loader as Lazy<> global since it's always static?

#[derive(Debug)]
pub struct Loader {
    // highlight_names ?
    language_configs: Vec<Arc<LanguageConfiguration>>,
    language_config_ids_by_extension: HashMap<String, usize>, // Vec<LanguageId>
    language_config_ids_by_suffix: HashMap<String, usize>,
    language_config_ids_by_shebang: HashMap<String, usize>,

    language_server_configs: HashMap<String, LanguageServerConfiguration>,

    scopes: ArcSwap<Vec<String>>,
}

impl Loader {
    pub fn new(config: Configuration) -> Self {
        let mut loader = Self {
            language_configs: Vec::new(),
            language_server_configs: config.language_server,
            language_config_ids_by_extension: HashMap::new(),
            language_config_ids_by_suffix: HashMap::new(),
            language_config_ids_by_shebang: HashMap::new(),
            scopes: ArcSwap::from_pointee(Vec::new()),
        };

        for config in config.language {
            // get the next id
            let language_id = loader.language_configs.len();

            for file_type in &config.file_types {
                // entry().or_insert(Vec::new).push(language_id);
                match file_type {
                    FileType::Extension(extension) => loader
                        .language_config_ids_by_extension
                        .insert(extension.clone(), language_id),
                    FileType::Suffix(suffix) => loader
                        .language_config_ids_by_suffix
                        .insert(suffix.clone(), language_id),
                };
            }
            for shebang in &config.shebangs {
                loader
                    .language_config_ids_by_shebang
                    .insert(shebang.clone(), language_id);
            }

            loader.language_configs.push(Arc::new(config));
        }

        loader
    }

    pub fn language_config_for_file_name(&self, path: &Path) -> Option<Arc<LanguageConfiguration>> {
        // Find all the language configurations that match this file name
        // or a suffix of the file name.
        let configuration_id = path
            .file_name()
            .and_then(|n| n.to_str())
            .and_then(|file_name| self.language_config_ids_by_extension.get(file_name))
            .or_else(|| {
                path.extension()
                    .and_then(|extension| extension.to_str())
                    .and_then(|extension| self.language_config_ids_by_extension.get(extension))
            })
            .or_else(|| {
                self.language_config_ids_by_suffix
                    .iter()
                    .find_map(|(file_type, id)| {
                        if path.to_str()?.ends_with(file_type) {
                            Some(id)
                        } else {
                            None
                        }
                    })
            });

        configuration_id.and_then(|&id| self.language_configs.get(id).cloned())

        // TODO: content_regex handling conflict resolution
    }

    pub fn language_config_for_shebang(&self, source: &Rope) -> Option<Arc<LanguageConfiguration>> {
        let line = Cow::from(source.line(0));
        static SHEBANG_REGEX: Lazy<Regex> =
            Lazy::new(|| Regex::new(&["^", SHEBANG].concat()).unwrap());
        let configuration_id = SHEBANG_REGEX
            .captures(&line)
            .and_then(|cap| self.language_config_ids_by_shebang.get(&cap[1]));

        configuration_id.and_then(|&id| self.language_configs.get(id).cloned())
    }

    pub fn language_config_for_scope(&self, scope: &str) -> Option<Arc<LanguageConfiguration>> {
        self.language_configs
            .iter()
            .find(|config| config.scope == scope)
            .cloned()
    }

    pub fn language_config_for_language_id(&self, id: &str) -> Option<Arc<LanguageConfiguration>> {
        self.language_configs
            .iter()
            .find(|config| config.language_id == id)
            .cloned()
    }

    /// Unlike language_config_for_language_id, which only returns Some for an exact id, this
    /// function will perform a regex match on the given string to find the closest language match.
    pub fn language_config_for_name(&self, name: &str) -> Option<Arc<LanguageConfiguration>> {
        let mut best_match_length = 0;
        let mut best_match_position = None;
        for (i, configuration) in self.language_configs.iter().enumerate() {
            if let Some(injection_regex) = &configuration.injection_regex {
                if let Some(mat) = injection_regex.find(name) {
                    let length = mat.end() - mat.start();
                    if length > best_match_length {
                        best_match_position = Some(i);
                        best_match_length = length;
                    }
                }
            }
        }

        best_match_position.map(|i| self.language_configs[i].clone())
    }

    pub fn language_configuration_for_injection_string(
        &self,
        capture: &InjectionLanguageMarker,
    ) -> Option<Arc<LanguageConfiguration>> {
        match capture {
            InjectionLanguageMarker::Name(string) => self.language_config_for_name(string),
            InjectionLanguageMarker::Filename(file) => self.language_config_for_file_name(file),
            InjectionLanguageMarker::Shebang(shebang) => {
                self.language_config_for_language_id(shebang)
            }
        }
    }

    pub fn language_configs(&self) -> impl Iterator<Item = &Arc<LanguageConfiguration>> {
        self.language_configs.iter()
    }

    pub fn language_server_configs(&self) -> &HashMap<String, LanguageServerConfiguration> {
        &self.language_server_configs
    }

    pub fn set_scopes(&self, scopes: Vec<String>) {
        self.scopes.store(Arc::new(scopes));

        // Reconfigure existing grammars
        for config in self
            .language_configs
            .iter()
            .filter(|cfg| cfg.is_highlight_initialized())
        {
            config.reconfigure(&self.scopes());
        }
    }

    pub fn scopes(&self) -> Guard<Arc<Vec<String>>> {
        self.scopes.load()
    }
}

pub struct TsParser {
    parser: tree_sitter::Parser,
    pub cursors: Vec<QueryCursor>,
}

// could also just use a pool, or a single instance?
thread_local! {
    pub static PARSER: RefCell<TsParser> = RefCell::new(TsParser {
        parser: Parser::new(),
        cursors: Vec::new(),
    })
}

#[derive(Debug)]
pub struct Syntax {
    layers: HopSlotMap<LayerId, LanguageLayer>,
    root: LayerId,
    loader: Arc<Loader>,
}

fn byte_range_to_str(range: std::ops::Range<usize>, source: RopeSlice) -> Cow<str> {
    Cow::from(source.byte_slice(range))
}

impl Syntax {
    pub fn new(
        source: &Rope,
        config: Arc<HighlightConfiguration>,
        loader: Arc<Loader>,
    ) -> Option<Self> {
        let root_layer = LanguageLayer {
            tree: None,
            config,
            depth: 0,
            flags: LayerUpdateFlags::empty(),
            ranges: vec![Range {
                start_byte: 0,
                end_byte: usize::MAX,
                start_point: Point::new(0, 0),
                end_point: Point::new(usize::MAX, usize::MAX),
            }],
        };

        // track scope_descriptor: a Vec of scopes for item in tree

        let mut layers = HopSlotMap::default();
        let root = layers.insert(root_layer);

        let mut syntax = Self {
            root,
            layers,
            loader,
        };

        let res = syntax.update(source, source, &ChangeSet::new(source));

        if res.is_err() {
            log::error!("TS parser failed, disabeling TS for the current buffer: {res:?}");
            return None;
        }
        Some(syntax)
    }

    pub fn update(
        &mut self,
        old_source: &Rope,
        source: &Rope,
        changeset: &ChangeSet,
    ) -> Result<(), Error> {
        let mut queue = VecDeque::new();
        queue.push_back(self.root);

        let scopes = self.loader.scopes.load();
        let injection_callback = |language: &InjectionLanguageMarker| {
            self.loader
                .language_configuration_for_injection_string(language)
                .and_then(|language_config| language_config.highlight_config(&scopes))
        };

        // Convert the changeset into tree sitter edits.
        let edits = generate_edits(old_source, changeset);

        // This table allows inverse indexing of `layers`.
        // That is by hashing a `Layer` you can find
        // the `LayerId` of an existing equivalent `Layer` in `layers`.
        //
        // It is used to determine if a new layer exists for an injection
        // or if an existing layer needs to be updated.
        let mut layers_table = RawTable::with_capacity(self.layers.len());
        let layers_hasher = RandomState::new();
        // Use the edits to update all layers markers
        fn point_add(a: Point, b: Point) -> Point {
            if b.row > 0 {
                Point::new(a.row.saturating_add(b.row), b.column)
            } else {
                Point::new(0, a.column.saturating_add(b.column))
            }
        }
        fn point_sub(a: Point, b: Point) -> Point {
            if a.row > b.row {
                Point::new(a.row.saturating_sub(b.row), a.column)
            } else {
                Point::new(0, a.column.saturating_sub(b.column))
            }
        }

        for (layer_id, layer) in self.layers.iter_mut() {
            // The root layer always covers the whole range (0..usize::MAX)
            if layer.depth == 0 {
                layer.flags = LayerUpdateFlags::MODIFIED;
                continue;
            }

            if !edits.is_empty() {
                for range in &mut layer.ranges {
                    // Roughly based on https://github.com/tree-sitter/tree-sitter/blob/ddeaa0c7f534268b35b4f6cb39b52df082754413/lib/src/subtree.c#L691-L720
                    for edit in edits.iter().rev() {
                        let is_pure_insertion = edit.old_end_byte == edit.start_byte;

                        // if edit is after range, skip
                        if edit.start_byte > range.end_byte {
                            // TODO: || (is_noop && edit.start_byte == range.end_byte)
                            continue;
                        }

                        // if edit is before range, shift entire range by len
                        if edit.old_end_byte < range.start_byte {
                            range.start_byte =
                                edit.new_end_byte + (range.start_byte - edit.old_end_byte);
                            range.start_point = point_add(
                                edit.new_end_position,
                                point_sub(range.start_point, edit.old_end_position),
                            );

                            range.end_byte = edit
                                .new_end_byte
                                .saturating_add(range.end_byte - edit.old_end_byte);
                            range.end_point = point_add(
                                edit.new_end_position,
                                point_sub(range.end_point, edit.old_end_position),
                            );

                            layer.flags |= LayerUpdateFlags::MOVED;
                        }
                        // if the edit starts in the space before and extends into the range
                        else if edit.start_byte < range.start_byte {
                            range.start_byte = edit.new_end_byte;
                            range.start_point = edit.new_end_position;

                            range.end_byte = range
                                .end_byte
                                .saturating_sub(edit.old_end_byte)
                                .saturating_add(edit.new_end_byte);
                            range.end_point = point_add(
                                edit.new_end_position,
                                point_sub(range.end_point, edit.old_end_position),
                            );
                            layer.flags = LayerUpdateFlags::MODIFIED;
                        }
                        // If the edit is an insertion at the start of the tree, shift
                        else if edit.start_byte == range.start_byte && is_pure_insertion {
                            range.start_byte = edit.new_end_byte;
                            range.start_point = edit.new_end_position;
                            layer.flags |= LayerUpdateFlags::MOVED;
                        } else {
                            range.end_byte = range
                                .end_byte
                                .saturating_sub(edit.old_end_byte)
                                .saturating_add(edit.new_end_byte);
                            range.end_point = point_add(
                                edit.new_end_position,
                                point_sub(range.end_point, edit.old_end_position),
                            );
                            layer.flags = LayerUpdateFlags::MODIFIED;
                        }
                    }
                }
            }

            let hash = layers_hasher.hash_one(layer);
            // Safety: insert_no_grow is unsafe because it assumes that the table
            // has enough capacity to hold additional elements.
            // This is always the case as we reserved enough capacity above.
            unsafe { layers_table.insert_no_grow(hash, layer_id) };
        }

        PARSER.with(|ts_parser| {
            let ts_parser = &mut ts_parser.borrow_mut();
            ts_parser.parser.set_timeout_micros(1000 * 500); // half a second is pretty generous
            let mut cursor = ts_parser.cursors.pop().unwrap_or_else(QueryCursor::new);
            // TODO: might need to set cursor range
            cursor.set_byte_range(0..usize::MAX);
            cursor.set_match_limit(TREE_SITTER_MATCH_LIMIT);

            let source_slice = source.slice(..);

            while let Some(layer_id) = queue.pop_front() {
                let layer = &mut self.layers[layer_id];

                // Mark the layer as touched
                layer.flags |= LayerUpdateFlags::TOUCHED;

                // If a tree already exists, notify it of changes.
                if let Some(tree) = &mut layer.tree {
                    if layer
                        .flags
                        .intersects(LayerUpdateFlags::MODIFIED | LayerUpdateFlags::MOVED)
                    {
                        for edit in edits.iter().rev() {
                            // Apply the edits in reverse.
                            // If we applied them in order then edit 1 would disrupt the positioning of edit 2.
                            tree.edit(edit);
                        }
                    }

                    if layer.flags.contains(LayerUpdateFlags::MODIFIED) {
                        // Re-parse the tree.
                        layer.parse(&mut ts_parser.parser, source)?;
                    }
                } else {
                    // always parse if this layer has never been parsed before
                    layer.parse(&mut ts_parser.parser, source)?;
                }

                // Switch to an immutable borrow.
                let layer = &self.layers[layer_id];

                // Process injections.
                let matches = cursor.matches(
                    &layer.config.injections_query,
                    layer.tree().root_node(),
                    RopeProvider(source_slice),
                );
                let mut injections = Vec::new();
                for mat in matches {
                    let (injection_capture, content_node, included_children) = layer
                        .config
                        .injection_for_match(&layer.config.injections_query, &mat, source_slice);

                    // Explicitly remove this match so that none of its other captures will remain
                    // in the stream of captures.
                    mat.remove();

                    // If a language is found with the given name, then add a new language layer
                    // to the highlighted document.
                    if let (Some(injection_capture), Some(content_node)) =
                        (injection_capture, content_node)
                    {
                        if let Some(config) = (injection_callback)(&injection_capture) {
                            let ranges =
                                intersect_ranges(&layer.ranges, &[content_node], included_children);

                            if !ranges.is_empty() {
                                injections.push((config, ranges));
                            }
                        }
                    }
                }

                // Process combined injections.
                if let Some(combined_injections_query) = &layer.config.combined_injections_query {
                    let mut injections_by_pattern_index =
                        vec![
                            (None, Vec::new(), IncludedChildren::default());
                            combined_injections_query.pattern_count()
                        ];
                    let matches = cursor.matches(
                        combined_injections_query,
                        layer.tree().root_node(),
                        RopeProvider(source_slice),
                    );
                    for mat in matches {
                        let entry = &mut injections_by_pattern_index[mat.pattern_index];
                        let (injection_capture, content_node, included_children) = layer
                            .config
                            .injection_for_match(combined_injections_query, &mat, source_slice);
                        if injection_capture.is_some() {
                            entry.0 = injection_capture;
                        }
                        if let Some(content_node) = content_node {
                            entry.1.push(content_node);
                        }
                        entry.2 = included_children;
                    }
                    for (lang_name, content_nodes, included_children) in injections_by_pattern_index
                    {
                        if let (Some(lang_name), false) = (lang_name, content_nodes.is_empty()) {
                            if let Some(config) = (injection_callback)(&lang_name) {
                                let ranges = intersect_ranges(
                                    &layer.ranges,
                                    &content_nodes,
                                    included_children,
                                );
                                if !ranges.is_empty() {
                                    injections.push((config, ranges));
                                }
                            }
                        }
                    }
                }

                let depth = layer.depth + 1;
                // TODO: can't inline this since matches borrows self.layers
                for (config, ranges) in injections {
                    let new_layer = LanguageLayer {
                        tree: None,
                        config,
                        depth,
                        ranges,
                        flags: LayerUpdateFlags::empty(),
                    };

                    // Find an identical existing layer
                    let layer = layers_table
                        .get(layers_hasher.hash_one(&new_layer), |&it| {
                            self.layers[it] == new_layer
                        })
                        .copied();

                    // ...or insert a new one.
                    let layer_id = layer.unwrap_or_else(|| self.layers.insert(new_layer));

                    queue.push_back(layer_id);
                }

                // TODO: pre-process local scopes at this time, rather than highlight?
                // would solve problems with locals not working across boundaries
            }

            // Return the cursor back in the pool.
            ts_parser.cursors.push(cursor);

            // Reset all `LayerUpdateFlags` and remove all untouched layers
            self.layers.retain(|_, layer| {
                replace(&mut layer.flags, LayerUpdateFlags::empty())
                    .contains(LayerUpdateFlags::TOUCHED)
            });

            Ok(())
        })
    }

    pub fn tree(&self) -> &Tree {
        self.layers[self.root].tree()
    }

    /// Iterate over the highlighted regions for a given slice of source code.
    pub fn highlight_iter<'a>(
        &'a self,
        source: RopeSlice<'a>,
        range: Option<std::ops::Range<usize>>,
        cancellation_flag: Option<&'a AtomicUsize>,
    ) -> impl Iterator<Item = Result<HighlightEvent, Error>> + 'a {
        let mut layers = self
            .layers
            .iter()
            .filter_map(|(_, layer)| {
                // TODO: if range doesn't overlap layer range, skip it

                // Reuse a cursor from the pool if available.
                let mut cursor = PARSER.with(|ts_parser| {
                    let highlighter = &mut ts_parser.borrow_mut();
                    highlighter.cursors.pop().unwrap_or_else(QueryCursor::new)
                });

                // The `captures` iterator borrows the `Tree` and the `QueryCursor`, which
                // prevents them from being moved. But both of these values are really just
                // pointers, so it's actually ok to move them.
                let cursor_ref =
                    unsafe { mem::transmute::<_, &'static mut QueryCursor>(&mut cursor) };

                // if reusing cursors & no range this resets to whole range
                cursor_ref.set_byte_range(range.clone().unwrap_or(0..usize::MAX));
                cursor_ref.set_match_limit(TREE_SITTER_MATCH_LIMIT);

                let mut captures = cursor_ref
                    .captures(
                        &layer.config.query,
                        layer.tree().root_node(),
                        RopeProvider(source),
                    )
                    .peekable();

                // If there's no captures, skip the layer
                captures.peek()?;

                Some(HighlightIterLayer {
                    highlight_end_stack: Vec::new(),
                    scope_stack: vec![LocalScope {
                        inherits: false,
                        range: 0..usize::MAX,
                        local_defs: Vec::new(),
                    }],
                    cursor,
                    _tree: None,
                    captures: RefCell::new(captures),
                    config: layer.config.as_ref(), // TODO: just reuse `layer`
                    depth: layer.depth,            // TODO: just reuse `layer`
                })
            })
            .collect::<Vec<_>>();

        layers.sort_unstable_by_key(|layer| layer.sort_key());

        let mut result = HighlightIter {
            source,
            byte_offset: range.map_or(0, |r| r.start),
            cancellation_flag,
            iter_count: 0,
            layers,
            next_event: None,
            last_highlight_range: None,
        };
        result.sort_layers();
        result
    }

    // Commenting
    // comment_strings_for_pos
    // is_commented

    // Indentation
    // suggested_indent_for_line_at_buffer_row
    // suggested_indent_for_buffer_row
    // indent_level_for_line

    // TODO: Folding
}

bitflags! {
    /// Flags that track the status of a layer
    /// in the `Syntax::update` function
    #[derive(Debug)]
    struct LayerUpdateFlags : u32 {
        const MODIFIED = 0b001;
        const MOVED = 0b010;
        const TOUCHED = 0b100;
    }
}

#[derive(Debug)]
pub struct LanguageLayer {
    // mode
    // grammar
    pub config: Arc,
    pub(crate) tree: Option<Tree>,
    pub ranges: Vec<Range>,
    pub depth: u32,
    flags: LayerUpdateFlags,
}

/// This PartialEq implementation only checks whether two layers
/// are theoretically identical (meaning they highlight the same text range with the same language).
/// It does not check whether the layers have the same internal treesitter
/// state.
impl PartialEq for LanguageLayer {
    fn eq(&self, other: &Self) -> bool {
        self.depth == other.depth
            && self.config.language == other.config.language
            && self.ranges == other.ranges
    }
}

/// Hash implementation belongs to PartialEq implementation above.
/// See its documentation for details.
impl Hash for LanguageLayer {
    fn hash<H: Hasher>(&self, state: &mut H) {
        self.depth.hash(state);
        // The transmute is necessary here because tree_sitter::Language does not derive Hash at the moment.
        // However it does use #[repr(transparent)] so the transmute here is safe
        // as `Language` (which `Grammar` is an alias for) is just a newtype wrapper around a (thin) pointer.
        // This is also compatible with the PartialEq implementation of language
        // as that is just a pointer comparison.
        let language: *const () = unsafe { transmute(self.config.language) };
        language.hash(state);
        self.ranges.hash(state);
    }
}

impl LanguageLayer {
    pub fn tree(&self) -> &Tree {
        // TODO: no unwrap
        self.tree.as_ref().unwrap()
    }

    fn parse(&mut self, parser: &mut Parser, source: &Rope) -> Result<(), Error> {
        parser
            .set_included_ranges(&self.ranges)
            .map_err(|_| Error::InvalidRanges)?;

        parser
            .set_language(self.config.language)
            .map_err(|_| Error::InvalidLanguage)?;

        // unsafe { syntax.parser.set_cancellation_flag(cancellation_flag) };
        let tree = parser
            .parse_with(
                &mut |byte, _| {
                    if byte <= source.len_bytes() {
                        let (chunk, start_byte, _, _) = source.chunk_at_byte(byte);
                        &chunk.as_bytes()[byte - start_byte..]
                    } else {
                        // out of range
                        &[]
                    }
                },
                self.tree.as_ref(),
            )
            .ok_or(Error::Cancelled)?;
        // unsafe { ts_parser.parser.set_cancellation_flag(None) };
        self.tree = Some(tree);
        Ok(())
    }
}

pub(crate) fn generate_edits(
    old_text: &Rope,
    changeset: &ChangeSet,
) -> Vec<tree_sitter::InputEdit> {
    use Operation::*;
    let mut old_pos = 0;

    let mut edits = Vec::new();

    if changeset.changes.is_empty() {
        return edits;
    }

    let mut iter = changeset.changes.iter().peekable();

    // TODO; this is a lot easier with Change instead of Operation.

    fn point_at_pos(text: &Rope, pos: usize) -> (usize, Point) {
        let byte = text.char_to_byte(pos); // <- attempted to index past end
        let line = text.char_to_line(pos);
        let line_start_byte = text.line_to_byte(line);
        let col = byte - line_start_byte;

        (byte, Point::new(line, col))
    }

    fn traverse(point: Point, text: &Tendril) -> Point {
        let Point {
            mut row,
            mut column,
        } = point;

        // TODO: there should be a better way here.
        let mut chars = text.chars().peekable();
        while let Some(ch) = chars.next() {
            if char_is_line_ending(ch) && !(ch == '\r' && chars.peek() == Some(&'\n')) {
                row += 1;
                column = 0;
            } else {
                column += 1;
            }
        }
        Point { row, column }
    }

    while let Some(change) = iter.next() {
        let len = match change {
            Delete(i) | Retain(i) => *i,
            Insert(_) => 0,
        };
        let mut old_end = old_pos + len;

        match change {
            Retain(_) => {}
            Delete(_) => {
                let (start_byte, start_position) = point_at_pos(old_text, old_pos);
                let (old_end_byte, old_end_position) = point_at_pos(old_text, old_end);

                // deletion
                edits.push(tree_sitter::InputEdit {
                    start_byte,                       // old_pos to byte
                    old_end_byte,                     // old_end to byte
                    new_end_byte: start_byte,         // old_pos to byte
                    start_position,                   // old pos to coords
                    old_end_position,                 // old_end to coords
                    new_end_position: start_position, // old pos to coords
                });
            }
            Insert(s) => {
                let (start_byte, start_position) = point_at_pos(old_text, old_pos);

                // a subsequent delete means a replace, consume it
                if let Some(Delete(len)) = iter.peek() {
                    old_end = old_pos + len;
                    let (old_end_byte, old_end_position) = point_at_pos(old_text, old_end);

                    iter.next();

                    // replacement
                    edits.push(tree_sitter::InputEdit {
                        start_byte,                                    // old_pos to byte
                        old_end_byte,                                  // old_end to byte
                        new_end_byte: start_byte + s.len(),            // old_pos to byte + s.len()
                        start_position,                                // old pos to coords
                        old_end_position,                              // old_end to coords
                        new_end_position: traverse(start_position, s), // old pos + chars, newlines matter too (iter over)
                    });
                } else {
                    // insert
                    edits.push(tree_sitter::InputEdit {
                        start_byte,                                    // old_pos to byte
                        old_end_byte: start_byte,                      // same
                        new_end_byte: start_byte + s.len(),            // old_pos + s.len()
                        start_position,                                // old pos to coords
                        old_end_position: start_position,              // same
                        new_end_position: traverse(start_position, s), // old pos + chars, newlines matter too (iter over)
                    });
                }
            }
        }
        old_pos = old_end;
    }
    edits
}

use std::sync::atomic::{AtomicUsize, Ordering};
use std::{iter, mem, ops, str, usize};
use tree_sitter::{
    Language as Grammar, Node, Parser, Point, Query, QueryCaptures, QueryCursor, QueryError,
    QueryMatch, Range, TextProvider, Tree, TreeCursor,
};

const CANCELLATION_CHECK_INTERVAL: usize = 100;

/// Indicates which highlight should be applied to a region of source code.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct Highlight(pub usize);

/// Represents the reason why syntax highlighting failed.
#[derive(Debug, PartialEq, Eq)]
pub enum Error {
    Cancelled,
    InvalidLanguage,
    InvalidRanges,
    Unknown,
}

/// Represents a single step in rendering a syntax-highlighted document.
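///
/// For example, highlighting `x` in `let x = 1;` as a variable might yield the
/// (illustrative) sequence: `Source { start: 0, end: 4 }`,
/// `HighlightStart(Highlight(i))`, `Source { start: 4, end: 5 }`,
/// `HighlightEnd`, `Source { start: 5, end: 10 }`.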
#[derive(Copy, Clone, Debug)]
pub enum HighlightEvent {
    Source { start: usize, end: usize },
    HighlightStart(Highlight),
    HighlightEnd,
}

/// Contains the data needed to highlight code written in a particular language.
///
/// This struct is immutable and can be shared between threads.
#[derive(Debug)]
pub struct HighlightConfiguration {
    pub language: Grammar,
    pub query: Query,
    injections_query: Query,
    combined_injections_query: Option<Query>,
    highlights_pattern_index: usize,
    highlight_indices: ArcSwap<Vec<Option<Highlight>>>,
    non_local_variable_patterns: Vec<bool>,
    injection_content_capture_index: Option<u32>,
    injection_language_capture_index: Option<u32>,
    injection_filename_capture_index: Option<u32>,
    injection_shebang_capture_index: Option<u32>,
    local_scope_capture_index: Option<u32>,
    local_def_capture_index: Option<u32>,
    local_def_value_capture_index: Option<u32>,
    local_ref_capture_index: Option<u32>,
}

#[derive(Debug)]
struct LocalDef<'a> {
    name: Cow<'a, str>,
    value_range: ops::Range<usize>,
    highlight: Option<Highlight>,
}

#[derive(Debug)]
struct LocalScope<'a> {
    inherits: bool,
    range: ops::Range<usize>,
    local_defs: Vec<LocalDef<'a>>,
}

#[derive(Debug)]
struct HighlightIter<'a> {
    source: RopeSlice<'a>,
    byte_offset: usize,
    cancellation_flag: Option<&'a AtomicUsize>,
    layers: Vec<HighlightIterLayer<'a>>,
    iter_count: usize,
    next_event: Option<HighlightEvent>,
    last_highlight_range: Option<(usize, usize, u32)>,
}

// Adapter to convert rope chunks to bytes
pub struct ChunksBytes<'a> {
    chunks: ropey::iter::Chunks<'a>,
}
impl<'a> Iterator for ChunksBytes<'a> {
    type Item = &'a [u8];
    fn next(&mut self) -> Option<Self::Item> {
        self.chunks.next().map(str::as_bytes)
    }
}

pub struct RopeProvider<'a>(pub RopeSlice<'a>);
impl<'a> TextProvider<'a> for RopeProvider<'a> {
    type I = ChunksBytes<'a>;

    fn text(&mut self, node: Node) -> Self::I {
        let fragment = self.0.byte_slice(node.start_byte()..node.end_byte());
        ChunksBytes {
            chunks: fragment.chunks(),
        }
    }
}

struct HighlightIterLayer<'a> {
    _tree: Option<Tree>,
    cursor: QueryCursor,
    captures: RefCell<iter::Peekable<QueryCaptures<'a, 'a, RopeProvider<'a>>>>,
    config: &'a HighlightConfiguration,
    highlight_end_stack: Vec<usize>,
    scope_stack: Vec<LocalScope<'a>>,
    depth: u32,
}

impl<'a> fmt::Debug for HighlightIterLayer<'a> {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("HighlightIterLayer").finish()
    }
}

impl HighlightConfiguration {
    /// Creates a `HighlightConfiguration` for a given `Grammar` and set of highlighting
    /// queries.
    ///
    /// # Parameters
    ///
    /// * `language`  - The Tree-sitter `Grammar` that should be used for parsing.
    /// * `highlights_query` - A string containing tree patterns for syntax highlighting. This
    ///   should be non-empty, otherwise no syntax highlights will be added.
    /// * `injection_query` - A string containing tree patterns for injecting other languages
    ///   into the document. This can be empty if no injections are desired.
    /// * `locals_query` - A string containing tree patterns for tracking local variable
    ///   definitions and references. This can be empty if local variable tracking is not needed.
    ///
    /// Returns a `HighlightConfiguration` that can then be used with the `highlight` method.
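    ///
    /// A minimal call sketch (the query strings here are illustrative, not
    /// part of the original source):
    ///
    /// ```ignore
    /// let config = HighlightConfiguration::new(
    ///     language,
    ///     "(identifier) @variable",
    ///     "", // no injections
    ///     "", // no locals
    /// )?;
    /// ```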
    pub fn new(
        language: Grammar,
        highlights_query: &str,
        injection_query: &str,
        locals_query: &str,
    ) -> Result<Self, QueryError> {
        // Concatenate the query strings, keeping track of the start offset of each section.
        let mut query_source = String::new();
        query_source.push_str(locals_query);
        let highlights_query_offset = query_source.len();
        query_source.push_str(highlights_query);

        // Construct a single query by concatenating the two query strings, but record the
        // range of pattern indices that belongs to each individual string.
        let query = Query::new(language, &query_source)?;
        let mut highlights_pattern_index = 0;
        for i in 0..(query.pattern_count()) {
            let pattern_offset = query.start_byte_for_pattern(i);
            if pattern_offset < highlights_query_offset {
                highlights_pattern_index += 1;
            }
        }

        let mut injections_query = Query::new(language, injection_query)?;

        // Construct a separate query just for dealing with the 'combined injections'.
        // Disable the combined injection patterns in the main query.
        let mut combined_injections_query = Query::new(language, injection_query)?;
        let mut has_combined_queries = false;
        for pattern_index in 0..injections_query.pattern_count() {
            let settings = injections_query.property_settings(pattern_index);
            if settings.iter().any(|s| &*s.key == "injection.combined") {
                has_combined_queries = true;
                injections_query.disable_pattern(pattern_index);
            } else {
                combined_injections_query.disable_pattern(pattern_index);
            }
        }
        let combined_injections_query = if has_combined_queries {
            Some(combined_injections_query)
        } else {
            None
        };

        // Find all of the highlighting patterns that are disabled for nodes that
        // have been identified as local variables.
        let non_local_variable_patterns = (0..query.pattern_count())
            .map(|i| {
                query
                    .property_predicates(i)
                    .iter()
                    .any(|(prop, positive)| !*positive && prop.key.as_ref() == "local")
            })
            .collect();

        // Store the numeric ids for all of the special captures.
        let mut injection_content_capture_index = None;
        let mut injection_language_capture_index = None;
        let mut injection_filename_capture_index = None;
        let mut injection_shebang_capture_index = None;
        let mut local_def_capture_index = None;
        let mut local_def_value_capture_index = None;
        let mut local_ref_capture_index = None;
        let mut local_scope_capture_index = None;
        for (i, name) in query.capture_names().iter().enumerate() {
            let i = Some(i as u32);
            match name.as_str() {
                "local.definition" => local_def_capture_index = i,
                "local.definition-value" => local_def_value_capture_index = i,
                "local.reference" => local_ref_capture_index = i,
                "local.scope" => local_scope_capture_index = i,
                _ => {}
            }
        }

        for (i, name) in injections_query.capture_names().iter().enumerate() {
            let i = Some(i as u32);
            match name.as_str() {
                "injection.content" => injection_content_capture_index = i,
                "injection.language" => injection_language_capture_index = i,
                "injection.filename" => injection_filename_capture_index = i,
                "injection.shebang" => injection_shebang_capture_index = i,
                _ => {}
            }
        }

        let highlight_indices = ArcSwap::from_pointee(vec![None; query.capture_names().len()]);
        Ok(Self {
            language,
            query,
            injections_query,
            combined_injections_query,
            highlights_pattern_index,
            highlight_indices,
            non_local_variable_patterns,
            injection_content_capture_index,
            injection_language_capture_index,
            injection_filename_capture_index,
            injection_shebang_capture_index,
            local_scope_capture_index,
            local_def_capture_index,
            local_def_value_capture_index,
            local_ref_capture_index,
        })
    }

    /// Get a slice containing all of the highlight names used in the configuration.
    pub fn names(&self) -> &[String] {
        self.query.capture_names()
    }

    /// Set the list of recognized highlight names.
    ///
    /// Tree-sitter syntax-highlighting queries specify highlights in the form of dot-separated
    /// highlight names like `punctuation.bracket` and `function.method.builtin`. Consumers of
    /// these queries can choose to recognize highlights with different levels of specificity.
    /// For example, the string `function.builtin` will match against `function.builtin.constructor`
    /// but will not match `function.method.builtin` and `function.method`.
    ///
    /// When highlighting, results are returned as `Highlight` values, which contain the index
    /// of the matched highlight in this list of highlight names.
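    ///
    /// A minimal call sketch (names are illustrative, not part of the
    /// original source):
    ///
    /// ```ignore
    /// let names: Vec<String> = ["function", "function.builtin", "keyword"]
    ///     .iter()
    ///     .map(|s| s.to_string())
    ///     .collect();
    /// config.configure(&names);
    /// ```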
    pub fn configure(&self, recognized_names: &[String]) {
        let mut capture_parts = Vec::new();
        let indices: Vec<_> = self
            .query
            .capture_names()
            .iter()
            .map(move |capture_name| {
                capture_parts.clear();
                capture_parts.extend(capture_name.split('.'));

                let mut best_index = None;
                let mut best_match_len = 0;
                for (i, recognized_name) in recognized_names.iter().enumerate() {
                    let mut len = 0;
                    let mut matches = true;
                    for (i, part) in recognized_name.split('.').enumerate() {
                        match capture_parts.get(i) {
                            Some(capture_part) if *capture_part == part => len += 1,
                            _ => {
                                matches = false;
                                break;
                            }
                        }
                    }
                    if matches && len > best_match_len {
                        best_index = Some(i);
                        best_match_len = len;
                    }
                }
                best_index.map(Highlight)
            })
            .collect();

        self.highlight_indices.store(Arc::new(indices));
    }

    fn injection_pair<'a>(
        &self,
        query_match: &QueryMatch<'a, 'a>,
        source: RopeSlice<'a>,
    ) -> (Option<InjectionLanguageMarker<'a>>, Option<Node<'a>>) {
        let mut injection_capture = None;
        let mut content_node = None;

        for capture in query_match.captures {
            let index = Some(capture.index);
            if index == self.injection_language_capture_index {
                let name = byte_range_to_str(capture.node.byte_range(), source);
                injection_capture = Some(InjectionLanguageMarker::Name(name));
            } else if index == self.injection_filename_capture_index {
                let name = byte_range_to_str(capture.node.byte_range(), source);
                let path = Path::new(name.as_ref()).to_path_buf();
                injection_capture = Some(InjectionLanguageMarker::Filename(path.into()));
            } else if index == self.injection_shebang_capture_index {
                let node_slice = source.byte_slice(capture.node.byte_range());

                // some languages allow space and newlines before the actual string content
                // so a shebang could be on either the first or second line
                let lines = if let Ok(end) = node_slice.try_line_to_byte(2) {
                    node_slice.byte_slice(..end)
                } else {
                    node_slice
                };

                static SHEBANG_REGEX: Lazy<Regex> = Lazy::new(|| Regex::new(SHEBANG).unwrap());

                injection_capture = SHEBANG_REGEX
                    .captures(&Cow::from(lines))
                    .map(|cap| InjectionLanguageMarker::Shebang(cap[1].to_owned()))
            } else if index == self.injection_content_capture_index {
                content_node = Some(capture.node);
            }
        }
        (injection_capture, content_node)
    }

    fn injection_for_match<'a>(
        &self,
        query: &'a Query,
        query_match: &QueryMatch<'a, 'a>,
        source: RopeSlice<'a>,
    ) -> (
        Option<InjectionLanguageMarker<'a>>,
        Option<Node<'a>>,
        IncludedChildren,
    ) {
        let (mut injection_capture, content_node) = self.injection_pair(query_match, source);

        let mut included_children = IncludedChildren::default();
        for prop in query.property_settings(query_match.pattern_index) {
            match prop.key.as_ref() {
                // In addition to specifying the language name via the text of a
                // captured node, it can also be hard-coded via a `#set!` predicate
                // that sets the injection.language key.
                "injection.language" if injection_capture.is_none() => {
                    injection_capture = prop
                        .value
                        .as_ref()
                        .map(|s| InjectionLanguageMarker::Name(s.as_ref().into()));
                }

                // By default, injections do not include the *children* of an
                // `injection.content` node - only the ranges that belong to the
                // node itself. This can be changed using a `#set!` predicate that
                // sets the `injection.include-children` key.
                "injection.include-children" => included_children = IncludedChildren::All,

                // Some queries might only exclude named children but include unnamed
                // children in their `injection.content` node. This can be enabled using
                // a `#set!` predicate that sets the `injection.include-unnamed-children` key.
                "injection.include-unnamed-children" => {
                    included_children = IncludedChildren::Unnamed
                }
                _ => {}
            }
        }

        (injection_capture, content_node, included_children)
    }
}

impl<'a> HighlightIterLayer<'a> {
    // First, sort scope boundaries by their byte offset in the document. At a
    // given position, emit scope endings before scope beginnings. Finally, emit
    // scope boundaries from deeper layers first.
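    // For example (hypothetical keys, not from the original source): the keys
    // (5, false, 0), (5, true, 0), and (7, true, -1) already sort in that
    // order, so an end falling at byte 5 is emitted before a start at byte 5,
    // and both come before any boundary at byte 7.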
    fn sort_key(&self) -> Option<(usize, bool, isize)> {
        let depth = -(self.depth as isize);
        let next_start = self
            .captures
            .borrow_mut()
            .peek()
            .map(|(m, i)| m.captures[*i].node.start_byte());
        let next_end = self.highlight_end_stack.last().cloned();
        match (next_start, next_end) {
            (Some(start), Some(end)) => {
                if start < end {
                    Some((start, true, depth))
                } else {
                    Some((end, false, depth))
                }
            }
            (Some(i), None) => Some((i, true, depth)),
            (None, Some(j)) => Some((j, false, depth)),
            _ => None,
        }
    }
}

#[derive(Clone)]
enum IncludedChildren {
    None,
    All,
    Unnamed,
}

impl Default for IncludedChildren {
    fn default() -> Self {
        Self::None
    }
}

// Compute the ranges that should be included when parsing an injection.
// This takes into account three things:
// * `parent_ranges` - The ranges must all fall within the *current* layer's ranges.
// * `nodes` - Every injection takes place within a set of nodes. The injection ranges
//   are the ranges of those nodes.
// * `includes_children` - For some injections, the content nodes' children should be
//   excluded from the nested document, so that only the content nodes' *own* content
//   is reparsed. For other injections, the content nodes' entire ranges should be
//   reparsed, including the ranges of their children.
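// For example (a hypothetical layout, not from the original source): with a
// single parent range covering bytes 0..100, one content node spanning
// 10..90, and one child node at 40..50, `IncludedChildren::None` yields the
// ranges 10..40 and 50..90, while `IncludedChildren::All` yields 10..90.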
fn intersect_ranges(
    parent_ranges: &[Range],
    nodes: &[Node],
    included_children: IncludedChildren,
) -> Vec<Range> {
    let mut cursor = nodes[0].walk();
    let mut result = Vec::new();
    let mut parent_range_iter = parent_ranges.iter();
    let mut parent_range = parent_range_iter
        .next()
        .expect("Layers should only be constructed with non-empty ranges vectors");
    for node in nodes.iter() {
        let mut preceding_range = Range {
            start_byte: 0,
            start_point: Point::new(0, 0),
            end_byte: node.start_byte(),
            end_point: node.start_position(),
        };
        let following_range = Range {
            start_byte: node.end_byte(),
            start_point: node.end_position(),
            end_byte: usize::MAX,
            end_point: Point::new(usize::MAX, usize::MAX),
        };

        for excluded_range in node
            .children(&mut cursor)
            .filter_map(|child| match included_children {
                IncludedChildren::None => Some(child.range()),
                IncludedChildren::All => None,
                IncludedChildren::Unnamed => {
                    if child.is_named() {
                        Some(child.range())
                    } else {
                        None
                    }
                }
            })
            .chain([following_range].iter().cloned())
        {
            let mut range = Range {
                start_byte: preceding_range.end_byte,
                start_point: preceding_range.end_point,
                end_byte: excluded_range.start_byte,
                end_point: excluded_range.start_point,
            };
            preceding_range = excluded_range;

            if range.end_byte < parent_range.start_byte {
                continue;
            }

            while parent_range.start_byte <= range.end_byte {
                if parent_range.end_byte > range.start_byte {
                    if range.start_byte < parent_range.start_byte {
                        range.start_byte = parent_range.start_byte;
                        range.start_point = parent_range.start_point;
                    }

                    if parent_range.end_byte < range.end_byte {
                        if range.start_byte < parent_range.end_byte {
                            result.push(Range {
                                start_byte: range.start_byte,
                                start_point: range.start_point,
                                end_byte: parent_range.end_byte,
                                end_point: parent_range.end_point,
                            });
                        }
                        range.start_byte = parent_range.end_byte;
                        range.start_point = parent_range.end_point;
                    } else {
                        if range.start_byte < range.end_byte {
                            result.push(range);
                        }
                        break;
                    }
                }

                if let Some(next_range) = parent_range_iter.next() {
                    parent_range = next_range;
                } else {
                    return result;
                }
            }
        }
    }
    result
}

impl<'a> HighlightIter<'a> {
    fn emit_event(
        &mut self,
        offset: usize,
        event: Option<HighlightEvent>,
    ) -> Option<Result<HighlightEvent, Error>> {
        let result;
        if self.byte_offset < offset {
            result = Some(Ok(HighlightEvent::Source {
                start: self.byte_offset,
                end: offset,
            }));
            self.byte_offset = offset;
            self.next_event = event;
        } else {
            result = event.map(Ok);
        }
        self.sort_layers();
        result
    }

    fn sort_layers(&mut self) {
        while !self.layers.is_empty() {
            if let Some(sort_key) = self.layers[0].sort_key() {
                let mut i = 0;
                while i + 1 < self.layers.len() {
                    if let Some(next_offset) = self.layers[i + 1].sort_key() {
                        if next_offset < sort_key {
                            i += 1;
                            continue;
                        }
                    } else {
                        let layer = self.layers.remove(i + 1);
                        PARSER.with(|ts_parser| {
                            let highlighter = &mut ts_parser.borrow_mut();
                            highlighter.cursors.push(layer.cursor);
                        });
                    }
                    break;
                }
                if i > 0 {
                    self.layers[0..(i + 1)].rotate_left(1);
                }
                break;
            } else {
                let layer = self.layers.remove(0);
                PARSER.with(|ts_parser| {
                    let highlighter = &mut ts_parser.borrow_mut();
                    highlighter.cursors.push(layer.cursor);
                });
            }
        }
    }
}

impl<'a> Iterator for HighlightIter<'a> {
    type Item = Result<HighlightEvent, Error>;

    fn next(&mut self) -> Option<Self::Item> {
        'main: loop {
            // If we've already determined the next highlight boundary, just return it.
            if let Some(e) = self.next_event.take() {
                return Some(Ok(e));
            }

            // Periodically check for cancellation, returning `Cancelled` error if the
            // cancellation flag was flipped.
            if let Some(cancellation_flag) = self.cancellation_flag {
                self.iter_count += 1;
                if self.iter_count >= CANCELLATION_CHECK_INTERVAL {
                    self.iter_count = 0;
                    if cancellation_flag.load(Ordering::Relaxed) != 0 {
                        return Some(Err(Error::Cancelled));
                    }
                }
            }

            // If none of the layers have any more highlight boundaries, terminate.
            if self.layers.is_empty() {
                let len = self.source.len_bytes();
                return if self.byte_offset < len {
                    let result = Some(Ok(HighlightEvent::Source {
                        start: self.byte_offset,
                        end: len,
                    }));
                    self.byte_offset = len;
                    result
                } else {
                    None
                };
            }

            // Get the next capture from whichever layer has the earliest highlight boundary.
            let range;
            let layer = &mut self.layers[0];
            let captures = layer.captures.get_mut();
            if let Some((next_match, capture_index)) = captures.peek() {
                let next_capture = next_match.captures[*capture_index];
                range = next_capture.node.byte_range();

                // If any previous highlight ends before this node starts, then before
                // processing this capture, emit the source code up until the end of the
                // previous highlight, and an end event for that highlight.
                if let Some(end_byte) = layer.highlight_end_stack.last().cloned() {
                    if end_byte <= range.start {
                        layer.highlight_end_stack.pop();
                        return self.emit_event(end_byte, Some(HighlightEvent::HighlightEnd));
                    }
                }
            }
            // If there are no more captures, then emit any remaining highlight end events.
            // And if there are none of those, then just advance to the end of the document.
            else if let Some(end_byte) = layer.highlight_end_stack.last().cloned() {
                layer.highlight_end_stack.pop();
                return self.emit_event(end_byte, Some(HighlightEvent::HighlightEnd));
            } else {
                return self.emit_event(self.source.len_bytes(), None);
            };

            let (mut match_, capture_index) = captures.next().unwrap();
            let mut capture = match_.captures[capture_index];

            // Remove from the local scope stack any local scopes that have already ended.
            while range.start > layer.scope_stack.last().unwrap().range.end {
                layer.scope_stack.pop();
            }

            // If this capture is for tracking local variables, then process the
            // local variable info.
            let mut reference_highlight = None;
            let mut definition_highlight = None;
            while match_.pattern_index < layer.config.highlights_pattern_index {
                // If the node represents a local scope, push a new local scope onto
                // the scope stack.
                if Some(capture.index) == layer.config.local_scope_capture_index {
                    definition_highlight = None;
                    let mut scope = LocalScope {
                        inherits: true,
                        range: range.clone(),
                        local_defs: Vec::new(),
                    };
                    for prop in layer.config.query.property_settings(match_.pattern_index) {
                        if let "local.scope-inherits" = prop.key.as_ref() {
                            scope.inherits =
                                prop.value.as_ref().map_or(true, |r| r.as_ref() == "true");
                        }
                    }
                    layer.scope_stack.push(scope);
                }
                // If the node represents a definition, add a new definition to the
                // local scope at the top of the scope stack.
                else if Some(capture.index) == layer.config.local_def_capture_index {
                    reference_highlight = None;
                    let scope = layer.scope_stack.last_mut().unwrap();

                    let mut value_range = 0..0;
                    for capture in match_.captures {
                        if Some(capture.index) == layer.config.local_def_value_capture_index {
                            value_range = capture.node.byte_range();
                        }
                    }

                    let name = byte_range_to_str(range.clone(), self.source);
                    scope.local_defs.push(LocalDef {
                        name,
                        value_range,
                        highlight: None,
                    });
                    definition_highlight = scope.local_defs.last_mut().map(|s| &mut s.highlight);
                }
                // If the node represents a reference, then try to find the corresponding
                // definition in the scope stack.
                else if Some(capture.index) == layer.config.local_ref_capture_index
                    && definition_highlight.is_none()
                {
                    definition_highlight = None;
                    let name = byte_range_to_str(range.clone(), self.source);
                    for scope in layer.scope_stack.iter().rev() {
                        if let Some(highlight) = scope.local_defs.iter().rev().find_map(|def| {
                            if def.name == name && range.start >= def.value_range.end {
                                Some(def.highlight)
                            } else {
                                None
                            }
                        }) {
                            reference_highlight = highlight;
                            break;
                        }
                        if !scope.inherits {
                            break;
                        }
                    }
                }

                // Continue processing any additional matches for the same node.
                if let Some((next_match, next_capture_index)) = captures.peek() {
                    let next_capture = next_match.captures[*next_capture_index];
                    if next_capture.node == capture.node {
                        capture = next_capture;
                        match_ = captures.next().unwrap().0;
                        continue;
                    }
                }

                self.sort_layers();
                continue 'main;
            }

            // Otherwise, this capture must represent a highlight.
            // If this exact range has already been highlighted by an earlier pattern, or by
            // a different layer, then skip over this one.
            if let Some((last_start, last_end, last_depth)) = self.last_highlight_range {
                if range.start == last_start && range.end == last_end && layer.depth < last_depth {
                    self.sort_layers();
                    continue 'main;
                }
            }

            // If the current node was found to be a local variable, then skip over any
            // highlighting patterns that are disabled for local variables.
            if definition_highlight.is_some() || reference_highlight.is_some() {
                while layer.config.non_local_variable_patterns[match_.pattern_index] {
                    if let Some((next_match, next_capture_index)) = captures.peek() {
                        let next_capture = next_match.captures[*next_capture_index];
                        if next_capture.node == capture.node {
                            capture = next_capture;
                            match_ = captures.next().unwrap().0;
                            continue;
                        }
                    }

                    self.sort_layers();
                    continue 'main;
                }
            }

            // Once a highlighting pattern is found for the current node, skip over
            // any later highlighting patterns that also match this node. Captures
            // for a given node are ordered by pattern index, so these subsequent
            // captures are guaranteed to be for highlighting, not injections or
            // local variables.
            while let Some((next_match, next_capture_index)) = captures.peek() {
                let next_capture = next_match.captures[*next_capture_index];
                if next_capture.node == capture.node {
                    captures.next();
                } else {
                    break;
                }
            }

            let current_highlight = layer.config.highlight_indices.load()[capture.index as usize];

            // If this node represents a local definition, then store the current
            // highlight value on the local scope entry representing this node.
            if let Some(definition_highlight) = definition_highlight {
                *definition_highlight = current_highlight;
            }

            // Emit a scope start event and push the node's end position to the stack.
            if let Some(highlight) = reference_highlight.or(current_highlight) {
                self.last_highlight_range = Some((range.start, range.end, layer.depth));
                layer.highlight_end_stack.push(range.end);
                return self
                    .emit_event(range.start, Some(HighlightEvent::HighlightStart(highlight)));
            }

            self.sort_layers();
        }
    }
}

#[derive(Debug, Clone)]
pub enum InjectionLanguageMarker<'a> {
    Name(Cow<'a, str>),
    Filename(Cow<'a, Path>),
    Shebang(String),
}

const SHEBANG: &str = r"#!\s*(?:\S*[/\\](?:env\s+(?:\-\S+\s+)*)?)?([^\s\.\d]+)";
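// For example, `#!/bin/bash` captures "bash", and `#!/usr/bin/env -S python3`
// captures "python" (the capture's character class stops at digits and dots).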

pub struct Merge<I> {
    iter: I,
    spans: Box<dyn Iterator<Item = (usize, std::ops::Range<usize>)>>,

    next_event: Option<HighlightEvent>,
    next_span: Option<(usize, std::ops::Range<usize>)>,

    queue: Vec<HighlightEvent>,
}

/// Merge a list of spans into the highlight event stream.
pub fn merge<I: Iterator<Item = HighlightEvent>>(
    iter: I,
    spans: Vec<(usize, std::ops::Range<usize>)>,
) -> Merge<I> {
    let spans = Box::new(spans.into_iter());
    let mut merge = Merge {
        iter,
        spans,
        next_event: None,
        next_span: None,
        queue: Vec::new(),
    };
    merge.next_event = merge.iter.next();
    merge.next_span = merge.spans.next();
    merge
}
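
// A minimal usage sketch (not part of the original source): overlay one extra
// span, e.g. a hypothetical diagnostic covering bytes 4..10 mapped to
// highlight index 7, onto an existing event stream.
fn _merge_example(events: Vec<HighlightEvent>) -> Vec<HighlightEvent> {
    merge(events.into_iter(), vec![(7, 4..10)]).collect()
}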

impl<I: Iterator<Item = HighlightEvent>> Iterator for Merge<I> {
    type Item = HighlightEvent;
    fn next(&mut self) -> Option<Self::Item> {
        use HighlightEvent::*;
        if let Some(event) = self.queue.pop() {
            return Some(event);
        }

        loop {
            match (self.next_event, &self.next_span) {
                // this happens when range is partially or fully offscreen
                (Some(Source { start, .. }), Some((span, range))) if start > range.start => {
                    if start > range.end {
                        self.next_span = self.spans.next();
                    } else {
                        self.next_span = Some((*span, start..range.end));
                    };
                }
                _ => break,
            }
        }

        match (self.next_event, &self.next_span) {
            (Some(HighlightStart(i)), _) => {
                self.next_event = self.iter.next();
                Some(HighlightStart(i))
            }
            (Some(HighlightEnd), _) => {
                self.next_event = self.iter.next();
                Some(HighlightEnd)
            }
            (Some(Source { start, end }), Some((_, range))) if start < range.start => {
                let intersect = range.start.min(end);
                let event = Source {
                    start,
                    end: intersect,
                };

                if end == intersect {
                    // the event is complete
                    self.next_event = self.iter.next();
                } else {
                    // subslice the event
                    self.next_event = Some(Source {
                        start: intersect,
                        end,
                    });
                };

                Some(event)
            }
            (Some(Source { start, end }), Some((span, range))) if start == range.start => {
                let intersect = range.end.min(end);
                let event = HighlightStart(Highlight(*span));

                // enqueue in reverse order
                self.queue.push(HighlightEnd);
                self.queue.push(Source {
                    start,
                    end: intersect,
                });

                if end == intersect {
                    // the event is complete
                    self.next_event = self.iter.next();
                } else {
                    // subslice the event
                    self.next_event = Some(Source {
                        start: intersect,
                        end,
                    });
                };

                if intersect == range.end {
                    self.next_span = self.spans.next();
                } else {
                    self.next_span = Some((*span, intersect..range.end));
                }

                Some(event)
            }
            (Some(event), None) => {
                self.next_event = self.iter.next();
                Some(event)
            }
            // Can happen if cursor at EOF and/or diagnostic reaches past the end.
            // We need to actually emit events for the cursor-at-EOF situation,
            // even though the range is past the end of the text.  This needs to be
            // handled appropriately by the drawing code by not assuming that
            // all `Source` events point to valid indices in the rope.
            (None, Some((span, range))) => {
                let event = HighlightStart(Highlight(*span));
                self.queue.push(HighlightEnd);
                self.queue.push(Source {
                    start: range.start,
                    end: range.end,
                });
                self.next_span = self.spans.next();
                Some(event)
            }
            (None, None) => None,
            e => unreachable!("{:?}", e),
        }
    }
}

fn node_is_visible(node: &Node) -> bool {
    node.is_missing() || (node.is_named() && node.language().node_kind_is_visible(node.kind_id()))
}

pub fn pretty_print_tree<W: fmt::Write>(fmt: &mut W, node: Node) -> fmt::Result {
    if node.child_count() == 0 {
        if node_is_visible(&node) {
            write!(fmt, "({})", node.kind())
        } else {
            write!(fmt, "\"{}\"", node.kind())
        }
    } else {
        pretty_print_tree_impl(fmt, &mut node.walk(), 0)
    }
}
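
// A minimal usage sketch (not part of the original source): render a node's
// subtree into a `String` for debugging or tests.
fn _pretty_print_to_string(node: Node) -> String {
    let mut out = String::new();
    pretty_print_tree(&mut out, node).expect("writing to a String cannot fail");
    out
}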

fn pretty_print_tree_impl<W: fmt::Write>(
    fmt: &mut W,
    cursor: &mut TreeCursor,
    depth: usize,
) -> fmt::Result {
    let node = cursor.node();
    let visible = node_is_visible(&node);

    if visible {
        let indentation_columns = depth * 2;
        write!(fmt, "{:indentation_columns$}", "")?;

        if let Some(field_name) = cursor.field_name() {
            write!(fmt, "{}: ", field_name)?;
        }

        write!(fmt, "({}", node.kind())?;
    }

    // Handle children.
    if cursor.goto_first_child() {
        loop {
            if node_is_visible(&cursor.node()) {
                fmt.write_char('\n')?;
            }

            pretty_print_tree_impl(fmt, cursor, depth + 1)?;

            if !cursor.goto_next_sibling() {
                break;
            }
        }

        let moved = cursor.goto_parent();
        // The parent of the first child must exist, and must be `node`.
        debug_assert!(moved);
        debug_assert!(cursor.node() == node);
    }

    if visible {
        fmt.write_char(')')?;
    }

    Ok(())
}

#[cfg(test)]
mod test {
    use super::*;
    use crate::{Rope, Transaction};

    #[test]
    fn test_textobject_queries() {
        let query_str = r#"
        (line_comment)+ @quantified_nodes
        ((line_comment)+) @quantified_nodes_grouped
        ((line_comment) (line_comment)) @multiple_nodes_grouped
        "#;
        let source = Rope::from_str(
            r#"
/// a comment on
/// multiple lines
        "#,
        );

        let loader = Loader::new(Configuration {
            language: vec![],
            language_server: HashMap::new(),
        });
        let language = get_language("rust").unwrap();

        let query = Query::new(language, query_str).unwrap();
        let textobject = TextObjectQuery { query };
        let mut cursor = QueryCursor::new();

        let config = HighlightConfiguration::new(language, "", "", "").unwrap();
        let syntax = Syntax::new(&source, Arc::new(config), Arc::new(loader)).unwrap();

        let root = syntax.tree().root_node();
        let mut test = |capture, range| {
            let matches: Vec<_> = textobject
                .capture_nodes(capture, root, source.slice(..), &mut cursor)
                .unwrap()
                .collect();

            assert_eq!(
                matches[0].byte_range(),
                range,
                "@{} expected {:?}",
                capture,
                range
            )
        };

        test("quantified_nodes", 1..36);
        // NOTE: Enable after implementing proper node group capturing
        // test("quantified_nodes_grouped", 1..36);
        // test("multiple_nodes_grouped", 1..36);
    }

    #[test]
    fn test_parser() {
        let highlight_names: Vec<String> = [
            "attribute",
            "constant",
            "function.builtin",
            "function",
            "keyword",
            "operator",
            "property",
            "punctuation",
            "punctuation.bracket",
            "punctuation.delimiter",
            "string",
            "string.special",
            "tag",
            "type",
            "type.builtin",
            "variable",
            "variable.builtin",
            "variable.parameter",
        ]
        .iter()
        .cloned()
        .map(String::from)
        .collect();

        let loader = Loader::new(Configuration {
            language: vec![],
            language_server: HashMap::new(),
        });

        let language = get_language("rust").unwrap();
        let config = HighlightConfiguration::new(
            language,
            &std::fs::read_to_string("../runtime/grammars/sources/rust/queries/highlights.scm")
                .unwrap(),
            &std::fs::read_to_string("../runtime/grammars/sources/rust/queries/injections.scm")
                .unwrap(),
            "", // locals.scm
        )
        .unwrap();
        config.configure(&highlight_names);

        let source = Rope::from_str(
            "
            struct Stuff {}
            fn main() {}
        ",
        );
        let syntax = Syntax::new(&source, Arc::new(config), Arc::new(loader)).unwrap();
        let tree = syntax.tree();
        let root = tree.root_node();
        assert_eq!(root.kind(), "source_file");

        assert_eq!(
            root.to_sexp(),
            concat!(
                "(source_file ",
                "(struct_item name: (type_identifier) body: (field_declaration_list)) ",
                "(function_item name: (identifier) parameters: (parameters) body: (block)))"
            )
        );

        let struct_node = root.child(0).unwrap();
        assert_eq!(struct_node.kind(), "struct_item");
    }

    #[test]
    fn test_input_edits() {
        use tree_sitter::InputEdit;

        let doc = Rope::from("hello world!\ntest 123");
        let transaction = Transaction::change(
            &doc,
            vec![(6, 11, Some("test".into())), (12, 17, None)].into_iter(),
        );
        let edits = generate_edits(&doc, transaction.changes());
        // transaction.apply(&mut state);

        assert_eq!(
            edits,
            &[
                InputEdit {
                    start_byte: 6,
                    old_end_byte: 11,
                    new_end_byte: 10,
                    start_position: Point { row: 0, column: 6 },
                    old_end_position: Point { row: 0, column: 11 },
                    new_end_position: Point { row: 0, column: 10 }
                },
                InputEdit {
                    start_byte: 12,
                    old_end_byte: 17,
                    new_end_byte: 12,
                    start_position: Point { row: 0, column: 12 },
                    old_end_position: Point { row: 1, column: 4 },
                    new_end_position: Point { row: 0, column: 12 }
                }
            ]
        );

        // Testing with the official example from tree-sitter
        let mut doc = Rope::from("fn test() {}");
        let transaction =
            Transaction::change(&doc, vec![(8, 8, Some("a: u32".into()))].into_iter());
        let edits = generate_edits(&doc, transaction.changes());
        transaction.apply(&mut doc);

        assert_eq!(doc, "fn test(a: u32) {}");
        assert_eq!(
            edits,
            &[InputEdit {
                start_byte: 8,
                old_end_byte: 8,
                new_end_byte: 14,
                start_position: Point { row: 0, column: 8 },
                old_end_position: Point { row: 0, column: 8 },
                new_end_position: Point { row: 0, column: 14 }
            }]
        );
    }

    #[track_caller]
    fn assert_pretty_print(
        language_name: &str,
        source: &str,
        expected: &str,
        start: usize,
        end: usize,
    ) {
        let source = Rope::from_str(source);

        let loader = Loader::new(Configuration {
            language: vec![],
            language_server: HashMap::new(),
        });
        let language = get_language(language_name).unwrap();

        let config = HighlightConfiguration::new(language, "", "", "").unwrap();
        let syntax = Syntax::new(&source, Arc::new(config), Arc::new(loader)).unwrap();

        let root = syntax
            .tree()
            .root_node()
            .descendant_for_byte_range(start, end)
            .unwrap();

        let mut output = String::new();
        pretty_print_tree(&mut output, root).unwrap();

        assert_eq!(expected, output);
    }

    #[test]
    fn test_pretty_print() {
        let source = r#"/// Hello"#;
        assert_pretty_print("rust", source, "(line_comment)", 0, source.len());

        // A large tree should be indented with fields:
        let source = r#"fn main() {
            println!("Hello, World!");
        }"#;
        assert_pretty_print(
            "rust",
            source,
            concat!(
                "(function_item\n",
                "  name: (identifier)\n",
                "  parameters: (parameters)\n",
                "  body: (block\n",
                "    (expression_statement\n",
                "      (macro_invocation\n",
                "        macro: (identifier)\n",
                "        (token_tree\n",
                "          (string_literal))))))",
            ),
            0,
            source.len(),
        );

        // Selecting a token should print just that token:
        let source = r#"fn main() {}"#;
        assert_pretty_print("rust", source, r#""fn""#, 0, 1);

        // Error nodes are printed as errors:
        let source = r#"}{"#;
        assert_pretty_print("rust", source, "(ERROR)", 0, source.len());

        // Fields broken under unnamed nodes are determined correctly.
        // In the following source, `object` belongs to the `singleton_method`
        // rule but `name` and `body` belong to an unnamed helper `_method_rest`.
        // This can cause a bug with a pretty-printing implementation that
        // uses `Node::field_name_for_child` to determine field names but is
        // fixed when using `TreeCursor::field_name`.
        let source = "def self.method_name
          true
        end";
        assert_pretty_print(
            "ruby",
            source,
            concat!(
                "(singleton_method\n",
                "  object: (self)\n",
                "  name: (identifier)\n",
                "  body: (body_statement\n",
                "    (true)))"
            ),
            0,
            source.len(),
        );
    }

    #[test]
    fn test_load_runtime_file() {
        // Test to make sure we can load some data from the runtime directory.
        let contents = load_runtime_file("rust", "indents.scm").unwrap();
        assert!(!contents.is_empty());

        let results = load_runtime_file("rust", "does-not-exist");
        assert!(results.is_err());
    }
}