fancy-regex-0.7.1/.cargo_vcs_info.json0000644000000001120000000000000132420ustar { "git": { "sha1": "c886d5fb8275bf6e5108e5483c45acad68d8b60c" } } fancy-regex-0.7.1/.github/FUNDING.yml000064400000000000000000000000350000000000000151710ustar 00000000000000github: [raphlinus, robinst] fancy-regex-0.7.1/.github/workflows/ci.yml000064400000000000000000000041530000000000000165340ustar 00000000000000# Based on https://github.com/actions-rs/meta/blob/master/recipes/msrv.md name: ci on: pull_request: push: branches: - main schedule: - cron: '00 01 * * *' jobs: check: name: check runs-on: ubuntu-latest strategy: matrix: rust: - stable steps: - name: Checkout sources uses: actions/checkout@v2 - name: Install toolchain uses: actions-rs/toolchain@v1 with: toolchain: ${{ matrix.rust }} profile: minimal override: true - name: Run cargo check uses: actions-rs/cargo@v1 with: command: check test: name: test runs-on: ubuntu-latest strategy: matrix: rust: - 1.41.1 # MSRV (minimum supported Rust version) - stable - beta steps: - name: Checkout sources uses: actions/checkout@v2 - name: Install toolchain uses: actions-rs/toolchain@v1 with: toolchain: ${{ matrix.rust }} profile: minimal override: true - name: Run cargo test uses: actions-rs/cargo@v1 with: command: test fmt: name: rustfmt runs-on: ubuntu-latest steps: - name: Checkout sources uses: actions/checkout@v2 - name: Install toolchain uses: actions-rs/toolchain@v1 with: toolchain: stable profile: minimal override: true components: rustfmt - name: Run cargo fmt uses: actions-rs/cargo@v1 with: command: fmt args: --all -- --check coverage: name: coverage runs-on: ubuntu-latest container: image: xd009642/tarpaulin:0.16.0 options: --security-opt seccomp=unconfined steps: - name: Checkout sources uses: actions/checkout@v2 - name: Generate code coverage uses: actions-rs/cargo@v1 with: command: tarpaulin args: --out Xml - name: Upload to codecov.io uses: codecov/codecov-action@v1 with: fail_ci_if_error: true 
fancy-regex-0.7.1/.gitignore000064400000000000000000000000510000000000000140020ustar 00000000000000Cargo.lock target *.iml .idea/ .vscode/ fancy-regex-0.7.1/AUTHORS000064400000000000000000000005050000000000000130660ustar 00000000000000# This is the list of Fancy Regex authors for copyright purposes. # # This does not necessarily list everyone who has contributed code, since in # some cases, their employer may be the copyright holder. To see the full list # of contributors, see the revision history in source control. Google LLC Raph Levien Robin Stocker fancy-regex-0.7.1/CHANGELOG.md000064400000000000000000000124500000000000000136310ustar 00000000000000# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). This project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html), with the exception that 0.x versions can break between minor versions. ## [0.7.1] - 2021-07-29 ### Fixed - Fix panic on incomplete escape sequences in input regexes - Disallow quantifers on lookarounds and other zero-width assertion expressions, e.g. 
the `+` in `(?=hello)+` ## [0.7.0] - 2021-07-12 ### Added - `Regex` now has replace methods like the regex crate: - `replace` - single replacement - `replace_all` - replace all non-overlapping matches - `replacen` - configurable number of replacements ## [0.6.0] - 2021-05-17 ### Added - `Regex` now implements `Clone`, `Display`, `FromStr` - `Captures` now implements `Index` to access captures by number and `Index<&str>` to access by name ## [0.5.0] - 2021-02-15 ### Added - Methods `find_iter` and `captures_iter` to iterate over all non-overlapping matches for a string - Method `find_from_pos` to `find` starting from a specific position ### Changed - MSRV (minimum supported Rust version) is now 1.41.1 (from 1.32.0) ## [0.4.1] - 2020-11-09 ### Added - `escape` function to escape special characters in a string so that it matches literally ## [0.4.0] - 2020-09-27 ### Added - Support for named groups and backrefs: - Capture with `(?...)` or `(?P...)` - Backref with `\k` or `(?P=name)` - `Captures::name` to get matched group by name - `Regex::capture_names` to get capture names in regex - Support for expanding matches using a replacement template string - `Captures::expand` for regex crate compatible syntax - See `Expander` for python-compatible syntax and advanced usage - `Match::range` and some `From` impls for convenience ## [0.3.5] - 2020-04-28 ### Changed - Include string snippet in errors for unknown group and invalid escape to make it easier to identify the problem. ## [0.3.4] - 2020-04-28 ### Added - Support comments using `(?# comment)` syntax - Support unicode escapes like `\u21D2` and `\U0001F60A` ## [0.3.3] - 2020-02-28 ### Changed - Optimization: Delegate const-sized suffixes in more cases - Optimization: Use `captures_read_at` when delegating to regex crate ## [0.3.2] - 2020-02-05 ### Fixed - Some regexes with fancy parts in the beginning/middle didn't match when they should have, e.g. `((?!x)(a|ab))c` didn't match `abc`. 
## [0.3.1] - 2019-12-09 ### Added - Add `delegate_size_limit` and `delegate_dfa_size_limit` to `RegexBuilder` to allow configuring these limits for regex crate. ## [0.3.0] - 2019-11-27 ### Added - Add limit for backtracking so that execution errors instead of running for a long time in case of catastrophic backtracking. - Add `RegexBuilder` with `backtrack_limit` to configure the new backtrack limit per regex. - `Error` now implements `std::error::Error` trait ### Fixed - Fix panic in backref matching with multibyte chars ## [0.2.0] - 2019-10-19 ### Added - More documentation and examples - Support character class nesting and intersections (implemented in regex crate) - Support atomic groups, both the the `(?>foo)` group syntax and the `a++`, `a*+` and `a?+` possessive syntax - Support `\b`, `\f`, `\t`, `\n`, `\r`, `\v` - Support look-behind with variable sized alternative - Implement `Debug` for `Regex` - More test coverage including running one of Oniguruma's test suites ### Changed - Change `find` to return a `Match` struct (breaking change) - Change `Captures` API (breaking change): - Replace `at` and `pos` with `get` that returns a `Match` struct - Remove `is_empty` (use `len`) - Allow unescaped `]` and `}` as literals - Allow unescaped `{` as literal when not after atom - Allow escapes such as `\<` or `\e` inside character classes - Allow up to 8 characters in `\x{...}` escape - Allow escaping of space to make literal space - Allow `(a|)` - Reject invalid backreferences ### Fixed - Multiple fixes for alternatives in look-arounds - Fix hex escape to not include letters after "F" - Fix handling of unescaped `]` in character classes - Fix case insensitive character classes and other escapes - Don't ignore spaces in character classes even with "comment mode" ## [0.1.0] - 2017-02-06 ### Added - Initial release [0.7.1]: https://github.com/fancy-regex/fancy-regex/compare/0.7.0...0.7.1 [0.7.0]: https://github.com/fancy-regex/fancy-regex/compare/0.6.0...0.7.0 [0.6.0]: 
https://github.com/fancy-regex/fancy-regex/compare/0.5.0...0.6.0 [0.5.0]: https://github.com/fancy-regex/fancy-regex/compare/0.4.1...0.5.0 [0.4.1]: https://github.com/fancy-regex/fancy-regex/compare/0.4.0...0.4.1 [0.4.0]: https://github.com/fancy-regex/fancy-regex/compare/0.3.5...0.4.0 [0.3.5]: https://github.com/fancy-regex/fancy-regex/compare/0.3.4...0.3.5 [0.3.4]: https://github.com/fancy-regex/fancy-regex/compare/0.3.3...0.3.4 [0.3.3]: https://github.com/fancy-regex/fancy-regex/compare/0.3.2...0.3.3 [0.3.2]: https://github.com/fancy-regex/fancy-regex/compare/0.3.1...0.3.2 [0.3.1]: https://github.com/fancy-regex/fancy-regex/compare/0.3.0...0.3.1 [0.3.0]: https://github.com/fancy-regex/fancy-regex/compare/0.2.0...0.3.0 [0.2.0]: https://github.com/fancy-regex/fancy-regex/compare/0.1.0...0.2.0 [0.1.0]: https://github.com/fancy-regex/fancy-regex/commits/0.1.0 fancy-regex-0.7.1/CONTRIBUTING.md000064400000000000000000000011240000000000000142450ustar 00000000000000# Contributing The fancy-regex project is committed to fostering and preserving a diverse, welcoming community; all participants are expected to follow the [Rust Code of Conduct](https://www.rust-lang.org/en-US/conduct.html). Patching processes for this project are somewhat informal, as it's maintained by a small group. No Contributor License Agreement is needed. If this is your first substantive pull request in this repo, feel free to add yourself to the AUTHORS file. Make sure to run `cargo test` and `cargo fmt` to make sure your changes pass the tests and are formatted as expected. fancy-regex-0.7.1/Cargo.lock0000644000000421640000000000000112320ustar # This file is automatically @generated by Cargo. # It is not intended for manual editing. 
[[package]] name = "aho-corasick" version = "0.7.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" dependencies = [ "memchr", ] [[package]] name = "atty" version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9b39be18770d11421cdb1b9947a45dd3f37e93092cbf377614828a319d5fee8" dependencies = [ "hermit-abi", "libc", "winapi", ] [[package]] name = "autocfg" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cdb031dd78e28731d87d56cc8ffef4a8f36ca26c38fe2de700543e627f8a464a" [[package]] name = "bit-set" version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e11e16035ea35e4e5997b393eacbf6f63983188f7a2ad25bfb13465f5ad59de" dependencies = [ "bit-vec", ] [[package]] name = "bit-vec" version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349f9b6a179ed607305526ca489b34ad0a41aed5f7980fa90eb03160b69598fb" [[package]] name = "bitflags" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cf1de2fe8c75bc145a2f577add951f8134889b4795d47466a54a5c846d691693" [[package]] name = "bstr" version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "90682c8d613ad3373e66de8c6411e0ae2ab2571e879d2efbf73558cc66f21279" dependencies = [ "lazy_static", "memchr", "regex-automata", "serde", ] [[package]] name = "bumpalo" version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" [[package]] name = "cast" version = "0.2.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c24dab4283a142afa2fdca129b80ad2c6284e073930f964c3a1293c225ee39a" dependencies = [ "rustc_version", ] [[package]] name = "cfg-if" version = "1.0.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "clap" version = "2.33.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37e58ac78573c40708d45522f0d80fa2f01cc4f9b4e2bf749807255454312002" dependencies = [ "bitflags", "textwrap", "unicode-width", ] [[package]] name = "criterion" version = "0.3.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ab327ed7354547cc2ef43cbe20ef68b988e70b4b593cbd66a2a61733123a3d23" dependencies = [ "atty", "cast", "clap", "criterion-plot", "csv", "itertools", "lazy_static", "num-traits", "oorandom", "plotters", "rayon", "regex", "serde", "serde_cbor", "serde_derive", "serde_json", "tinytemplate", "walkdir", ] [[package]] name = "criterion-plot" version = "0.4.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d00996de9f2f7559f7f4dc286073197f83e92256a59ed395f9aac01fe717da57" dependencies = [ "cast", "itertools", ] [[package]] name = "crossbeam-channel" version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06ed27e177f16d65f0f0c22a213e17c696ace5dd64b14258b52f9417ccb52db4" dependencies = [ "cfg-if", "crossbeam-utils", ] [[package]] name = "crossbeam-deque" version = "0.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94af6efb46fef72616855b036a624cf27ba656ffc9be1b9a3c931cfc7749a9a9" dependencies = [ "cfg-if", "crossbeam-epoch", "crossbeam-utils", ] [[package]] name = "crossbeam-epoch" version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" dependencies = [ "cfg-if", "crossbeam-utils", "lazy_static", "memoffset", "scopeguard", ] [[package]] name = "crossbeam-utils" version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" dependencies = [ "cfg-if", "lazy_static", ] [[package]] name = "csv" version = "1.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" dependencies = [ "bstr", "csv-core", "itoa", "ryu", "serde", ] [[package]] name = "csv-core" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" dependencies = [ "memchr", ] [[package]] name = "either" version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" [[package]] name = "env_logger" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a19187fea3ac7e84da7dacf48de0c45d63c6a76f9490dae389aead16c243fce3" dependencies = [ "log", "regex", ] [[package]] name = "fancy-regex" version = "0.7.1" dependencies = [ "bit-set", "criterion", "matches", "quickcheck", "regex", ] [[package]] name = "getrandom" version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7fcd999463524c52659517fe2cea98493cfe485d10565e7b0fb07dbba7ad2753" dependencies = [ "cfg-if", "libc", "wasi", ] [[package]] name = "half" version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62aca2aba2d62b4a7f5b33f3712cb1b0692779a56fb510499d5c0aa594daeaf3" [[package]] name = "hermit-abi" version = "0.1.19" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "62b467343b94ba476dcb2500d242dadbb39557df889310ac77c5d99100aaac33" dependencies = [ "libc", ] [[package]] name = "itertools" version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "69ddb889f9d0d08a67338271fa9b62996bc788c7796a5c18cf057420aaed5eaf" dependencies = [ "either", ] 
[[package]] name = "itoa" version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd25036021b0de88a0aff6b850051563c6516d0bf53f8638938edbb9de732736" [[package]] name = "js-sys" version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "83bdfbace3a0e81a4253f73b49e960b053e396a11012cbd49b9b74d6a2b67062" dependencies = [ "wasm-bindgen", ] [[package]] name = "lazy_static" version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" [[package]] name = "libc" version = "0.2.98" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "320cfe77175da3a483efed4bc0adc1968ca050b098ce4f2f1c13a56626128790" [[package]] name = "log" version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "51b9bbe6c47d51fc3e1a9b945965946b4c44142ab8792c50835a980d362c2710" dependencies = [ "cfg-if", ] [[package]] name = "matches" version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7ffc5c5338469d4d3ea17d269fa8ea3512ad247247c30bd2df69e68309ed0a08" [[package]] name = "memchr" version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "memoffset" version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" dependencies = [ "autocfg", ] [[package]] name = "num-traits" version = "0.2.14" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a64b1ec5cda2586e284722486d802acf1f7dbdc623e2bfc57e65ca1cd099290" dependencies = [ "autocfg", ] [[package]] name = "num_cpus" version = "1.13.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"05499f3756671c15885fee9034446956fff3f243d6077b91e5767df161f766b3" dependencies = [ "hermit-abi", "libc", ] [[package]] name = "oorandom" version = "11.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ab1bc2a289d34bd04a330323ac98a1b4bc82c9d9fcb1e66b63caa84da26b575" [[package]] name = "plotters" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a3fd9ec30b9749ce28cd91f255d569591cdf937fe280c312143e3c4bad6f2a" dependencies = [ "num-traits", "plotters-backend", "plotters-svg", "wasm-bindgen", "web-sys", ] [[package]] name = "plotters-backend" version = "0.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d88417318da0eaf0fdcdb51a0ee6c3bed624333bff8f946733049380be67ac1c" [[package]] name = "plotters-svg" version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "521fa9638fa597e1dc53e9412a4f9cefb01187ee1f7413076f9e6749e2885ba9" dependencies = [ "plotters-backend", ] [[package]] name = "proc-macro2" version = "1.0.28" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c7ed8b8c7b886ea3ed7dde405212185f423ab44682667c8c6dd14aa1d9f6612" dependencies = [ "unicode-xid", ] [[package]] name = "quickcheck" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "efc008b226fa5bdeabfc788d6679692223e940da371a95e26d87678333dac7c8" dependencies = [ "env_logger", "log", "rand", ] [[package]] name = "quote" version = "1.0.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d0b9745dc2debf507c8422de05d7226cc1f0644216dfdfead988f9b1ab32a7" dependencies = [ "proc-macro2", ] [[package]] name = "rand" version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2e7573632e6454cf6b99d7aac4ccca54be06da05aca2ef7423d22d27d4d4bcd8" dependencies = [ "rand_core", ] [[package]] name = "rand_core" version = "0.6.3" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" dependencies = [ "getrandom", ] [[package]] name = "rayon" version = "1.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c06aca804d41dbc8ba42dfd964f0d01334eceb64314b9ecf7c5fad5188a06d90" dependencies = [ "autocfg", "crossbeam-deque", "either", "rayon-core", ] [[package]] name = "rayon-core" version = "1.9.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d78120e2c850279833f1dd3582f730c4ab53ed95aeaaaa862a2a5c71b1656d8e" dependencies = [ "crossbeam-channel", "crossbeam-deque", "crossbeam-utils", "lazy_static", "num_cpus", ] [[package]] name = "regex" version = "1.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d07a8629359eb56f1e2fb1652bb04212c072a87ba68546a04065d525673ac461" dependencies = [ "aho-corasick", "memchr", "regex-syntax", ] [[package]] name = "regex-automata" version = "0.1.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" [[package]] name = "regex-syntax" version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" [[package]] name = "rustc_version" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" dependencies = [ "semver", ] [[package]] name = "ryu" version = "1.0.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "71d301d4193d031abdd79ff7e3dd721168a9572ef3fe51a1517aba235bd8f86e" [[package]] name = "same-file" version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" dependencies = [ 
"winapi-util", ] [[package]] name = "scopeguard" version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd" [[package]] name = "semver" version = "1.0.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5f3aac57ee7f3272d8395c6e4f502f434f0e289fcd62876f70daa008c20dcabe" [[package]] name = "serde" version = "1.0.126" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec7505abeacaec74ae4778d9d9328fe5a5d04253220a85c4ee022239fc996d03" [[package]] name = "serde_cbor" version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e18acfa2f90e8b735b2836ab8d538de304cbb6729a7360729ea5a895d15a622" dependencies = [ "half", "serde", ] [[package]] name = "serde_derive" version = "1.0.126" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "963a7dbc9895aeac7ac90e74f34a5d5261828f79df35cbed41e10189d3804d43" dependencies = [ "proc-macro2", "quote", "syn", ] [[package]] name = "serde_json" version = "1.0.65" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "28c5e91e4240b46c4c19219d6cc84784444326131a4210f496f948d5cc827a29" dependencies = [ "itoa", "ryu", "serde", ] [[package]] name = "syn" version = "1.0.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1873d832550d4588c3dbc20f01361ab00bfe741048f71e3fecf145a7cc18b29c" dependencies = [ "proc-macro2", "quote", "unicode-xid", ] [[package]] name = "textwrap" version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d326610f408c7a4eb6f51c37c330e496b08506c9457c9d34287ecc38809fb060" dependencies = [ "unicode-width", ] [[package]] name = "tinytemplate" version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be4d6b5f19ff7664e8c98d03e2139cb510db9b0a60b55f8e8709b689d939b6bc" dependencies = [ "serde", 
"serde_json", ] [[package]] name = "unicode-width" version = "0.1.8" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9337591893a19b88d8d87f2cec1e73fad5cdfd10e5a6f349f498ad6ea2ffb1e3" [[package]] name = "unicode-xid" version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ccb82d61f80a663efe1f787a51b16b5a51e3314d6ac365b08639f52387b33f3" [[package]] name = "walkdir" version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "808cf2735cd4b6866113f648b791c6adc5714537bc222d9347bb203386ffda56" dependencies = [ "same-file", "winapi", "winapi-util", ] [[package]] name = "wasi" version = "0.10.2+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" [[package]] name = "wasm-bindgen" version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d54ee1d4ed486f78874278e63e4069fc1ab9f6a18ca492076ffb90c5eb2997fd" dependencies = [ "cfg-if", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b33f6a0694ccfea53d94db8b2ed1c3a8a4c86dd936b13b9f0a15ec4a451b900" dependencies = [ "bumpalo", "lazy_static", "log", "proc-macro2", "quote", "syn", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-macro" version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "088169ca61430fe1e58b8096c24975251700e7b1f6fd91cc9d59b04fb9b18bd4" dependencies = [ "quote", "wasm-bindgen-macro-support", ] [[package]] name = "wasm-bindgen-macro-support" version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "be2241542ff3d9f241f5e2cb6dd09b37efe786df8851c54957683a49f0987a97" dependencies = [ "proc-macro2", "quote", "syn", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] 
name = "wasm-bindgen-shared" version = "0.2.74" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d7cff876b8f18eed75a66cf49b65e7f967cb354a7aa16003fb55dbfd25b44b4f" [[package]] name = "web-sys" version = "0.3.51" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e828417b379f3df7111d3a2a9e5753706cae29c41f7c4029ee9fd77f3e09e582" dependencies = [ "js-sys", "wasm-bindgen", ] [[package]] name = "winapi" version = "0.3.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" dependencies = [ "winapi-i686-pc-windows-gnu", "winapi-x86_64-pc-windows-gnu", ] [[package]] name = "winapi-i686-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "70ec6ce85bb158151cae5e5c87f95a8e97d2c0c4b001223f33a334e3ce5de178" dependencies = [ "winapi", ] [[package]] name = "winapi-x86_64-pc-windows-gnu" version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" fancy-regex-0.7.1/Cargo.toml0000644000000023710000000000000112510ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies # # If you believe there's an error in this file please file an # issue against the rust-lang/cargo repository. 
If you're # editing this file be aware that the upstream Cargo.toml # will likely look very different (and much more reasonable) [package] edition = "2018" name = "fancy-regex" version = "0.7.1" authors = ["Raph Levien ", "Robin Stocker "] description = "An implementation of regexes, supporting a relatively rich set of features, including backreferences and look-around." documentation = "https://docs.rs/fancy-regex" readme = "README.md" categories = ["text-processing"] license = "MIT" repository = "https://github.com/fancy-regex/fancy-regex" [[bench]] name = "bench" harness = false [dependencies.bit-set] version = "0.5" [dependencies.regex] version = "1.2" [dev-dependencies.criterion] version = "= 0.3.4" [dev-dependencies.matches] version = "0.1.8" [dev-dependencies.quickcheck] version = "= 1.0.1" [features] track_caller = [] fancy-regex-0.7.1/Cargo.toml.orig0000644000000016650000000000000122150ustar [package] name = "fancy-regex" version = "0.7.1" # remember to update html_root_url authors = ["Raph Levien ", "Robin Stocker "] edition = "2018" license = "MIT" description = "An implementation of regexes, supporting a relatively rich set of features, including backreferences and look-around." readme = "README.md" repository = "https://github.com/fancy-regex/fancy-regex" documentation = "https://docs.rs/fancy-regex" categories = ["text-processing"] [features] # Enable #[track_caller] in unit tests. 
track_caller = [] [dependencies] regex = "1.2" # when we go to >= 1.3.8, we can get rid of the `contains_empty` workaround, see https://github.com/rust-lang/regex/blob/master/CHANGELOG.md#138-2020-05-28 bit-set = "0.5" [dev-dependencies] criterion = "= 0.3.4" # 0.3.5 requires Rust >= 1.46.0 matches = "0.1.8" quickcheck = "= 1.0.1" # 1.0.2 requires Rust >= 1.46.0 [[bench]] name = "bench" harness = false fancy-regex-0.7.1/Cargo.toml.orig000064400000000000000000000016650000000000000147150ustar 00000000000000[package] name = "fancy-regex" version = "0.7.1" # remember to update html_root_url authors = ["Raph Levien ", "Robin Stocker "] edition = "2018" license = "MIT" description = "An implementation of regexes, supporting a relatively rich set of features, including backreferences and look-around." readme = "README.md" repository = "https://github.com/fancy-regex/fancy-regex" documentation = "https://docs.rs/fancy-regex" categories = ["text-processing"] [features] # Enable #[track_caller] in unit tests. track_caller = [] [dependencies] regex = "1.2" # when we go to >= 1.3.8, we can get rid of the `contains_empty` workaround, see https://github.com/rust-lang/regex/blob/master/CHANGELOG.md#138-2020-05-28 bit-set = "0.5" [dev-dependencies] criterion = "= 0.3.4" # 0.3.5 requires Rust >= 1.46.0 matches = "0.1.8" quickcheck = "= 1.0.1" # 1.0.2 requires Rust >= 1.46.0 [[bench]] name = "bench" harness = false fancy-regex-0.7.1/LICENSE000064400000000000000000000020710000000000000130230ustar 00000000000000The MIT License Copyright 2015 The Fancy Regex Authors. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. fancy-regex-0.7.1/PERFORMANCE.md000064400000000000000000000056430000000000000141110ustar 00000000000000The README has a quick introduction to the performance of this crate. This will look at some examples and compare them to the Oniguruma engine. ## Catastrophic backtracking Backtracking engines can have worst-case performance when the regular expression forces the engine to consider an exponentially increasing number of sub-cases. For a good explanation of that, read [Runaway Regular Expressions: Catastrophic Backtracking][]. Let's look at the regex from the README again: (a|b|ab)*bc And the input text: ababababababababababababababababababababababababababababac Python's engine has exponential runtime. The regex crate and fancy-regex however have no problem with it. ## Oniguruma [Oniguruma][] implements a backtracking engine. So we'd expect it to have a problem with the above regex too. However, in the above case, it quickly finds that there's no match. 
How is that possible? The answer is that it has optimizations which sometimes help it avoid having to do any matching at all: In the pattern `(a|b|ab)*bc`, you might notice that if the input doesn't contain `bc`, the pattern will never match. Oniguruma detects that and, before it tries to do any matching, tries to find `bc` in the input. But what happens if we add `bc` at the end of the input, like this: ababababababababababababababababababababababababababababacbc Now the optimization doesn't help anymore, and Oniguruma is slow too. ## fancy-regex For `(a|b|ab)*bc` fancy-regex is fast in all cases because it can delegate to the regex crate which matches it in linear runtime. Let's look at another regex, one that makes use of a "fancy" look-ahead: (a|b|ab)*(?=c) When fancy-regex matches it against this input: abababababababababababababababababababababababababababab It's slow! The reason is that `(?=c)` is not supported by the regex crate, so we need to handle it with backtracking. And because `(a|b|ab)*` is before it, we need to do it with backtracking as well. Oniguruma doesn't have a problem with this particular case because its optimization saves it again: It checks if there's a `c` in the input before doing any matching. There's nothing preventing fancy-regex from adding similar optimizations in the future, but it's not done yet. Note that how much fancy-regex can do without backtracking depends on the structure of the regex. For example, with `(?=(a|b|ab)*bc)`, the inner part of the look-ahead can be delegated to regex entirely. ### Summary * If the regex doesn't use fancy features, fancy-regex should have linear runtime compared to Oniguruma's exponential worst-case. * Even if the regex doesn't use any fancy features, Oniguruma can be faster because it is a mature and highly optimized engine. * With fancy features, Oniguruma can be faster because of optimizations. 
[Runaway Regular Expressions: Catastrophic Backtracking]: https://www.regular-expressions.info/catastrophic.html [Oniguruma]: https://github.com/kkos/oniguruma fancy-regex-0.7.1/README.md000064400000000000000000000145400000000000000133010ustar 00000000000000# fancy-regex A Rust library for compiling and matching regular expressions. It uses a hybrid regex implementation designed to support a relatively rich set of features. In particular, it uses backtracking to implement "fancy" features such as look-around and backtracking, which are not supported in purely NFA-based implementations (exemplified by [RE2](https://github.com/google/re2), and implemented in Rust in the [regex](https://crates.io/crates/regex) crate). [![docs](https://docs.rs/fancy-regex/badge.svg)](https://docs.rs/fancy-regex) [![crate](https://img.shields.io/crates/v/fancy-regex.svg)](https://crates.io/crates/fancy-regex) [![ci](https://github.com/fancy-regex/fancy-regex/workflows/ci/badge.svg)](https://github.com/fancy-regex/fancy-regex/actions?query=workflow%3Aci) [![codecov](https://codecov.io/gh/fancy-regex/fancy-regex/branch/main/graph/badge.svg)](https://codecov.io/gh/fancy-regex/fancy-regex) A goal is to be as efficient as possible. For a given regex, the NFA implementation has asymptotic running time linear in the length of the input, while in the general case a backtracking implementation has exponential blowup. An example given in [Static Analysis for Regular Expression Exponential Runtime via Substructural Logics](https://www.cs.bham.ac.uk/~hxt/research/redos_full.pdf) is: ```python import re re.compile('(a|b|ab)*bc').match('ab' * 28 + 'ac') ``` In Python (tested on both 2.7 and 3.5), this match takes 91s, and doubles for each additional repeat of 'ab'. Thus, many proponents [advocate](https://swtch.com/~rsc/regexp/regexp1.html) a purely NFA (nondeterministic finite automaton) based approach. 
Even so, backreferences and look-around do add richness to regexes, and they are commonly used in applications such as syntax highlighting for text editors. In particular, TextMate's [syntax definitions](https://manual.macromates.com/en/language_grammars), based on the [Oniguruma](https://github.com/kkos/oniguruma) backtracking engine, are now used in a number of other popular editors, including Sublime Text and Atom. These syntax definitions routinely use backreferences and look-around. For example, the following regex captures a single-line Rust raw string: ``` r(#*)".*?"\1 ``` There is no NFA that can express this simple and useful pattern. Yet, a backtracking implementation handles it efficiently. This package is one of the first that handles both cases well. The exponential blowup case above is run in 258ns. Thus, it should be a very appealing alternative for applications that require both richness and performance. ## A warning about worst-case performance NFA-based approaches give strong guarantees about worst-case performance. For regexes that contain "fancy" features such as backreferences and look-around, this module gives no corresponding guarantee. If an attacker can control the regular expressions that will be matched against, they will be able to successfully mount a denial-of-service attack. Be warned. See [PERFORMANCE.md](PERFORMANCE.md) for some examples. ## A hybrid approach One workable approach is to detect the presence of "fancy" features, and choose either an NFA implementation or a backtracker depending on whether they are used. However, this module attempts to be more fine-grained. Instead, it implements a true hybrid approach. 
In essence, it is a backtracking VM (as well explained in [Regular Expression Matching: the Virtual Machine Approach](https://swtch.com/~rsc/regexp/regexp2.html)) in which one of the "instructions" in the VM delegates to an inner NFA implementation (in Rust, the regex crate, though a similar approach would certainly be possible using RE2 or the Go [regexp](https://golang.org/pkg/regexp/) package). Then there's an analysis which decides for each subexpression whether it is "hard", or can be delegated to the NFA matcher. At the moment, it is eager, and delegates as much as possible to the NFA engine. ## Theory **(This section is written in a somewhat informal style; I hope to expand on it)** The fundamental idea is that it's a backtracking VM like PCRE, but as much as possible it delegates to an "inner" RE engine like RE2 (in this case, the Rust one). For the sublanguage not using fancy features, the library becomes a thin wrapper. Otherwise, you do an analysis to figure out what you can delegate and what you have to backtrack. I was thinking it might be tricky, but it's actually quite simple. The first phase, you just label each subexpression as "hard" (groups that get referenced in a backref, look-around, etc), and bubble that up. You also do a little extra analysis, mostly determining whether an expression has constant match length, and the minimum length. The second phase is top down, and you carry a context, also a boolean indicating whether it's "hard" or not. Intuitively, a hard context is one in which the match length will affect future backtracking. If the subexpression is easy and the context is easy, generate an instruction in the VM that delegates to the inner NFA implementation. Otherwise, generate VM code as in a backtracking engine. Most expression nodes are pretty straightforward; the only interesting case is concat (a sequence of subexpressions). Even that one is not terribly complex. 
First, determine a prefix of easy nodes of constant match length (this won't affect backtracking, so safe to delegate to NFA). Then, if your context is easy, determine a suffix of easy nodes. Both of these delegate to NFA. For the ones in between, recursively compile. In an easy context, the last of these also gets an easy context; everything else is generated in a hard context. So, conceptually, hard context flows from right to left, and from parents to children. ## Current status Still in development, though the basic ideas are in place. Currently, the following features are missing: * Procedure calls and recursive expressions ## Acknowledgements Many thanks to [Andrew Gallant](http://blog.burntsushi.net/about/) for stimulating conversations that inspired this approach, as well as for creating the excellent regex crate. ## Authors The main author is Raph Levien, with many contributions from Robin Stocker. ## Contributions We gladly accept contributions via GitHub pull requests. Please see [CONTRIBUTING.md](CONTRIBUTING.md) for more details. This project started out as a Google 20% project, but none of the authors currently work at Google so it has been forked to be community-maintained. fancy-regex-0.7.1/benches/bench.rs000064400000000000000000000075520000000000000150630ustar 00000000000000// Copyright 2016 The Fancy Regex Authors. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #[macro_use] extern crate criterion; use criterion::Criterion; use std::time::Duration; use fancy_regex::internal::{analyze, compile, run_default}; use fancy_regex::Expr; use regex::Regex; fn parse_lifetime_re(c: &mut Criterion) { c.bench_function("parse_lifetime_re", |b| { b.iter(|| Expr::parse_tree("\\'[a-zA-Z_][a-zA-Z0-9_]*(?!\\')\\b").unwrap()) }); } fn parse_literal_re(c: &mut Criterion) { c.bench_function("parse_literal_re", |b| { b.iter(|| Expr::parse_tree("^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})").unwrap()) }); } fn parse_literal_re_regex(c: &mut Criterion) { c.bench_function("parse_literal_re_regex", |b| { b.iter(|| Regex::new("^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})").unwrap()) }); } fn parse_misc(c: &mut Criterion) { c.bench_function("parse_misc", |b| { b.iter(|| Expr::parse_tree("^\\p{L}|\\p{N}|\\s|.|\\d").unwrap()) }); } fn analyze_literal_re(c: &mut Criterion) { let re = "^\\\\([!-/:-@\\[-`\\{-~aftnrv]|[0-7]{1,3}|x[0-9a-fA-F]{2}|x\\{[0-9a-fA-F]{1,6}\\})"; let tree = Expr::parse_tree(re).unwrap(); c.bench_function("analyze_literal_re", |b| b.iter(|| analyze(&tree).unwrap())); } fn run_backtrack(c: &mut Criterion) { let tree = Expr::parse_tree("^.*?(([ab]+)\\1b)").unwrap(); let a = analyze(&tree).unwrap(); let p = compile(&a).unwrap(); c.bench_function("run_backtrack", |b| { b.iter(|| run_default(&p, "babab", 0).unwrap()) }); } // The following regex is a pathological case for backtracking // 
implementations, see README.md: fn run_tricky(c: &mut Criterion) { let tree = Expr::parse_tree("(a|b|ab)*bc").unwrap(); let a = analyze(&tree).unwrap(); let p = compile(&a).unwrap(); let mut s = String::new(); for _ in 0..28 { s.push_str("ab"); } s.push_str("ac"); c.bench_function("run_tricky", |b| b.iter(|| run_default(&p, &s, 0).unwrap())); } fn run_backtrack_limit(c: &mut Criterion) { let tree = Expr::parse_tree("(?i)(a|b|ab)*(?=c)").unwrap(); let a = analyze(&tree).unwrap(); let p = compile(&a).unwrap(); let s = "abababababababababababababababababababababababababababab"; c.bench_function("run_backtrack_limit", |b| { b.iter(|| run_default(&p, &s, 0).unwrap_err()) }); } criterion_group!( name = benches; config = Criterion::default().warm_up_time(Duration::from_secs(10)); targets = parse_lifetime_re, parse_literal_re, parse_literal_re_regex, parse_misc, analyze_literal_re, run_backtrack, run_tricky, ); criterion_group!( name = slow_benches; config = Criterion::default().sample_size(10); targets = run_backtrack_limit, ); criterion_main!(benches, slow_benches); fancy-regex-0.7.1/codecov.yml000064400000000000000000000001100000000000000141530ustar 00000000000000# Make codecov not add verbose comments to pull requests comment: false fancy-regex-0.7.1/examples/toy.rs000064400000000000000000000113640000000000000150220ustar 00000000000000// Copyright 2016 The Fancy Regex Authors. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. //! A simple test app for exercising and debugging the regex engine. use fancy_regex::internal::{analyze, compile, run_trace, Insn, Prog}; use fancy_regex::*; use std::env; use std::str::FromStr; fn main() { let mut args = env::args().skip(1); if let Some(cmd) = args.next() { if cmd == "parse" { if let Some(re) = args.next() { let e = Expr::parse_tree(&re); println!("{:#?}", e); } } else if cmd == "analyze" { if let Some(re) = args.next() { let tree = Expr::parse_tree(&re).unwrap(); let a = analyze(&tree); println!("{:#?}", a); } } else if cmd == "compile" { if let Some(re) = args.next() { let r = Regex::new(&re).unwrap(); r.debug_print(); } } else if cmd == "run" { let re = args.next().expect("expected regexp argument"); let r = Regex::new(&re).unwrap(); let text = args.next().expect("expected text argument"); let mut pos = 0; if let Some(pos_str) = args.next() { pos = usize::from_str(&pos_str).unwrap(); } if let Some(caps) = r.captures_from_pos(&text, pos).unwrap() { print!("captures:"); for i in 0..caps.len() { print!(" {}:", i); if let Some(m) = caps.get(i) { print!("[{}..{}] \"{}\"", m.start(), m.end(), m.as_str()); } else { print!("_"); } } println!(""); for cap in caps.iter() { println!("iterate {:?}", cap); } } else { println!("no match"); } } else if cmd == "trace" { if let Some(re) = args.next() { let prog = prog(&re); if let Some(s) = args.next() { run_trace(&prog, &s, 0).unwrap(); } } } else if cmd == "trace-inner" { if let Some(re) = args.next() { let tree = 
Expr::parse_tree(&re).unwrap(); let a = analyze(&tree).unwrap(); let p = compile(&a).unwrap(); if let Some(s) = args.next() { run_trace(&p, &s, 0).unwrap(); } } } else if cmd == "graph" { let re = args.next().expect("expected regexp argument"); graph(&re); } else { println!("commands: parse|analyze|compile|graph , run|trace|trace-inner "); } } } fn graph(re: &str) { let prog = prog(re); println!("digraph G {{"); for (i, insn) in prog.body.iter().enumerate() { let label = format!("{:?}", insn) .replace(r#"\"#, r#"\\"#) .replace(r#"""#, r#"\""#); println!(r#"{:3} [label="{}: {}"];"#, i, i, label); match *insn { Insn::Split(a, b) => { println!("{:3} -> {};", i, a); println!("{:3} -> {};", i, b); } Insn::Jmp(target) => { println!("{:3} -> {};", i, target); } Insn::End => {} _ => { println!("{:3} -> {};", i, i + 1); } } } println!("}}"); } fn prog(re: &str) -> Prog { let tree = Expr::parse_tree(re).expect("Expected parsing regex to work"); let result = analyze(&tree).expect("Expected analyze to succeed"); compile(&result).expect("Expected compile to succeed") } fancy-regex-0.7.1/src/analyze.rs000064400000000000000000000223320000000000000146200ustar 00000000000000// Copyright 2016 The Fancy Regex Authors. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. //! Analysis of regex expressions. use bit_set::BitSet; use std::cmp::min; use std::usize; use crate::parse::{ExprTree, NamedGroups}; use crate::Error; use crate::Expr; use crate::Result; #[derive(Debug)] pub struct Info<'a> { pub(crate) start_group: usize, pub(crate) end_group: usize, pub(crate) min_size: usize, pub(crate) const_size: bool, pub(crate) hard: bool, /// Whether the expression's matching could be dependent on what the /// previous character was. E.g. `^` matches if there's no previous /// character; `(?m:^)` matches if the previous character was a newline. /// The matching of `\b` depends on the previous character. pub(crate) looks_left: bool, pub(crate) expr: &'a Expr, pub(crate) children: Vec>, } impl<'a> Info<'a> { pub(crate) fn is_literal(&self) -> bool { match *self.expr { Expr::Literal { casei, .. } => !casei, Expr::Concat(_) => self.children.iter().all(|child| child.is_literal()), _ => false, } } pub(crate) fn push_literal(&self, buf: &mut String) { match *self.expr { // could be more paranoid about checking casei Expr::Literal { ref val, .. 
} => buf.push_str(val), Expr::Concat(_) => { for child in &self.children { child.push_literal(buf); } } _ => panic!("push_literal called on non-literal"), } } } struct Analyzer<'a> { backrefs: &'a BitSet, group_ix: usize, group_names: &'a NamedGroups, } impl<'a> Analyzer<'a> { fn visit(&mut self, expr: &'a Expr) -> Result> { let start_group = self.group_ix; let mut children = Vec::new(); let mut min_size = 0; let mut const_size = false; let mut hard = false; let mut looks_left = false; match *expr { Expr::Empty | Expr::EndText | Expr::EndLine => { const_size = true; } Expr::Any { .. } => { min_size = 1; const_size = true; } Expr::Literal { ref val, casei } => { // right now each character in a literal gets its own node, that might change min_size = 1; const_size = literal_const_size(val, casei); } Expr::StartText | Expr::StartLine => { const_size = true; looks_left = true; } Expr::Concat(ref v) => { const_size = true; for child in v { let child_info = self.visit(child)?; looks_left |= child_info.looks_left && min_size == 0; min_size += child_info.min_size; const_size &= child_info.const_size; hard |= child_info.hard; children.push(child_info); } } Expr::Alt(ref v) => { let child_info = self.visit(&v[0])?; min_size = child_info.min_size; const_size = child_info.const_size; hard = child_info.hard; looks_left = child_info.looks_left; children.push(child_info); for child in &v[1..] { let child_info = self.visit(child)?; const_size &= child_info.const_size && min_size == child_info.min_size; min_size = min(min_size, child_info.min_size); hard |= child_info.hard; looks_left |= child_info.looks_left; children.push(child_info); } } Expr::Group(ref child) => { let group = self.group_ix; self.group_ix += 1; let child_info = self.visit(child)?; min_size = child_info.min_size; const_size = child_info.const_size; looks_left = child_info.looks_left; // If there's a backref to this group, we potentially have to backtrack within the // group. E.g. 
with `(x|xy)\1` and input `xyxy`, `x` matches but then the backref // doesn't, so we have to backtrack and try `xy`. hard = child_info.hard | self.backrefs.contains(group); children.push(child_info); } Expr::LookAround(ref child, _) => { let child_info = self.visit(child)?; // min_size = 0 const_size = true; hard = true; looks_left = child_info.looks_left; children.push(child_info); } Expr::Repeat { ref child, lo, hi, .. } => { let child_info = self.visit(child)?; min_size = child_info.min_size * lo; const_size = child_info.const_size && lo == hi; hard = child_info.hard; looks_left = child_info.looks_left; children.push(child_info); } Expr::Delegate { size, .. } => { // currently only used for empty and single-char matches min_size = size; const_size = true; looks_left = size == 0; // TODO: conservative for \z } Expr::Backref(group) => { if group >= self.group_ix { return Err(Error::InvalidBackref); } hard = true; } Expr::NamedBackref(ref name) => { if !self.group_names.contains_key(name) { return Err(Error::InvalidBackref); } hard = true; } Expr::AtomicGroup(ref child) => { let child_info = self.visit(child)?; min_size = child_info.min_size; const_size = child_info.const_size; looks_left = child_info.looks_left; hard = true; // TODO: possibly could weaken children.push(child_info); } }; Ok(Info { expr, children, start_group, end_group: self.group_ix, min_size, const_size, hard, looks_left, }) } } fn literal_const_size(_: &str, _: bool) -> bool { // Right now, regex doesn't do sophisticated case folding, // test below will fail when that changes, then we need to // do something fancier here. true } /// Analyze the parsed expression to determine whether it requires fancy features. 
pub fn analyze<'a>(tree: &'a ExprTree) -> Result> { let mut analyzer = Analyzer { backrefs: &tree.backrefs, group_ix: 0, group_names: &tree.named_groups, }; analyzer.visit(&tree.expr) } #[cfg(test)] mod tests { use super::analyze; use super::literal_const_size; use crate::Expr; use regex; #[test] fn case_folding_safe() { let re = regex::Regex::new("(?i:ß)").unwrap(); if re.is_match("SS") { assert!(!literal_const_size("ß", true)); } // Another tricky example, Armenian ECH YIWN let re = regex::Regex::new("(?i:\\x{0587})").unwrap(); if re.is_match("\u{0565}\u{0582}") { assert!(!literal_const_size("\u{0587}", true)); } } #[test] fn invalid_backref_1() { assert!(analyze(&Expr::parse_tree(".\\0").unwrap()).is_err()); } #[test] fn invalid_backref_2() { assert!(analyze(&Expr::parse_tree("(.\\1)").unwrap()).is_err()); } #[test] fn invalid_backref_3() { assert!(analyze(&Expr::parse_tree("\\1(.)").unwrap()).is_err()); } #[test] fn is_literal() { let tree = Expr::parse_tree("abc").unwrap(); let info = analyze(&tree).unwrap(); assert_eq!(info.is_literal(), true); } #[test] fn is_literal_with_repeat() { let tree = Expr::parse_tree("abc*").unwrap(); let info = analyze(&tree).unwrap(); assert_eq!(info.is_literal(), false); } } fancy-regex-0.7.1/src/compile.rs000064400000000000000000000532450000000000000146140ustar 00000000000000// Copyright 2016 The Fancy Regex Authors. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. //! Compilation of regexes to VM. use std::usize; use crate::analyze::Info; use crate::vm::{Insn, Prog}; use crate::Error; use crate::Expr; use crate::LookAround; use crate::LookAround::*; use crate::RegexOptions; use crate::Result; // I'm thinking it probably doesn't make a lot of sense having this split // out from Compiler. struct VMBuilder { prog: Vec, n_saves: usize, } impl VMBuilder { fn new(max_group: usize) -> VMBuilder { VMBuilder { prog: Vec::new(), n_saves: max_group * 2, } } fn build(self) -> Prog { Prog::new(self.prog, self.n_saves) } fn newsave(&mut self) -> usize { let result = self.n_saves; self.n_saves += 1; result } fn pc(&self) -> usize { self.prog.len() } // would "emit" be a better name? fn add(&mut self, insn: Insn) { self.prog.push(insn); } fn set_jmp_target(&mut self, jmp_pc: usize, target: usize) { match self.prog[jmp_pc] { Insn::Jmp(ref mut next) => *next = target, _ => panic!("mutating instruction other than Jmp"), } } fn set_split_target(&mut self, split_pc: usize, target: usize, second: bool) { match self.prog[split_pc] { Insn::Split(_, ref mut y) if second => *y = target, Insn::Split(ref mut x, _) => *x = target, _ => panic!("mutating instruction other than Split"), } } fn set_repeat_target(&mut self, repeat_pc: usize, target: usize) { match self.prog[repeat_pc] { Insn::RepeatGr { ref mut next, .. } | Insn::RepeatNg { ref mut next, .. } | Insn::RepeatEpsilonGr { ref mut next, .. } | Insn::RepeatEpsilonNg { ref mut next, .. 
} => *next = target, _ => panic!("mutating instruction other than Repeat"), } } } struct Compiler { b: VMBuilder, options: RegexOptions, } impl Compiler { fn new(max_group: usize) -> Compiler { Compiler { b: VMBuilder::new(max_group), options: Default::default(), } } fn visit(&mut self, info: &Info<'_>, hard: bool) -> Result<()> { if !hard && !info.hard { // easy case, delegate entire subexpr return self.compile_delegate(info); } match *info.expr { Expr::Empty => (), Expr::Literal { ref val, casei } => { if !casei { self.b.add(Insn::Lit(val.clone())); } else { self.compile_delegate(info)?; } } Expr::Any { newline: true } => { self.b.add(Insn::Any); } Expr::Any { newline: false } => { self.b.add(Insn::AnyNoNL); } Expr::Concat(_) => { self.compile_concat(info, hard)?; } Expr::Alt(_) => { let count = info.children.len(); self.compile_alt(count, |compiler, i| compiler.visit(&info.children[i], hard))?; } Expr::Group(_) => { let group = info.start_group; self.b.add(Insn::Save(group * 2)); self.visit(&info.children[0], hard)?; self.b.add(Insn::Save(group * 2 + 1)); } Expr::Repeat { lo, hi, greedy, .. } => { self.compile_repeat(info, lo, hi, greedy, hard)?; } Expr::LookAround(_, la) => { self.compile_lookaround(info, la)?; } Expr::Backref(group) => { self.b.add(Insn::Backref(group * 2)); } Expr::AtomicGroup(_) => { // TODO optimization: atomic insns are not needed if the // child doesn't do any backtracking. self.b.add(Insn::BeginAtomic); self.visit(&info.children[0], false)?; self.b.add(Insn::EndAtomic); } Expr::Delegate { .. 
} | Expr::StartText | Expr::EndText | Expr::StartLine | Expr::EndLine => { // TODO: might want to have more specialized impls self.compile_delegate(info)?; } Expr::NamedBackref(_) => { unreachable!("named backrefs should have been eliminated"); } } Ok(()) } fn compile_alt(&mut self, count: usize, mut handle_alternative: F) -> Result<()> where F: FnMut(&mut Compiler, usize) -> Result<()>, { let mut jmps = Vec::new(); let mut last_pc = usize::MAX; for i in 0..count { let has_next = i != count - 1; let pc = self.b.pc(); if has_next { self.b.add(Insn::Split(pc + 1, usize::MAX)); } if last_pc != usize::MAX { self.b.set_split_target(last_pc, pc, true); } last_pc = pc; handle_alternative(self, i)?; if has_next { // All except the last branch need to jump over instructions of // other branches. The last branch can just continue to the next // instruction. let pc = self.b.pc(); jmps.push(pc); self.b.add(Insn::Jmp(0)); } } let next_pc = self.b.pc(); for jmp_pc in jmps { self.b.set_jmp_target(jmp_pc, next_pc); } Ok(()) } fn compile_concat(&mut self, info: &Info<'_>, hard: bool) -> Result<()> { // First: determine a prefix which is constant size and not hard. let prefix_end = info .children .iter() .take_while(|c| c.const_size && !c.hard) .count(); // If incoming difficulty is not hard, the suffix after the last // hard child can be done with NFA. let suffix_len = if !hard { info.children[prefix_end..] .iter() .rev() .take_while(|c| !c.hard) .count() } else { // Even for hard, we can delegate a const-sized suffix info.children[prefix_end..] 
.iter() .rev() .take_while(|c| c.const_size && !c.hard) .count() }; let suffix_begin = info.children.len() - suffix_len; self.compile_delegates(&info.children[..prefix_end])?; for child in info.children[prefix_end..suffix_begin].iter() { self.visit(child, true)?; } self.compile_delegates(&info.children[suffix_begin..]) } fn compile_repeat( &mut self, info: &Info<'_>, lo: usize, hi: usize, greedy: bool, hard: bool, ) -> Result<()> { let child = &info.children[0]; if lo == 0 && hi == 1 { // e? let pc = self.b.pc(); self.b.add(Insn::Split(pc + 1, pc + 1)); // TODO: do we want to do an epsilon check here? If we do // it here and in Alt, we might be able to make a good // bound on stack depth self.visit(child, hard)?; let next_pc = self.b.pc(); self.b.set_split_target(pc, next_pc, greedy); return Ok(()); } let hard = hard | info.hard; if hi == usize::MAX && child.min_size == 0 { // Use RepeatEpsilon instructions to prevent empty repeat let repeat = self.b.newsave(); let check = self.b.newsave(); self.b.add(Insn::Save0(repeat)); let pc = self.b.pc(); if greedy { self.b.add(Insn::RepeatEpsilonGr { lo, next: usize::MAX, repeat, check, }); } else { self.b.add(Insn::RepeatEpsilonNg { lo, next: usize::MAX, repeat, check, }); } self.visit(child, hard)?; self.b.add(Insn::Jmp(pc)); let next_pc = self.b.pc(); self.b.set_repeat_target(pc, next_pc); } else if lo == 0 && hi == usize::MAX { // e* let pc = self.b.pc(); self.b.add(Insn::Split(pc + 1, pc + 1)); self.visit(child, hard)?; self.b.add(Insn::Jmp(pc)); let next_pc = self.b.pc(); self.b.set_split_target(pc, next_pc, greedy); } else if lo == 1 && hi == usize::MAX { // e+ let pc = self.b.pc(); self.visit(child, hard)?; let next = self.b.pc() + 1; let (x, y) = if greedy { (pc, next) } else { (next, pc) }; self.b.add(Insn::Split(x, y)); } else { let repeat = self.b.newsave(); self.b.add(Insn::Save0(repeat)); let pc = self.b.pc(); if greedy { self.b.add(Insn::RepeatGr { lo, hi, next: usize::MAX, repeat, }); } else { 
self.b.add(Insn::RepeatNg { lo, hi, next: usize::MAX, repeat, }); } self.visit(child, hard)?; self.b.add(Insn::Jmp(pc)); let next_pc = self.b.pc(); self.b.set_repeat_target(pc, next_pc); } Ok(()) } fn compile_lookaround(&mut self, info: &Info<'_>, la: LookAround) -> Result<()> { let inner = &info.children[0]; match la { LookBehind => { if let Info { const_size: false, expr: &Expr::Alt(_), .. } = inner { // Make const size by transforming `(?<=a|bb)` to `(?<=a)|(?<=bb)` let alternatives = &inner.children; self.compile_alt(alternatives.len(), |compiler, i| { let alternative = &alternatives[i]; compiler.compile_positive_lookaround(alternative, la) }) } else { self.compile_positive_lookaround(inner, la) } } LookBehindNeg => { if let Info { const_size: false, expr: &Expr::Alt(_), .. } = inner { // Make const size by transforming `(? self.compile_positive_lookaround(inner, la), LookAheadNeg => self.compile_negative_lookaround(inner, la), } } fn compile_positive_lookaround(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> { let save = self.b.newsave(); self.b.add(Insn::Save(save)); self.compile_lookaround_inner(inner, la)?; self.b.add(Insn::Restore(save)); Ok(()) } fn compile_negative_lookaround(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> { let pc = self.b.pc(); self.b.add(Insn::Split(pc + 1, usize::MAX)); self.compile_lookaround_inner(inner, la)?; self.b.add(Insn::FailNegativeLookAround); let next_pc = self.b.pc(); self.b.set_split_target(pc, next_pc, true); Ok(()) } fn compile_lookaround_inner(&mut self, inner: &Info<'_>, la: LookAround) -> Result<()> { if la == LookBehind || la == LookBehindNeg { if !inner.const_size { return Err(Error::LookBehindNotConst); } self.b.add(Insn::GoBack(inner.min_size)); } self.visit(inner, false) } fn compile_delegates(&mut self, infos: &[Info<'_>]) -> Result<()> { if infos.is_empty() { return Ok(()); } // TODO: might want to do something similar for case insensitive literals // (have is_literal return an 
additional bool for casei) if infos.iter().all(|e| e.is_literal()) { let mut val = String::new(); for info in infos { info.push_literal(&mut val); } self.b.add(Insn::Lit(val)); return Ok(()); } let mut delegate_builder = DelegateBuilder::new(); for info in infos { delegate_builder.push(info); } let delegate = delegate_builder.build(&self.options)?; self.b.add(delegate); Ok(()) } fn compile_delegate(&mut self, info: &Info) -> Result<()> { let insn = if info.is_literal() { let mut val = String::new(); info.push_literal(&mut val); Insn::Lit(val) } else { DelegateBuilder::new().push(info).build(&self.options)? }; self.b.add(insn); Ok(()) } } pub(crate) fn compile_inner(inner_re: &str, options: &RegexOptions) -> Result { let mut builder = regex::RegexBuilder::new(inner_re); if let Some(size_limit) = options.delegate_size_limit { builder.size_limit(size_limit); } if let Some(dfa_size_limit) = options.delegate_dfa_size_limit { builder.dfa_size_limit(dfa_size_limit); } builder.build().map_err(Error::InnerError) } /// Compile the analyzed expressions into a program. pub fn compile(info: &Info<'_>) -> Result { let mut c = Compiler::new(info.end_group); c.visit(info, false)?; c.b.add(Insn::End); Ok(c.b.build()) } struct DelegateBuilder { re: String, min_size: usize, const_size: bool, looks_left: bool, start_group: Option, end_group: usize, } impl DelegateBuilder { fn new() -> Self { Self { re: "^".to_string(), min_size: 0, const_size: true, looks_left: false, start_group: None, end_group: 0, } } fn push(&mut self, info: &Info<'_>) -> &mut DelegateBuilder { // TODO: might want to detect case of a group with no captures // inside, so we can run find() instead of captures() self.looks_left |= info.looks_left && self.min_size == 0; self.min_size += info.min_size; self.const_size &= info.const_size; if self.start_group.is_none() { self.start_group = Some(info.start_group); } self.end_group = info.end_group; // Add expression. 
The precedence argument has to be 1 here to // ensure correct grouping in these cases: // // If we have multiple expressions, we are building a concat. // Without grouping, we'd turn ["a", "b|c"] into "^ab|c". But we // want "^a(?:b|c)". // // Even with a single expression, because we add `^` at the // beginning, we need a group. Otherwise `["a|b"]` would be turned // into `"^a|b"` instead of `"^(?:a|b)"`. info.expr.to_str(&mut self.re, 1); self } fn build(&self, options: &RegexOptions) -> Result { let start_group = self.start_group.expect("Expected at least one expression"); let end_group = self.end_group; let compiled = compile_inner(&self.re, options)?; if self.looks_left { // The "s" flag is for allowing `.` to match `\n` let inner1 = ["^(?s:.)", &self.re[1..]].concat(); let compiled1 = compile_inner(&inner1, options)?; Ok(Insn::Delegate { inner: Box::new(compiled), inner1: Some(Box::new(compiled1)), start_group, end_group, }) } else if self.const_size && start_group == end_group { let size = self.min_size; Ok(Insn::DelegateSized(Box::new(compiled), size)) } else { Ok(Insn::Delegate { inner: Box::new(compiled), inner1: None, start_group, end_group, }) } } } #[cfg(test)] mod tests { use super::*; use crate::analyze::analyze; use crate::parse::ExprTree; use crate::vm::Insn::*; use bit_set::BitSet; use matches::assert_matches; #[test] fn jumps_for_alternation() { let tree = ExprTree { expr: Expr::Alt(vec![ Expr::Literal { val: "a".into(), casei: false, }, Expr::Literal { val: "b".into(), casei: false, }, Expr::Literal { val: "c".into(), casei: false, }, ]), backrefs: BitSet::new(), named_groups: Default::default(), }; let info = analyze(&tree).unwrap(); let mut c = Compiler::new(0); // Force "hard" so that compiler doesn't just delegate c.visit(&info, true).unwrap(); c.b.add(Insn::End); let prog = c.b.prog; assert_eq!(prog.len(), 8, "prog: {:?}", prog); assert_matches!(prog[0], Split(1, 3)); assert_matches!(prog[1], Lit(ref l) if l == "a"); 
assert_matches!(prog[2], Jmp(7)); assert_matches!(prog[3], Split(4, 6)); assert_matches!(prog[4], Lit(ref l) if l == "b"); assert_matches!(prog[5], Jmp(7)); assert_matches!(prog[6], Lit(ref l) if l == "c"); assert_matches!(prog[7], End); } #[test] fn look_around_pattern_can_be_delegated() { let prog = compile_prog("(?=ab*)c"); assert_eq!(prog.len(), 5, "prog: {:?}", prog); assert_matches!(prog[0], Save(0)); assert_delegate(&prog[1], "^ab*"); assert_matches!(prog[2], Restore(0)); assert_matches!(prog[3], Lit(ref l) if l == "c"); assert_matches!(prog[4], End); } #[test] fn easy_concat_can_delegate_end() { let prog = compile_prog("(?!x)(?:a|ab)x*"); assert_eq!(prog.len(), 5, "prog: {:?}", prog); assert_matches!(prog[0], Split(1, 3)); assert_matches!(prog[1], Lit(ref l) if l == "x"); assert_matches!(prog[2], FailNegativeLookAround); assert_delegate(&prog[3], "^(?:a|ab)x*"); assert_matches!(prog[4], End); } #[test] fn hard_concat_can_delegate_const_size_end() { let prog = compile_prog("(?:(?!x)(?:a|b)c)x*"); assert_eq!(prog.len(), 6, "prog: {:?}", prog); assert_matches!(prog[0], Split(1, 3)); assert_matches!(prog[1], Lit(ref l) if l == "x"); assert_matches!(prog[2], FailNegativeLookAround); assert_delegate_sized(&prog[3], "^(?:a|b)c"); assert_delegate(&prog[4], "^x*"); assert_matches!(prog[5], End); } #[test] fn hard_concat_can_not_delegate_variable_end() { let prog = compile_prog("(?:(?!x)(?:a|ab))x*"); assert_eq!(prog.len(), 9, "prog: {:?}", prog); assert_matches!(prog[0], Split(1, 3)); assert_matches!(prog[1], Lit(ref l) if l == "x"); assert_matches!(prog[2], FailNegativeLookAround); assert_matches!(prog[3], Split(4, 6)); assert_matches!(prog[4], Lit(ref l) if l == "a"); assert_matches!(prog[5], Jmp(7)); assert_matches!(prog[6], Lit(ref l) if l == "ab"); assert_delegate(&prog[7], "^x*"); assert_matches!(prog[8], End); } fn compile_prog(re: &str) -> Vec { let tree = Expr::parse_tree(re).unwrap(); let info = analyze(&tree).unwrap(); let prog = compile(&info).unwrap(); 
prog.body } fn assert_delegate(insn: &Insn, re: &str) { match insn { Insn::Delegate { inner, .. } => { assert_eq!(inner.as_str(), re); } _ => { panic!("Expected Insn::Delegate but was {:#?}", insn); } } } fn assert_delegate_sized(insn: &Insn, re: &str) { match insn { Insn::DelegateSized(inner, ..) => { assert_eq!(inner.as_str(), re); } _ => { panic!("Expected Insn::DelegateSized but was {:#?}", insn); } } } } fancy-regex-0.7.1/src/error.rs000064400000000000000000000102530000000000000143050ustar 00000000000000use std::fmt; /// Result type for this crate with specific error enum. pub type Result = ::std::result::Result; /// An error for the result of compiling or running a regex. #[derive(Debug)] pub enum Error { // Compile time errors /// General parsing error ParseError, /// Opening parenthesis without closing parenthesis, e.g. `(a|b` UnclosedOpenParen, /// Invalid repeat syntax InvalidRepeat, /// Pattern too deeply nested RecursionExceeded, /// Look-behind assertion without constant size LookBehindNotConst, /// Backslash without following character TrailingBackslash, /// Invalid escape InvalidEscape(String), /// Unicode escape not closed UnclosedUnicodeName, /// Invalid hex escape InvalidHex, /// Invalid codepoint for hex or unicode escape InvalidCodepointValue, /// Invalid character class InvalidClass, /// Unknown group flag UnknownFlag(String), /// Disabling Unicode not supported NonUnicodeUnsupported, /// Invalid back reference InvalidBackref, /// Regex crate error InnerError(regex::Error), /// Couldn't parse group name InvalidGroupName, /// Invalid group id in escape sequence InvalidGroupNameBackref(String), /// Once named groups are used you cannot refer to groups by number NamedBackrefOnly, /// Quantifier on lookaround or other zero-width assertion TargetNotRepeatable, // Run time errors /// Max stack size exceeded for backtracking while executing regex. StackOverflow, /// Max limit for backtracking count exceeded while executing the regex. 
/// Configure using /// [`RegexBuilder::backtrack_limit`](struct.RegexBuilder.html#method.backtrack_limit). BacktrackLimitExceeded, /// This enum may grow additional variants, so this makes sure clients don't count on exhaustive /// matching. Otherwise, adding a new variant could break existing code. #[doc(hidden)] __Nonexhaustive, } impl ::std::error::Error for Error {} impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // We should make these more helpful, e.g. by including the parts of the regex that lead to // the error. match self { Error::ParseError => write!(f, "General parsing error"), Error::UnclosedOpenParen => { write!(f, "Opening parenthesis without closing parenthesis") } Error::InvalidRepeat => write!(f, "Invalid repeat syntax"), Error::RecursionExceeded => write!(f, "Pattern too deeply nested"), Error::LookBehindNotConst => write!(f, "Look-behind assertion without constant size"), Error::TrailingBackslash => write!(f, "Backslash without following character"), Error::InvalidEscape(s) => write!(f, "Invalid escape: {}", s), Error::UnclosedUnicodeName => write!(f, "Unicode escape not closed"), Error::InvalidHex => write!(f, "Invalid hex escape"), Error::InvalidCodepointValue => { write!(f, "Invalid codepoint for hex or unicode escape") } Error::InvalidClass => write!(f, "Invalid character class"), Error::UnknownFlag(s) => write!(f, "Unknown group flag: {}", s), Error::NonUnicodeUnsupported => write!(f, "Disabling Unicode not supported"), Error::InvalidBackref => write!(f, "Invalid back reference"), Error::InnerError(e) => write!(f, "Regex error: {}", e), Error::StackOverflow => write!(f, "Max stack size exceeded for backtracking"), Error::BacktrackLimitExceeded => write!(f, "Max limit for backtracking count exceeded"), Error::__Nonexhaustive => unreachable!(), Error::InvalidGroupName => write!(f, "Could not parse group name"), Error::InvalidGroupNameBackref(s) => write!(f, "Invalid group name in back reference: {}", s), 
Error::TargetNotRepeatable => write!(f, "Target of repeat operator is invalid"), Error::NamedBackrefOnly => write!(f, "Numbered backref/call not allowed because named group was used, use a named backref instead"), } } } fancy-regex-0.7.1/src/expand.rs000064400000000000000000000173210000000000000144360ustar 00000000000000use crate::parse::{parse_decimal, parse_id}; use crate::{Captures, Error, Regex}; use std::borrow::Cow; use std::io; use std::mem; /// A set of options for expanding a template string using the contents /// of capture groups. #[derive(Debug)] pub struct Expander { sub_char: char, open: &'static str, close: &'static str, allow_undelimited_name: bool, } impl Default for Expander { /// Returns the default expander used by [`Captures::expand`]. /// /// [`Captures::expand`]: struct.Captures.html#expand fn default() -> Self { Expander { sub_char: '$', open: "{", close: "}", allow_undelimited_name: true, } } } impl Expander { /// Returns an expander that uses Python-compatible syntax. /// /// Expands all instances of `\num` or `\g` in `replacement` /// to the corresponding capture group `num` or `name`, and writes /// them to the `dst` buffer given. /// /// `name` may be an integer corresponding to the index of the /// capture group (counted by order of opening parenthesis where `\0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// /// `num` must be an integer corresponding to the index of the /// capture group. /// /// If `num` or `name` isn't a valid capture group (whether the name doesn't exist /// or isn't a valid index), then it is replaced with the empty string. /// /// The longest possible number is used. e.g., `\10` looks up capture /// group 10 and not capture group 1 followed by a literal 0. /// /// To write a literal `\`, use `\\`. 
pub fn python() -> Expander { Expander { sub_char: '\\', open: "g<", close: ">", allow_undelimited_name: false, } } /// Checks `template` for errors. The following conditions are checked for: /// /// - A reference to a numbered group that does not exist in `regex` /// - A reference to a numbered group (other than 0) when `regex` contains named groups /// - A reference to a named group that does not occur in `regex` /// - An opening group name delimiter without a closing delimiter /// - Using an empty string as a group name pub fn check(&self, template: &str, regex: &Regex) -> crate::Result<()> { let on_group_num = |num| { if num == 0 { Ok(()) } else if !regex.named_groups.is_empty() { Err(Error::NamedBackrefOnly) } else if num < regex.captures_len() { Ok(()) } else { Err(Error::InvalidBackref) } }; self.exec(template, |step| match step { Step::Char(_) => Ok(()), Step::GroupName(name) => { if regex.named_groups.contains_key(name) { Ok(()) } else if let Ok(num) = name.parse() { on_group_num(num) } else { Err(Error::InvalidBackref) } } Step::GroupNum(num) => on_group_num(num), Step::Error => Err(Error::ParseError), }) } /// Escapes the substitution character in `text` so it appears literally /// in the output of `expansion`. /// /// ``` /// assert_eq!( /// fancy_regex::Expander::default().escape("Has a literal $ sign."), /// "Has a literal $$ sign.", /// ); /// ``` pub fn escape<'a>(&self, text: &'a str) -> Cow<'a, str> { if text.contains(self.sub_char) { let mut quoted = String::with_capacity(self.sub_char.len_utf8() * 2); quoted.push(self.sub_char); quoted.push(self.sub_char); Cow::Owned(text.replace(self.sub_char, "ed)) } else { Cow::Borrowed(text) } } #[doc(hidden)] #[deprecated(since = "0.4.0", note = "Use `escape` instead.")] pub fn quote<'a>(&self, text: &'a str) -> Cow<'a, str> { self.escape(text) } /// Expands the template string `template` using the syntax defined /// by this expander and the values of capture groups from `captures`. 
    pub fn expansion(&self, template: &str, captures: &Captures<'_>) -> String {
        // Render into an in-memory cursor; the template length is a lower
        // bound for the output size, so preallocate that much.
        let mut cursor = io::Cursor::new(Vec::with_capacity(template.len()));
        self.write_expansion(&mut cursor, template, captures)
            // Writing to a Vec-backed cursor cannot fail.
            .expect("expansion succeeded");
        // Safe to convert back: the template and all captured text are &str.
        String::from_utf8(cursor.into_inner()).expect("expansion is UTF-8")
    }

    /// Appends the expansion produced by `expansion` to `dst`.  Potentially more efficient
    /// than calling `expansion` directly and appending to an existing string.
    pub fn append_expansion(&self, dst: &mut String, template: &str, captures: &Captures<'_>) {
        let pos = dst.len();
        // Temporarily take ownership of `dst`'s buffer so the expansion is
        // written directly into it, avoiding an intermediate allocation.
        let mut cursor = io::Cursor::new(mem::replace(dst, String::new()).into_bytes());
        // Position the cursor at the old end so writes append rather than
        // overwrite the existing contents.
        cursor.set_position(pos as u64);
        self.write_expansion(&mut cursor, template, captures)
            // Writing to a Vec-backed cursor cannot fail.
            .expect("expansion succeeded");
        // Move the buffer back into `dst`; contents are still valid UTF-8
        // because only &str data was appended.
        *dst = String::from_utf8(cursor.into_inner()).expect("expansion is UTF-8");
    }

    /// Writes the expansion produced by `expansion` to `dst`.  Potentially more efficient
    /// than calling `expansion` directly and writing the result.
pub fn write_expansion( &self, mut dst: impl io::Write, template: &str, captures: &Captures<'_>, ) -> io::Result<()> { self.exec(template, |step| match step { Step::Char(c) => write!(dst, "{}", c), Step::GroupName(name) => { if let Some(m) = captures.name(name) { write!(dst, "{}", m.as_str()) } else if let Some(m) = name.parse().ok().and_then(|num| captures.get(num)) { write!(dst, "{}", m.as_str()) } else { Ok(()) } } Step::GroupNum(num) => { if let Some(m) = captures.get(num) { write!(dst, "{}", m.as_str()) } else { Ok(()) } } Step::Error => Ok(()), }) } fn exec<'t, E>( &self, template: &'t str, mut f: impl FnMut(Step<'t>) -> Result<(), E>, ) -> Result<(), E> { debug_assert!(!self.open.is_empty()); debug_assert!(!self.close.is_empty()); let mut iter = template.chars(); while let Some(c) = iter.next() { if c == self.sub_char { let tail = iter.as_str(); let skip = if tail.starts_with(self.sub_char) { f(Step::Char(self.sub_char))?; 1 } else if let Some((id, skip)) = parse_id(tail, self.open, self.close).or_else(|| { if self.allow_undelimited_name { parse_id(tail, "", "") } else { None } }) { f(Step::GroupName(id))?; skip } else if let Some((skip, num)) = parse_decimal(tail, 0) { f(Step::GroupNum(num))?; skip } else { f(Step::Error)?; f(Step::Char(self.sub_char))?; 0 }; iter = iter.as_str()[skip..].chars(); } else { f(Step::Char(c))?; } } Ok(()) } } enum Step<'a> { Char(char), GroupName(&'a str), GroupNum(usize), Error, } fancy-regex-0.7.1/src/lib.rs000064400000000000000000001453720000000000000137350ustar 00000000000000// Copyright 2016 The Fancy Regex Authors. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. /*! An implementation of regexes, supporting a relatively rich set of features, including backreferences and lookaround. It builds on top of the excellent [regex] crate. If you are not familiar with it, make sure you read its documentation and maybe you don't even need fancy-regex. If your regex or parts of it does not use any special features, the matching is delegated to the regex crate. That means it has linear runtime. But if you use "fancy" features such as backreferences or look-around, an engine with backtracking needs to be used. In that case, the regex can be slow and take exponential time to run because of what is called "catastrophic backtracking". This depends on the regex and the input. # Usage The API should feel very similar to the regex crate, and involves compiling a regex and then using it to find matches in text. 
## Example: Matching text An example with backreferences to check if a text consists of two identical words: ```rust use fancy_regex::Regex; let re = Regex::new(r"^(\w+) (\1)$").unwrap(); let result = re.is_match("foo foo"); assert!(result.is_ok()); let did_match = result.unwrap(); assert!(did_match); ``` Note that like in the regex crate, the regex needs anchors like `^` and `$` to match against the entire input text. ## Example: Finding the position of matches ```rust use fancy_regex::Regex; let re = Regex::new(r"(\d)\1").unwrap(); let result = re.find("foo 22"); assert!(result.is_ok(), "execution was successful"); let match_option = result.unwrap(); assert!(match_option.is_some(), "found a match"); let m = match_option.unwrap(); assert_eq!(m.start(), 4); assert_eq!(m.end(), 6); assert_eq!(m.as_str(), "22"); ``` ## Example: Capturing groups ```rust use fancy_regex::Regex; let re = Regex::new(r"(?exp)` : match *exp*, creating capture group named *name* \ `\k` : match the exact string that the capture group named *name* matched \ `(?Pexp)` : same as `(?exp)` for compatibility with Python, etc. \ `(?P=name)` : same as `\k` for compatibility with Python, etc. 
Look-around assertions for matching without changing the current position: `(?=exp)` : look-ahead, succeeds if *exp* matches to the right of the current position \ `(?!exp)` : negative look-ahead, succeeds if *exp* doesn't match to the right \ `(?<=exp)` : look-behind, succeeds if *exp* matches to the left of the current position \ `(?exp)` to prevent backtracking within `exp`, e.g.: ``` # use fancy_regex::Regex; let re = Regex::new(r"^a(?>bc|b)c$").unwrap(); assert!(re.is_match("abcc").unwrap()); // Doesn't match because `|b` is never tried because of the atomic group assert!(!re.is_match("abc").unwrap()); ``` [regex]: https://crates.io/crates/regex */ #![doc(html_root_url = "https://docs.rs/fancy-regex/0.7.1")] #![deny(missing_docs)] #![deny(missing_debug_implementations)] use std::fmt; use std::fmt::{Debug, Formatter}; use std::ops::{Index, Range}; use std::str::FromStr; use std::sync::Arc; use std::usize; mod analyze; mod compile; mod error; mod expand; mod parse; mod replacer; mod vm; use crate::analyze::analyze; use crate::compile::compile; use crate::parse::{ExprTree, NamedGroups, Parser}; use crate::vm::Prog; pub use crate::error::{Error, Result}; pub use crate::expand::Expander; pub use crate::replacer::{NoExpand, Replacer, ReplacerRef}; use std::borrow::Cow; const MAX_RECURSION: usize = 64; // the public API /// A builder for a `Regex` to allow configuring options. #[derive(Debug)] pub struct RegexBuilder(RegexOptions); /// A compiled regular expression. #[derive(Clone)] pub struct Regex { inner: RegexImpl, named_groups: Arc, } // Separate enum because we don't want to expose any of this #[derive(Clone)] enum RegexImpl { // Do we want to box this? It's pretty big... 
Wrap { inner: regex::Regex, options: RegexOptions, }, Fancy { prog: Prog, n_groups: usize, options: RegexOptions, }, } /// A single match of a regex or group in an input text #[derive(Copy, Clone, Debug, Eq, PartialEq)] pub struct Match<'t> { text: &'t str, start: usize, end: usize, } /// An iterator over all non-overlapping matches for a particular string. /// /// The iterator yields a `Result`. The iterator stops when no more /// matches can be found. /// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. #[derive(Debug)] pub struct Matches<'r, 't> { re: &'r Regex, text: &'t str, last_end: usize, last_match: Option, } impl<'r, 't> Matches<'r, 't> { /// Return the text being searched. pub fn text(&self) -> &'t str { self.text } /// Return the underlying regex. pub fn regex(&self) -> &'r Regex { &self.re } } impl<'r, 't> Iterator for Matches<'r, 't> { type Item = Result>; /// Adapted from the `regex` crate. Calls `find_from_pos` repeatedly. /// Ignores empty matches immediately after a match. fn next(&mut self) -> Option { if self.last_end > self.text.len() { return None; } let mat = match self.re.find_from_pos(self.text, self.last_end) { Err(error) => return Some(Err(error)), Ok(None) => return None, Ok(Some(mat)) => mat, }; if mat.start == mat.end { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. self.last_end = next_utf8(self.text, mat.end); // Don't accept empty matches immediately following a match. // Just move on to the next match. if Some(mat.end) == self.last_match { return self.next(); } } else { self.last_end = mat.end; } self.last_match = Some(mat.end); Some(Ok(mat)) } } /// An iterator that yields all non-overlapping capture groups matching a /// particular regular expression. /// /// The iterator stops when no more matches can be found. 
/// /// `'r` is the lifetime of the compiled regular expression and `'t` is the /// lifetime of the matched string. #[derive(Debug)] pub struct CaptureMatches<'r, 't>(Matches<'r, 't>); impl<'r, 't> CaptureMatches<'r, 't> { /// Return the text being searched. pub fn text(&self) -> &'t str { self.0.text } /// Return the underlying regex. pub fn regex(&self) -> &'r Regex { &self.0.re } } impl<'r, 't> Iterator for CaptureMatches<'r, 't> { type Item = Result>; /// Adapted from the `regex` crate. Calls `captures_from_pos` repeatedly. /// Ignores empty matches immediately after a match. fn next(&mut self) -> Option { if self.0.last_end > self.0.text.len() { return None; } let captures = match self.0.re.captures_from_pos(self.0.text, self.0.last_end) { Err(error) => return Some(Err(error)), Ok(None) => return None, Ok(Some(captures)) => captures, }; let mat = captures .get(0) .expect("`Captures` is expected to have entire match at 0th position"); if mat.start == mat.end { self.0.last_end = next_utf8(self.0.text, mat.end); if Some(mat.end) == self.0.last_match { return self.next(); } } else { self.0.last_end = mat.end; } self.0.last_match = Some(mat.end); Some(Ok(captures)) } } /// A set of capture groups found for a regex. #[derive(Debug)] pub struct Captures<'t> { inner: CapturesImpl<'t>, named_groups: Arc, } #[derive(Debug)] enum CapturesImpl<'t> { Wrap { text: &'t str, locations: regex::CaptureLocations, }, Fancy { text: &'t str, saves: Vec, }, } /// Iterator for captured groups in order in which they appear in the regex. 
#[derive(Debug)] pub struct SubCaptureMatches<'c, 't> { caps: &'c Captures<'t>, i: usize, } #[derive(Clone, Debug)] struct RegexOptions { pattern: String, backtrack_limit: usize, delegate_size_limit: Option, delegate_dfa_size_limit: Option, } impl Default for RegexOptions { fn default() -> Self { RegexOptions { pattern: String::new(), backtrack_limit: 1_000_000, delegate_size_limit: None, delegate_dfa_size_limit: None, } } } impl RegexBuilder { /// Create a new regex builder with a regex pattern. /// /// If the pattern is invalid, the call to `build` will fail later. pub fn new(pattern: &str) -> Self { let mut builder = RegexBuilder(RegexOptions::default()); builder.0.pattern = pattern.to_string(); builder } /// Build the `Regex`. /// /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed. pub fn build(&self) -> Result { Regex::new_options(self.0.clone()) } /// Limit for how many times backtracking should be attempted for fancy regexes (where /// backtracking is used). If this limit is exceeded, execution returns an error with /// [`Error::BacktrackLimitExceeded`](enum.Error.html#variant.BacktrackLimitExceeded). /// This is for preventing a regex with catastrophic backtracking to run for too long. /// /// Default is `1_000_000` (1 million). pub fn backtrack_limit(&mut self, limit: usize) -> &mut Self { self.0.backtrack_limit = limit; self } /// Set the approximate size limit of the compiled regular expression. /// /// This option is forwarded from the wrapped `regex` crate. Note that depending on the used /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As /// such the actual limit is closer to ` * delegate_size_limit`. pub fn delegate_size_limit(&mut self, limit: usize) -> &mut Self { self.0.delegate_size_limit = Some(limit); self } /// Set the approximate size of the cache used by the DFA. /// /// This option is forwarded from the wrapped `regex` crate. 
Note that depending on the used /// regex features there may be multiple delegated sub-regexes fed to the `regex` crate. As /// such the actual limit is closer to ` * /// delegate_dfa_size_limit`. pub fn delegate_dfa_size_limit(&mut self, limit: usize) -> &mut Self { self.0.delegate_dfa_size_limit = Some(limit); self } } impl fmt::Debug for Regex { /// Shows the original regular expression. fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!(f, "{}", self.as_str()) } } impl fmt::Display for Regex { /// Shows the original regular expression fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result { write!(f, "{}", self.as_str()) } } impl FromStr for Regex { type Err = Error; /// Attempts to parse a string into a regular expression fn from_str(s: &str) -> Result { Regex::new(s) } } impl Regex { /// Parse and compile a regex with default options, see `RegexBuilder`. /// /// Returns an [`Error`](enum.Error.html) if the pattern could not be parsed. pub fn new(re: &str) -> Result { let options = RegexOptions { pattern: re.to_string(), ..RegexOptions::default() }; Self::new_options(options) } fn new_options(options: RegexOptions) -> Result { let raw_tree = Expr::parse_tree(&options.pattern)?; // wrapper to search for re at arbitrary start position, // and to capture the match bounds let tree = ExprTree { expr: Expr::Concat(vec![ Expr::Repeat { child: Box::new(Expr::Any { newline: true }), lo: 0, hi: usize::MAX, greedy: false, }, Expr::Group(Box::new(raw_tree.expr)), ]), ..raw_tree }; let info = analyze(&tree)?; let inner_info = &info.children[1].children[0]; // references inner expr if !inner_info.hard { // easy case, wrap regex // we do our own to_str because escapes are different let mut re_cooked = String::new(); // same as raw_tree.expr above, but it was moved, so traverse to find it let raw_e = match tree.expr { Expr::Concat(ref v) => match v[1] { Expr::Group(ref child) => child, _ => unreachable!(), }, _ => unreachable!(), }; raw_e.to_str(&mut re_cooked, 
0); let inner = compile::compile_inner(&re_cooked, &options)?; return Ok(Regex { inner: RegexImpl::Wrap { inner, options }, named_groups: Arc::new(tree.named_groups), }); } let prog = compile(&info)?; Ok(Regex { inner: RegexImpl::Fancy { prog, n_groups: info.end_group, options, }, named_groups: Arc::new(tree.named_groups), }) } /// Returns the original string of this regex. pub fn as_str(&self) -> &str { match &self.inner { RegexImpl::Wrap { options, .. } => &options.pattern, RegexImpl::Fancy { options, .. } => &options.pattern, } } /// Check if the regex matches the input text. /// /// # Example /// /// Test if some text contains the same word twice: /// /// ```rust /// # use fancy_regex::Regex; /// /// let re = Regex::new(r"(\w+) \1").unwrap(); /// assert!(re.is_match("mirror mirror on the wall").unwrap()); /// ``` pub fn is_match(&self, text: &str) -> Result { match &self.inner { RegexImpl::Wrap { ref inner, .. } => Ok(inner.is_match(text)), RegexImpl::Fancy { ref prog, options, .. } => { let result = vm::run(prog, text, 0, 0, options)?; Ok(result.is_some()) } } } /// Returns an iterator for each successive non-overlapping match in `text`. /// /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures_iter()] /// method. /// /// # Example /// /// Find all words followed by an exclamation point: /// /// ```rust /// # use fancy_regex::Regex; /// /// let re = Regex::new(r"\w+(?=!)").unwrap(); /// let mut matches = re.find_iter("so fancy! even with! iterators!"); /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "fancy"); /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "with"); /// assert_eq!(matches.next().unwrap().unwrap().as_str(), "iterators"); /// assert!(matches.next().is_none()); /// ``` pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> Matches<'r, 't> { Matches { re: &self, text, last_end: 0, last_match: None, } } /// Find the first match in the input text. 
/// /// If you have capturing groups in your regex that you want to extract, use the [Regex::captures()] /// method. /// /// # Example /// /// Find a word that is followed by an exclamation point: /// /// ```rust /// # use fancy_regex::Regex; /// /// let re = Regex::new(r"\w+(?=!)").unwrap(); /// assert_eq!(re.find("so fancy!").unwrap().unwrap().as_str(), "fancy"); /// ``` pub fn find<'t>(&self, text: &'t str) -> Result>> { self.find_from_pos(text, 0) } /// Returns the first match in `text`, starting from the specified byte position `pos`. /// /// # Examples /// /// Finding match starting at a position: /// /// ``` /// # use fancy_regex::Regex; /// let re = Regex::new(r"(?m:^)(\d+)").unwrap(); /// let text = "1 test 123\n2 foo"; /// let mat = re.find_from_pos(text, 7).unwrap().unwrap(); /// /// assert_eq!(mat.start(), 11); /// assert_eq!(mat.end(), 12); /// ``` /// /// Note that in some cases this is not the same as using the `find` /// method and passing a slice of the string, see [Regex::captures_from_pos()] for details. pub fn find_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result>> { match &self.inner { RegexImpl::Wrap { inner, .. } => Ok(inner .find_at(text, pos) .map(|m| Match::new(text, m.start(), m.end()))), RegexImpl::Fancy { prog, options, .. } => { let result = vm::run(prog, text, pos, 0, options)?; Ok(result.map(|saves| Match::new(text, saves[0], saves[1]))) } } } /// Returns an iterator over all the non-overlapping capture groups matched in `text`. 
/// /// # Examples /// /// Finding all matches and capturing parts of each: /// /// ```rust /// # use fancy_regex::Regex; /// /// let re = Regex::new(r"(\d{4})-(\d{2})").unwrap(); /// let text = "It was between 2018-04 and 2020-01"; /// let mut all_captures = re.captures_iter(text); /// /// let first = all_captures.next().unwrap().unwrap(); /// assert_eq!(first.get(1).unwrap().as_str(), "2018"); /// assert_eq!(first.get(2).unwrap().as_str(), "04"); /// assert_eq!(first.get(0).unwrap().as_str(), "2018-04"); /// /// let second = all_captures.next().unwrap().unwrap(); /// assert_eq!(second.get(1).unwrap().as_str(), "2020"); /// assert_eq!(second.get(2).unwrap().as_str(), "01"); /// assert_eq!(second.get(0).unwrap().as_str(), "2020-01"); /// /// assert!(all_captures.next().is_none()); /// ``` pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> CaptureMatches<'r, 't> { CaptureMatches(self.find_iter(text)) } /// Returns the capture groups for the first match in `text`. /// /// If no match is found, then `Ok(None)` is returned. /// /// # Examples /// /// Finding matches and capturing parts of the match: /// /// ```rust /// # use fancy_regex::Regex; /// /// let re = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap(); /// let text = "The date was 2018-04-07"; /// let captures = re.captures(text).unwrap().unwrap(); /// /// assert_eq!(captures.get(1).unwrap().as_str(), "2018"); /// assert_eq!(captures.get(2).unwrap().as_str(), "04"); /// assert_eq!(captures.get(3).unwrap().as_str(), "07"); /// assert_eq!(captures.get(0).unwrap().as_str(), "2018-04-07"); /// ``` pub fn captures<'t>(&self, text: &'t str) -> Result>> { self.captures_from_pos(text, 0) } /// Returns the capture groups for the first match in `text`, starting from /// the specified byte position `pos`. 
/// /// # Examples /// /// Finding captures starting at a position: /// /// ``` /// # use fancy_regex::Regex; /// let re = Regex::new(r"(?m:^)(\d+)").unwrap(); /// let text = "1 test 123\n2 foo"; /// let captures = re.captures_from_pos(text, 7).unwrap().unwrap(); /// /// let group = captures.get(1).unwrap(); /// assert_eq!(group.as_str(), "2"); /// assert_eq!(group.start(), 11); /// assert_eq!(group.end(), 12); /// ``` /// /// Note that in some cases this is not the same as using the `captures` /// method and passing a slice of the string, see the capture that we get /// when we do this: /// /// ``` /// # use fancy_regex::Regex; /// let re = Regex::new(r"(?m:^)(\d+)").unwrap(); /// let text = "1 test 123\n2 foo"; /// let captures = re.captures(&text[7..]).unwrap().unwrap(); /// assert_eq!(captures.get(1).unwrap().as_str(), "123"); /// ``` /// /// This matched the number "123" because it's at the beginning of the text /// of the string slice. /// pub fn captures_from_pos<'t>(&self, text: &'t str, pos: usize) -> Result>> { let named_groups = self.named_groups.clone(); match &self.inner { RegexImpl::Wrap { inner, .. } => { let mut locations = inner.capture_locations(); let result = inner.captures_read_at(&mut locations, text, pos); Ok(result.map(|_| Captures { inner: CapturesImpl::Wrap { text, locations }, named_groups, })) } RegexImpl::Fancy { prog, n_groups, options, .. } => { let result = vm::run(prog, text, pos, 0, options)?; Ok(result.map(|mut saves| { saves.truncate(n_groups * 2); Captures { inner: CapturesImpl::Fancy { text, saves }, named_groups, } })) } } } /// Returns the number of captures, including the implicit capture of the entire expression. pub fn captures_len(&self) -> usize { match &self.inner { RegexImpl::Wrap { inner, .. } => inner.captures_len(), RegexImpl::Fancy { n_groups, .. } => *n_groups, } } /// Returns an iterator over the capture names. 
pub fn capture_names(&self) -> CaptureNames { let mut names = Vec::new(); names.resize(self.captures_len(), None); for (name, &i) in self.named_groups.iter() { names[i] = Some(name.as_str()); } CaptureNames(names.into_iter()) } // for debugging only #[doc(hidden)] pub fn debug_print(&self) { match &self.inner { RegexImpl::Wrap { inner, .. } => println!("wrapped {:?}", inner), RegexImpl::Fancy { prog, .. } => prog.debug_print(), } } /// Replaces the leftmost-first match with the replacement provided. /// The replacement can be a regular string (where `$N` and `$name` are /// expanded to match capture groups) or a function that takes the matches' /// `Captures` and returns the replaced string. /// /// If no match is found, then a copy of the string is returned unchanged. /// /// # Replacement string syntax /// /// All instances of `$name` in the replacement text is replaced with the /// corresponding capture group `name`. /// /// `name` may be an integer corresponding to the index of the /// capture group (counted by order of opening parenthesis where `0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// /// If `name` isn't a valid capture group (whether the name doesn't exist /// or isn't a valid index), then it is replaced with the empty string. /// /// The longest possible name is used. e.g., `$1a` looks up the capture /// group named `1a` and not the capture group at index `1`. To exert more /// precise control over the name, use braces, e.g., `${1}a`. /// /// To write a literal `$` use `$$`. /// /// # Examples /// /// Note that this function is polymorphic with respect to the replacement. /// In typical usage, this can just be a normal string: /// /// ```rust /// # use fancy_regex::Regex; /// let re = Regex::new("[^01]+").unwrap(); /// assert_eq!(re.replace("1078910", ""), "1010"); /// ``` /// /// But anything satisfying the `Replacer` trait will work. 
For example, /// a closure of type `|&Captures| -> String` provides direct access to the /// captures corresponding to a match. This allows one to access /// capturing group matches easily: /// /// ```rust /// # use fancy_regex::{Regex, Captures}; /// let re = Regex::new(r"([^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", |caps: &Captures| { /// format!("{} {}", &caps[2], &caps[1]) /// }); /// assert_eq!(result, "Bruce Springsteen"); /// ``` /// /// But this is a bit cumbersome to use all the time. Instead, a simple /// syntax is supported that expands `$name` into the corresponding capture /// group. Here's the last example, but using this expansion technique /// with named capture groups: /// /// ```rust /// # use fancy_regex::Regex; /// let re = Regex::new(r"(?P[^,\s]+),\s+(?P\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", "$first $last"); /// assert_eq!(result, "Bruce Springsteen"); /// ``` /// /// Note that using `$2` instead of `$first` or `$1` instead of `$last` /// would produce the same result. To write a literal `$` use `$$`. /// /// Sometimes the replacement string requires use of curly braces to /// delineate a capture group replacement and surrounding literal text. /// For example, if we wanted to join two words together with an /// underscore: /// /// ```rust /// # use fancy_regex::Regex; /// let re = Regex::new(r"(?P\w+)\s+(?P\w+)").unwrap(); /// let result = re.replace("deep fried", "${first}_$second"); /// assert_eq!(result, "deep_fried"); /// ``` /// /// Without the curly braces, the capture group name `first_` would be /// used, and since it doesn't exist, it would be replaced with the empty /// string. /// /// Finally, sometimes you just want to replace a literal string with no /// regard for capturing group expansion. 
This can be done by wrapping a /// byte string with `NoExpand`: /// /// ```rust /// # use fancy_regex::Regex; /// use fancy_regex::NoExpand; /// /// let re = Regex::new(r"(?P[^,\s]+),\s+(\S+)").unwrap(); /// let result = re.replace("Springsteen, Bruce", NoExpand("$2 $last")); /// assert_eq!(result, "$2 $last"); /// ``` pub fn replace<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> { self.replacen(text, 1, rep) } /// Replaces all non-overlapping matches in `text` with the replacement /// provided. This is the same as calling `replacen` with `limit` set to /// `0`. /// /// See the documentation for `replace` for details on how to access /// capturing group matches in the replacement string. pub fn replace_all<'t, R: Replacer>(&self, text: &'t str, rep: R) -> Cow<'t, str> { self.replacen(text, 0, rep) } /// Replaces at most `limit` non-overlapping matches in `text` with the /// replacement provided. If `limit` is 0, then all non-overlapping matches /// are replaced. /// /// See the documentation for `replace` for details on how to access /// capturing group matches in the replacement string. pub fn replacen<'t, R: Replacer>( &self, text: &'t str, limit: usize, mut rep: R, ) -> Cow<'t, str> { // If we know that the replacement doesn't have any capture expansions, // then we can fast path. The fast path can make a tremendous // difference: // // 1) We use `find_iter` instead of `captures_iter`. Not asking for // captures generally makes the regex engines faster. // 2) We don't need to look up all of the capture groups and do // replacements inside the replacement string. We just push it // at each match and be done with it. 
if let Some(rep) = rep.no_expansion() { let mut it = self.find_iter(text).enumerate().peekable(); if it.peek().is_none() { return Cow::Borrowed(text); } let mut new = String::with_capacity(text.len()); let mut last_match = 0; for (i, m) in it { let m = m.unwrap(); if limit > 0 && i >= limit { break; } new.push_str(&text[last_match..m.start()]); new.push_str(&rep); last_match = m.end(); } new.push_str(&text[last_match..]); return Cow::Owned(new); } // The slower path, which we use if the replacement needs access to // capture groups. let mut it = self.captures_iter(text).enumerate().peekable(); if it.peek().is_none() { return Cow::Borrowed(text); } let mut new = String::with_capacity(text.len()); let mut last_match = 0; for (i, cap) in it { let cap = cap.unwrap(); if limit > 0 && i >= limit { break; } // unwrap on 0 is OK because captures only reports matches let m = cap.get(0).unwrap(); new.push_str(&text[last_match..m.start()]); rep.replace_append(&cap, &mut new); last_match = m.end(); } new.push_str(&text[last_match..]); Cow::Owned(new) } } impl<'t> Match<'t> { /// Returns the starting byte offset of the match in the text. #[inline] pub fn start(&self) -> usize { self.start } /// Returns the ending byte offset of the match in the text. #[inline] pub fn end(&self) -> usize { self.end } /// Returns the range over the starting and ending byte offsets of the match in text. #[inline] pub fn range(&self) -> Range { self.start..self.end } /// Returns the matched text. #[inline] pub fn as_str(&self) -> &'t str { &self.text[self.start..self.end] } /// Creates a new match from the given text and byte offsets. 
fn new(text: &'t str, start: usize, end: usize) -> Match<'t> { Match { text, start, end } } } impl<'t> From> for &'t str { fn from(m: Match<'t>) -> &'t str { m.as_str() } } impl<'t> From> for Range { fn from(m: Match<'t>) -> Range { m.range() } } #[allow(clippy::len_without_is_empty)] // follow regex's API impl<'t> Captures<'t> { /// Get the capture group by its index in the regex. /// /// If there is no match for that group or the index does not correspond to a group, `None` is /// returned. The index 0 returns the whole match. pub fn get(&self, i: usize) -> Option> { match &self.inner { CapturesImpl::Wrap { text, locations } => { locations .get(i) .map(|(start, end)| Match { text, start, end }) } CapturesImpl::Fancy { text, ref saves } => { let slot = i * 2; if slot >= saves.len() { return None; } let lo = saves[slot]; if lo == std::usize::MAX { return None; } let hi = saves[slot + 1]; Some(Match { text, start: lo, end: hi, }) } } } /// Returns the match for a named capture group. Returns `None` the capture /// group did not match or if there is no group with the given name. pub fn name(&self, name: &str) -> Option> { self.named_groups.get(name).and_then(|i| self.get(*i)) } /// Expands all instances of `$group` in `replacement` to the corresponding /// capture group `name`, and writes them to the `dst` buffer given. /// /// `group` may be an integer corresponding to the index of the /// capture group (counted by order of opening parenthesis where `\0` is the /// entire match) or it can be a name (consisting of letters, digits or /// underscores) corresponding to a named capture group. /// /// If `group` isn't a valid capture group (whether the name doesn't exist /// or isn't a valid index), then it is replaced with the empty string. /// /// The longest possible name is used. e.g., `$1a` looks up the capture /// group named `1a` and not the capture group at index `1`. To exert more /// precise control over the name, use braces, e.g., `${1}a`. 
/// /// To write a literal `$`, use `$$`. /// /// For more control over expansion, see [`Expander`]. /// /// [`Expander`]: expand/struct.Expander.html pub fn expand(&self, replacement: &str, dst: &mut String) { Expander::default().append_expansion(dst, replacement, self); } /// Iterate over the captured groups in order in which they appeared in the regex. The first /// capture corresponds to the whole match. pub fn iter<'c>(&'c self) -> SubCaptureMatches<'c, 't> { SubCaptureMatches { caps: self, i: 0 } } /// How many groups were captured. This is always at least 1 because group 0 returns the whole /// match. pub fn len(&self) -> usize { match &self.inner { CapturesImpl::Wrap { locations, .. } => locations.len(), CapturesImpl::Fancy { saves, .. } => saves.len() / 2, } } } /// Copied from [`regex::Captures`]... /// /// Get a group by index. /// /// `'t` is the lifetime of the matched text. /// /// The text can't outlive the `Captures` object if this method is /// used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it); to do that, use `get()` instead. /// /// # Panics /// /// If there is no group at the given index. impl<'t> Index for Captures<'t> { type Output = str; fn index(&self, i: usize) -> &str { self.get(i) .map(|m| m.as_str()) .unwrap_or_else(|| panic!("no group at index '{}'", i)) } } /// Copied from [`regex::Captures`]... /// /// Get a group by name. /// /// `'t` is the lifetime of the matched text and `'i` is the lifetime /// of the group name (the index). /// /// The text can't outlive the `Captures` object if this method is /// used, because of how `Index` is defined (normally `a[i]` is part /// of `a` and can't outlive it); to do that, use `name` instead. /// /// # Panics /// /// If there is no group named by the given value. 
impl<'t, 'i> Index<&'i str> for Captures<'t> { type Output = str; fn index<'a>(&'a self, name: &'i str) -> &'a str { self.name(name) .map(|m| m.as_str()) .unwrap_or_else(|| panic!("no group named '{}'", name)) } } impl<'c, 't> Iterator for SubCaptureMatches<'c, 't> { type Item = Option>; fn next(&mut self) -> Option>> { if self.i < self.caps.len() { let result = self.caps.get(self.i); self.i += 1; Some(result) } else { None } } } // TODO: might be nice to implement ExactSizeIterator etc for SubCaptures /// Regular expression AST. This is public for now but may change. #[derive(Debug, PartialEq, Eq)] pub enum Expr { /// An empty expression, e.g. the last branch in `(a|b|)` Empty, /// Any character, regex `.` Any { /// Whether it also matches newlines or not newline: bool, }, /// Start of input text StartText, /// End of input text EndText, /// Start of a line StartLine, /// End of a line EndLine, /// The string as a literal, e.g. `a` Literal { /// The string to match val: String, /// Whether match is case-insensitive or not casei: bool, }, /// Concatenation of multiple expressions, must match in order, e.g. `a.` is a concatenation of /// the literal `a` and `.` for any character Concat(Vec), /// Alternative of multiple expressions, one of them must match, e.g. `a|b` is an alternative /// where either the literal `a` or `b` must match Alt(Vec), /// Capturing group of expression, e.g. `(a.)` matches `a` and any character and "captures" /// (remembers) the match Group(Box), /// Look-around (e.g. positive/negative look-ahead or look-behind) with an expression, e.g. /// `(?=a)` means the next character must be `a` (but the match is not consumed) LookAround(Box, LookAround), /// Repeat of an expression, e.g. `a*` or `a+` or `a{1,3}` Repeat { /// The expression that is being repeated child: Box, /// The minimum number of repetitions lo: usize, /// The maximum number of repetitions (or `usize::MAX`) hi: usize, /// Greedy means as much as possible is matched, e.g. 
`.*b` would match all of `abab`. /// Non-greedy means as little as possible, e.g. `.*?b` would match only `ab` in `abab`. greedy: bool, }, /// Delegate a regex to the regex crate. This is used as a simplification so that we don't have /// to represent all the expressions in the AST, e.g. character classes. Delegate { /// The regex inner: String, /// How many characters the regex matches size: usize, // TODO: move into analysis result /// Whether the matching is case-insensitive or not casei: bool, }, /// Back reference to a capture group, e.g. `\1` in `(abc|def)\1` references the captured group /// and the whole regex matches either `abcabc` or `defdef`. Backref(usize), /// Back reference to a named capture group. NamedBackref(String), /// Atomic non-capturing group, e.g. `(?>ab|a)` in text that contains `ab` will match `ab` and /// never backtrack and try `a`, even if matching fails after the atomic group. AtomicGroup(Box), } /// Type of look-around assertion as used for a look-around expression. #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub enum LookAround { /// Look-ahead assertion, e.g. `(?=a)` LookAhead, /// Negative look-ahead assertion, e.g. `(?!a)` LookAheadNeg, /// Look-behind assertion, e.g. `(?<=a)` LookBehind, /// Negative look-behind assertion, e.g. `(?(std::vec::IntoIter>); impl Debug for CaptureNames<'_> { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str("") } } impl<'r> Iterator for CaptureNames<'r> { type Item = Option<&'r str>; fn next(&mut self) -> Option { self.0.next() } } // silly to write my own, but this is super-fast for the common 1-digit // case. fn push_usize(s: &mut String, x: usize) { if x >= 10 { push_usize(s, x / 10); s.push((b'0' + (x % 10) as u8) as char); } else { s.push((b'0' + (x as u8)) as char); } } fn is_special(c: char) -> bool { match c { '\\' | '.' | '+' | '*' | '?' 
| '(' | ')' | '|' | '[' | ']' | '{' | '}' | '^' | '$' | '#' => true, _ => false, } } fn push_quoted(buf: &mut String, s: &str) { for c in s.chars() { if is_special(c) { buf.push('\\'); } buf.push(c); } } /// Escapes special characters in `text` with '\\'. Returns a string which, when interpreted /// as a regex, matches exactly `text`. pub fn escape(text: &str) -> Cow { // Using bytes() is OK because all special characters are single bytes. match text.bytes().filter(|&b| is_special(b as char)).count() { 0 => Cow::Borrowed(text), n => { // The capacity calculation is exact because '\\' is a single byte. let mut buf = String::with_capacity(text.len() + n); push_quoted(&mut buf, text); Cow::Owned(buf) } } } impl Expr { /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups /// that are referenced by backrefs. pub fn parse_tree(re: &str) -> Result { Parser::parse(re) } /// Convert expression to a regex string in the regex crate's syntax. /// /// # Panics /// /// Panics for expressions that are hard, i.e. can not be handled by the regex crate. pub fn to_str(&self, buf: &mut String, precedence: u8) { match *self { Expr::Empty => (), Expr::Any { newline } => buf.push_str(if newline { "(?s:.)" } else { "." 
}), Expr::Literal { ref val, casei } => { if casei { buf.push_str("(?i:"); } push_quoted(buf, val); if casei { buf.push_str(")"); } } Expr::StartText => buf.push('^'), Expr::EndText => buf.push('$'), Expr::StartLine => buf.push_str("(?m:^)"), Expr::EndLine => buf.push_str("(?m:$)"), Expr::Concat(ref children) => { if precedence > 1 { buf.push_str("(?:"); } for child in children { child.to_str(buf, 2); } if precedence > 1 { buf.push(')') } } Expr::Alt(ref children) => { if precedence > 0 { buf.push_str("(?:"); } let is_empty = |e: &Expr| match e { Expr::Empty => true, _ => false, }; let contains_empty = children.iter().any(is_empty); if contains_empty { buf.push_str("(?:"); } for (i, child) in children.iter().filter(|&c| !is_empty(c)).enumerate() { if i != 0 { buf.push('|'); } child.to_str(buf, 1); } if contains_empty { // regex fails with `(a|b|)`, so transform to `((?:a|b)?)` buf.push_str(")?"); } if precedence > 0 { buf.push(')'); } } Expr::Group(ref child) => { buf.push('('); child.to_str(buf, 0); buf.push(')'); } Expr::Repeat { ref child, lo, hi, greedy, } => { if precedence > 2 { buf.push_str("(?:"); } child.to_str(buf, 3); match (lo, hi) { (0, 1) => buf.push('?'), (0, usize::MAX) => buf.push('*'), (1, usize::MAX) => buf.push('+'), (lo, hi) => { buf.push('{'); push_usize(buf, lo); if lo != hi { buf.push(','); if hi != usize::MAX { push_usize(buf, hi); } } buf.push('}'); } } if !greedy { buf.push('?'); } if precedence > 2 { buf.push(')'); } } Expr::Delegate { ref inner, casei, .. } => { // at the moment, delegate nodes are just atoms if casei { buf.push_str("(?i:"); } buf.push_str(inner); if casei { buf.push_str(")"); } } _ => panic!("attempting to format hard expr"), } } } // precondition: ix > 0 fn prev_codepoint_ix(s: &str, mut ix: usize) -> usize { let bytes = s.as_bytes(); loop { ix -= 1; // fancy bit magic for ranges 0..0x80 + 0xc0.. 
if (bytes[ix] as i8) >= -0x40 { break; } } ix } fn codepoint_len(b: u8) -> usize { match b { b if b < 0x80 => 1, b if b < 0xe0 => 2, b if b < 0xf0 => 3, _ => 4, } } /// Returns the smallest possible index of the next valid UTF-8 sequence /// starting after `i`. /// Adapted from a function with the same name in the `regex` crate. fn next_utf8(text: &str, i: usize) -> usize { let b = match text.as_bytes().get(i) { None => return i + 1, Some(&b) => b, }; i + codepoint_len(b) } // If this returns false, then there is no possible backref in the re // Both potential implementations are turned off, because we currently // always need to do a deeper analysis because of 1-character // look-behind. If we could call a find_from_pos method of regex::Regex, // it would make sense to bring this back. /* pub fn detect_possible_backref(re: &str) -> bool { let mut last = b'\x00'; for b in re.as_bytes() { if b'0' <= *b && *b <= b'9' && last == b'\\' { return true; } last = *b; } false } pub fn detect_possible_backref(re: &str) -> bool { let mut bytes = re.as_bytes(); loop { match memchr::memchr(b'\\', &bytes[..bytes.len() - 1]) { Some(i) => { bytes = &bytes[i + 1..]; let c = bytes[0]; if b'0' <= c && c <= b'9' { return true; } } None => return false } } } */ /// The internal module only exists so that the toy example can access internals for debugging and /// experimenting. 
#[doc(hidden)] pub mod internal { pub use crate::analyze::analyze; pub use crate::compile::compile; pub use crate::vm::{run_default, run_trace, Insn, Prog}; } #[cfg(test)] mod tests { use crate::parse::make_literal; use crate::Expr; use crate::Regex; use std::borrow::Cow; use std::usize; //use detect_possible_backref; // tests for to_str fn to_str(e: Expr) -> String { let mut s = String::new(); e.to_str(&mut s, 0); s } #[test] fn to_str_concat_alt() { let e = Expr::Concat(vec![ Expr::Alt(vec![make_literal("a"), make_literal("b")]), make_literal("c"), ]); assert_eq!(to_str(e), "(?:a|b)c"); } #[test] fn to_str_rep_concat() { let e = Expr::Repeat { child: Box::new(Expr::Concat(vec![make_literal("a"), make_literal("b")])), lo: 2, hi: 3, greedy: true, }; assert_eq!(to_str(e), "(?:ab){2,3}"); } #[test] fn to_str_group_alt() { let e = Expr::Group(Box::new(Expr::Alt(vec![ make_literal("a"), make_literal("b"), ]))); assert_eq!(to_str(e), "(a|b)"); } #[test] fn as_str_debug() { let s = r"(a+)b\1"; let regex = Regex::new(s).unwrap(); assert_eq!(s, regex.as_str()); assert_eq!(s, format!("{:?}", regex)); } #[test] fn display() { let s = r"(a+)b\1"; let regex = Regex::new(s).unwrap(); assert_eq!(s, format!("{}", regex)); } #[test] fn from_str() { let s = r"(a+)b\1"; let regex = s.parse::().unwrap(); assert_eq!(regex.as_str(), s); } #[test] fn to_str_repeat() { fn repeat(lo: usize, hi: usize, greedy: bool) -> Expr { Expr::Repeat { child: Box::new(make_literal("a")), lo, hi, greedy, } } assert_eq!(to_str(repeat(2, 2, true)), "a{2}"); assert_eq!(to_str(repeat(2, 2, false)), "a{2}?"); assert_eq!(to_str(repeat(2, 3, true)), "a{2,3}"); assert_eq!(to_str(repeat(2, 3, false)), "a{2,3}?"); assert_eq!(to_str(repeat(2, usize::MAX, true)), "a{2,}"); assert_eq!(to_str(repeat(2, usize::MAX, false)), "a{2,}?"); assert_eq!(to_str(repeat(0, 1, true)), "a?"); assert_eq!(to_str(repeat(0, 1, false)), "a??"); assert_eq!(to_str(repeat(0, usize::MAX, true)), "a*"); assert_eq!(to_str(repeat(0, 
usize::MAX, false)), "a*?"); assert_eq!(to_str(repeat(1, usize::MAX, true)), "a+"); assert_eq!(to_str(repeat(1, usize::MAX, false)), "a+?"); } #[test] fn escape() { // Check that strings that need no quoting are borrowed, and that non-special punctuation // is not quoted. match crate::escape("@foo") { Cow::Borrowed(s) => assert_eq!(s, "@foo"), _ => panic!("Value should be borrowed."), } // Check typical usage. assert_eq!(crate::escape("fo*o").into_owned(), "fo\\*o"); // Check that multibyte characters are handled correctly. assert_eq!(crate::escape("fø*ø").into_owned(), "fø\\*ø"); } /* #[test] fn detect_backref() { assert_eq!(detect_possible_backref("a0a1a2"), false); assert_eq!(detect_possible_backref("a0a1\\a2"), false); assert_eq!(detect_possible_backref("a0a\\1a2"), true); assert_eq!(detect_possible_backref("a0a1a2\\"), false); } */ } fancy-regex-0.7.1/src/parse.rs000064400000000000000000001237070000000000000142770ustar 00000000000000// Copyright 2016 The Fancy Regex Authors. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

//! A regex parser yielding an AST.

use bit_set::BitSet;
use regex::escape;
use std::collections::HashMap;
use std::str::FromStr;
use std::usize;

use crate::codepoint_len;
use crate::Error;
use crate::Expr;
use crate::LookAround::*;
use crate::Result;
use crate::MAX_RECURSION;

// Inline flags, as set by e.g. `(?i)` in the pattern.
const FLAG_CASEI: u32 = 1;
const FLAG_MULTI: u32 = 1 << 1;
const FLAG_DOTNL: u32 = 1 << 2;
const FLAG_SWAP_GREED: u32 = 1 << 3;
const FLAG_IGNORE_SPACE: u32 = 1 << 4;
const FLAG_UNICODE: u32 = 1 << 5;

/// Maps a capture group's name to its group number.
pub(crate) type NamedGroups = HashMap<String, usize>;

#[derive(Debug)]
pub struct ExprTree {
    pub expr: Expr,
    pub backrefs: BitSet,
    pub named_groups: NamedGroups,
}

#[derive(Debug)]
pub(crate) struct Parser<'a> {
    re: &'a str, // source
    backrefs: BitSet,
    flags: u32,
    named_groups: NamedGroups,
    numeric_backrefs: bool,
    curr_group: usize, // need to keep track of which group number we're parsing
}

impl<'a> Parser<'a> {
    /// Parse the regex and return an expression (AST) and a bit set with the indexes of groups
    /// that are referenced by backrefs.
    pub(crate) fn parse(re: &str) -> Result<ExprTree> {
        let mut p = Parser::new(re);
        let (ix, expr) = p.parse_re(0, 0)?;
        if ix < re.len() {
            // Trailing input that the grammar could not consume.
            return Err(Error::ParseError);
        }
        Ok(ExprTree {
            expr,
            // Bug fix: return the backrefs actually collected while parsing
            // (via `self.backrefs.insert` in `parse_atom`); the previous
            // `Default::default()` silently discarded them, breaking backref
            // analysis downstream.
            backrefs: p.backrefs,
            named_groups: p.named_groups,
        })
    }

    fn new(re: &str) -> Parser<'_> {
        Parser {
            re,
            backrefs: Default::default(),
            named_groups: Default::default(),
            numeric_backrefs: false,
            flags: FLAG_UNICODE,
            curr_group: 0,
        }
    }

    fn parse_re(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
        let (ix, child) = self.parse_branch(ix, depth)?;
        let mut ix = self.optional_whitespace(ix)?;
        if self.re[ix..].starts_with('|') {
            let mut children = vec![child];
            while self.re[ix..].starts_with('|') {
                ix += 1;
                let (next, child) = self.parse_branch(ix, depth)?;
                children.push(child);
                ix = self.optional_whitespace(next)?;
            }
            return Ok((ix, Expr::Alt(children)));
        }
        // can't have numeric backrefs and named backrefs
        // NOTE(review): this check is skipped on the early return for
        // alternations above — confirm against upstream whether that is
        // intentional.
        if self.numeric_backrefs && !self.named_groups.is_empty() {
            return Err(Error::NamedBackrefOnly);
        }
        Ok((ix, child))
    }

    fn parse_branch(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
        let mut children = Vec::new();
        let mut ix = ix;
        while ix < self.re.len() {
            let (next, child) = self.parse_piece(ix, depth)?;
            if next == ix {
                // No progress means nothing more to consume in this branch.
                break;
            }
            if child != Expr::Empty {
                children.push(child);
            }
            ix = next;
        }
        match children.len() {
            0 => Ok((ix, Expr::Empty)),
            1 => Ok((ix, children.pop().unwrap())),
            _ => Ok((ix, Expr::Concat(children))),
        }
    }

    fn parse_piece(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> {
        let (ix, child) = self.parse_atom(ix, depth)?;
        let mut ix = self.optional_whitespace(ix)?;
        if ix < self.re.len() {
            // fail when child is empty?
            let (lo, hi) = match self.re.as_bytes()[ix] {
                b'?' => (0, 1),
                b'*' => (0, usize::MAX),
                b'+' => (1, usize::MAX),
                b'{' => {
                    match self.parse_repeat(ix) {
                        Ok((next, lo, hi)) => {
                            ix = next - 1;
                            (lo, hi)
                        }
                        Err(_) => {
                            // Invalid repeat syntax, which results in `{` being treated as a literal
                            return Ok((ix, child));
                        }
                    }
                }
                _ => return Ok((ix, child)),
            };
            if !self.is_repeatable(&child) {
                return Err(Error::TargetNotRepeatable);
            }
            ix += 1;
            ix = self.optional_whitespace(ix)?;
            let mut greedy = true;
            if ix < self.re.len() && self.re.as_bytes()[ix] == b'?' {
                greedy = false;
                ix += 1;
            }
            greedy ^= self.flag(FLAG_SWAP_GREED);
            let mut node = Expr::Repeat {
                child: Box::new(child),
                lo,
                hi,
                greedy,
            };
            // A trailing `+` makes the repeat possessive (atomic), e.g. `a*+`.
            if ix < self.re.len() && self.re.as_bytes()[ix] == b'+' {
                ix += 1;
                node = Expr::AtomicGroup(Box::new(node));
            }
            return Ok((ix, node));
        }
        Ok((ix, child))
    }

    /// Zero-width assertions cannot take quantifiers (see CHANGELOG for 0.7.1).
    fn is_repeatable(&self, child: &Expr) -> bool {
        match child {
            Expr::LookAround(_, _) => false,
            Expr::Empty => false,
            Expr::StartText => false,
            Expr::EndText => false,
            Expr::StartLine => false,
            Expr::EndLine => false,
            _ => true,
        }
    }

    // ix, lo, hi
    fn parse_repeat(&self, ix: usize) -> Result<(usize, usize, usize)> {
        let ix = self.optional_whitespace(ix + 1)?; // skip opening '{'
        let bytes = self.re.as_bytes();
        if ix == self.re.len() {
            return Err(Error::InvalidRepeat);
        }
        let mut end = ix;
        let lo = if bytes[ix] == b',' {
            0
        } else if let Some((next, lo)) = parse_decimal(self.re, ix) {
            end = next;
            lo
        } else {
            return Err(Error::InvalidRepeat);
        };
        let ix = self.optional_whitespace(end)?; // past lo number
        if ix == self.re.len() {
            return Err(Error::InvalidRepeat);
        }
        end = ix;
        let hi = match bytes[ix] {
            b'}' => lo,
            b',' => {
                end = self.optional_whitespace(ix + 1)?; // past ','
                if let Some((next, hi)) = parse_decimal(self.re, end) {
                    end = next;
                    hi
                } else {
                    // `{lo,}` means no upper bound.
                    usize::MAX
                }
            }
            _ => return Err(Error::InvalidRepeat),
        };
        let ix = self.optional_whitespace(end)?; // past hi number
        if ix == self.re.len() || bytes[ix] != b'}' {
            return Err(Error::InvalidRepeat);
        }
        Ok((ix + 1, lo, hi))
    }

    fn parse_atom(&mut self,
ix: usize, depth: usize) -> Result<(usize, Expr)> { let ix = self.optional_whitespace(ix)?; if ix == self.re.len() { return Ok((ix, Expr::Empty)); } match self.re.as_bytes()[ix] { b'.' => Ok(( ix + 1, Expr::Any { newline: self.flag(FLAG_DOTNL), }, )), b'^' => Ok(( ix + 1, if self.flag(FLAG_MULTI) { Expr::StartLine } else { Expr::StartText }, )), b'$' => Ok(( ix + 1, if self.flag(FLAG_MULTI) { Expr::EndLine } else { Expr::EndText }, )), b'(' => self.parse_group(ix, depth), b'\\' => { let (next, expr) = self.parse_escape(ix)?; if let Expr::Backref(group) = expr { self.backrefs.insert(group); } Ok((next, expr)) } b'+' | b'*' | b'?' | b'|' | b')' => Ok((ix, Expr::Empty)), b'[' => self.parse_class(ix), b => { // TODO: maybe want to match multiple codepoints? let next = ix + codepoint_len(b); Ok(( next, Expr::Literal { val: String::from(&self.re[ix..next]), casei: self.flag(FLAG_CASEI), }, )) } } } fn parse_backref(&self, ix: usize, open: &str, close: &str) -> Result<(usize, Expr)> { if let Some((id, skip)) = parse_id(&self.re[ix..], open, close) { let group = if let Some(group) = self.named_groups.get(id) { Some(*group) } else if let Ok(group) = id.parse() { Some(group) } else { None }; if let Some(group) = group { return Ok((ix + skip, Expr::Backref(group))); } // here the name is parsed but it is invalid Err(Error::InvalidGroupNameBackref(id.to_string())) } else { // in this case the name can't be parsed Err(Error::InvalidGroupName) } } // ix points to \ character fn parse_escape(&mut self, ix: usize) -> Result<(usize, Expr)> { if ix + 1 == self.re.len() { return Err(Error::TrailingBackslash); } let bytes = self.re.as_bytes(); let b = bytes[ix + 1]; let mut end = ix + 1 + codepoint_len(b); let mut size = 1; if is_digit(b) { if let Some((end, group)) = parse_decimal(self.re, ix + 1) { // protect BitSet against unreasonably large value if group < self.re.len() / 2 { self.numeric_backrefs = true; return Ok((end, Expr::Backref(group))); } } return 
Err(Error::InvalidBackref); } else if b == b'k' { // Named backref: \k return self.parse_backref(ix + 2, "<", ">"); } else if b == b'A' || b == b'z' || b == b'b' || b == b'B' { size = 0; } else if (b | 32) == b'd' || (b | 32) == b's' || (b | 32) == b'w' || b == b'a' || b == b'f' || b == b'n' || b == b'r' || b == b't' || b == b'v' { // size = 1 } else if b == b'e' { let inner = String::from(r"\x1B"); return Ok(( end, Expr::Delegate { inner, size, casei: false, }, )); } else if (b | 32) == b'h' { let s = if b == b'h' { "[0-9A-Fa-f]" } else { "[^0-9A-Fa-f]" }; let inner = String::from(s); return Ok(( end, Expr::Delegate { inner, size, casei: false, }, )); } else if b == b'x' { return self.parse_hex(end, 2); } else if b == b'u' { return self.parse_hex(end, 4); } else if b == b'U' { return self.parse_hex(end, 8); } else if (b | 32) == b'p' { // allow whitespace? if end == self.re.len() { return Err(Error::TrailingBackslash); // better name? } let b = bytes[end]; end += codepoint_len(b); if b == b'{' { loop { if end == self.re.len() { return Err(Error::UnclosedUnicodeName); } let b = bytes[end]; if b == b'}' { end += 1; break; } end += codepoint_len(b); } } } else if b'a' <= (b | 32) && (b | 32) <= b'z' { return Err(Error::InvalidEscape(format!("\\{}", &self.re[ix + 1..end]))); } else if 0x20 <= b && b <= 0x7f { // printable ASCII (including space, see issue #29) return Ok((end, make_literal(&self.re[ix + 1..end]))); } // what to do with characters outside printable ASCII? 
let inner = String::from(&self.re[ix..end]); Ok(( end, Expr::Delegate { inner, size, casei: self.flag(FLAG_CASEI), }, )) } // ix points after '\x', eg to 'A0' or '{12345}', or after `\u` or `\U` fn parse_hex(&self, ix: usize, digits: usize) -> Result<(usize, Expr)> { if ix >= self.re.len() { // Incomplete escape sequence return Err(Error::InvalidHex); } let bytes = self.re.as_bytes(); let b = bytes[ix]; let (end, s) = if ix + digits <= self.re.len() && bytes[ix..ix + digits].iter().all(|&b| is_hex_digit(b)) { let end = ix + digits; (end, &self.re[ix..end]) } else if b == b'{' { let starthex = ix + 1; let mut endhex = starthex; loop { if endhex == self.re.len() { return Err(Error::InvalidHex); } let b = bytes[endhex]; if endhex > starthex && b == b'}' { break; } if is_hex_digit(b) && endhex < starthex + 8 { endhex += 1; } else { return Err(Error::InvalidHex); } } (endhex + 1, &self.re[starthex..endhex]) } else { return Err(Error::InvalidHex); }; let codepoint = u32::from_str_radix(s, 16).unwrap(); if let Some(c) = ::std::char::from_u32(codepoint) { let mut inner = String::with_capacity(4); inner.push(c); Ok(( end, Expr::Literal { val: inner, casei: self.flag(FLAG_CASEI), }, )) } else { Err(Error::InvalidCodepointValue) } } fn parse_class(&mut self, ix: usize) -> Result<(usize, Expr)> { let bytes = self.re.as_bytes(); let mut ix = ix + 1; // skip opening '[' let mut class = String::new(); let mut nest = 1; class.push('['); // Negated character class if ix < self.re.len() && bytes[ix] == b'^' { class.push('^'); ix += 1; } // `]` does not have to be escaped after opening `[` or `[^` if ix < self.re.len() && bytes[ix] == b']' { class.push(']'); ix += 1; } loop { if ix == self.re.len() { return Err(Error::InvalidClass); } let end = match bytes[ix] { b'\\' => { if ix + 1 == self.re.len() { return Err(Error::InvalidClass); } // We support more escapes than regex, so parse it ourselves before delegating. 
let (end, expr) = self.parse_escape(ix)?; match expr { Expr::Literal { val, .. } => { class.push_str(&escape(&val)); } Expr::Delegate { inner, .. } => { class.push_str(&inner); } _ => { return Err(Error::InvalidClass); } } end } b'[' => { nest += 1; class.push('['); ix + 1 } b']' => { nest -= 1; if nest == 0 { break; } class.push(']'); ix + 1 } b => { let end = ix + codepoint_len(b); class.push_str(&self.re[ix..end]); end } }; ix = end; } class.push(']'); let ix = ix + 1; // skip closing ']' Ok(( ix, Expr::Delegate { inner: class, size: 1, casei: self.flag(FLAG_CASEI), }, )) } fn parse_group(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> { let depth = depth + 1; if depth >= MAX_RECURSION { return Err(Error::RecursionExceeded); } let ix = self.optional_whitespace(ix + 1)?; let (la, skip) = if self.re[ix..].starts_with("?=") { (Some(LookAhead), 2) } else if self.re[ix..].starts_with("?!") { (Some(LookAheadNeg), 2) } else if self.re[ix..].starts_with("?<=") { (Some(LookBehind), 3) } else if self.re[ix..].starts_with("?...) self.curr_group += 1; if let Some((id, skip)) = parse_id(&self.re[ix + 1..], "<", ">") { self.named_groups.insert(id.to_string(), self.curr_group); (None, skip + 1) } else { return Err(Error::InvalidGroupName); } } else if self.re[ix..].starts_with("?P<") { // Named capture group using Python syntax: (?P...) 
self.curr_group += 1; // this is a capture group if let Some((id, skip)) = parse_id(&self.re[ix + 2..], "<", ">") { self.named_groups.insert(id.to_string(), self.curr_group); (None, skip + 2) } else { return Err(Error::InvalidGroupName); } } else if self.re[ix..].starts_with("?P=") { // Backref using Python syntax: (?P=name) return self.parse_backref(ix + 3, "", ")"); } else if self.re[ix..].starts_with("?>") { (None, 2) } else if self.re[ix..].starts_with('?') { return self.parse_flags(ix, depth); } else { self.curr_group += 1; // this is a capture group (None, 0) }; let ix = ix + skip; let (ix, child) = self.parse_re(ix, depth)?; let ix = self.optional_whitespace(ix)?; if ix == self.re.len() { return Err(Error::UnclosedOpenParen); } else if self.re.as_bytes()[ix] != b')' { return Err(Error::ParseError); }; let result = match (la, skip) { (Some(la), _) => Expr::LookAround(Box::new(child), la), (None, 2) => Expr::AtomicGroup(Box::new(child)), _ => Expr::Group(Box::new(child)), }; Ok((ix + 1, result)) } // ix points to `?` in `(?` fn parse_flags(&mut self, ix: usize, depth: usize) -> Result<(usize, Expr)> { let start = ix + 1; fn unknown_flag(re: &str, start: usize, end: usize) -> Error { let after_end = end + codepoint_len(re.as_bytes()[end]); let s = format!("(?{}", &re[start..after_end]); Error::UnknownFlag(s) } let mut ix = start; let mut neg = false; let oldflags = self.flags; loop { ix = self.optional_whitespace(ix)?; if ix == self.re.len() { return Err(Error::UnclosedOpenParen); } let b = self.re.as_bytes()[ix]; match b { b'i' => self.update_flag(FLAG_CASEI, neg), b'm' => self.update_flag(FLAG_MULTI, neg), b's' => self.update_flag(FLAG_DOTNL, neg), b'U' => self.update_flag(FLAG_SWAP_GREED, neg), b'x' => self.update_flag(FLAG_IGNORE_SPACE, neg), b'u' => { if neg { return Err(Error::NonUnicodeUnsupported); } } b'-' => { if neg { return Err(unknown_flag(self.re, start, ix)); } neg = true; } b')' => { if ix == start || neg && ix == start + 1 { return 
Err(unknown_flag(self.re, start, ix)); } return Ok((ix + 1, Expr::Empty)); } b':' => { if neg && ix == start + 1 { return Err(unknown_flag(self.re, start, ix)); } ix += 1; let (ix, child) = self.parse_re(ix, depth)?; if ix == self.re.len() { return Err(Error::UnclosedOpenParen); } else if self.re.as_bytes()[ix] != b')' { return Err(Error::ParseError); }; self.flags = oldflags; return Ok((ix + 1, child)); } _ => return Err(unknown_flag(self.re, start, ix)), } ix += 1; } } fn flag(&self, flag: u32) -> bool { (self.flags & flag) != 0 } fn update_flag(&mut self, flag: u32, neg: bool) { if neg { self.flags &= !flag; } else { self.flags |= flag; } } fn optional_whitespace(&self, mut ix: usize) -> Result { let bytes = self.re.as_bytes(); loop { if ix == self.re.len() { return Ok(ix); } match bytes[ix] { b'#' if self.flag(FLAG_IGNORE_SPACE) => { match bytes[ix..].iter().position(|&c| c == b'\n') { Some(x) => ix += x + 1, None => return Ok(self.re.len()), } } b' ' | b'\r' | b'\n' | b'\t' if self.flag(FLAG_IGNORE_SPACE) => ix += 1, b'(' if bytes[ix..].starts_with(b"(?#") => { ix += 3; loop { if ix >= self.re.len() { return Err(Error::UnclosedOpenParen); } match bytes[ix] { b')' => { ix += 1; break; } b'\\' => ix += 2, _ => ix += 1, } } } _ => return Ok(ix), } } } } // return (ix, value) pub(crate) fn parse_decimal(s: &str, ix: usize) -> Option<(usize, usize)> { let mut end = ix; while end < s.len() && is_digit(s.as_bytes()[end]) { end += 1; } usize::from_str(&s[ix..end]).ok().map(|val| (end, val)) } /// Attempts to parse an identifier between the specified opening and closing /// delimiters. On success, returns `Some((id, skip))`, where `skip` is how much /// of the string was used. 
pub(crate) fn parse_id<'a>(s: &'a str, open: &'_ str, close: &'_ str) -> Option<(&'a str, usize)> { debug_assert!(!close.starts_with(is_id_char)); if !s.starts_with(open) { return None; } let id_start = open.len(); let id_len = match s[id_start..].find(|c: char| !is_id_char(c)) { Some(id_len) if s[id_start + id_len..].starts_with(close) => Some(id_len), None if close.is_empty() => Some(s.len()), _ => None, }; match id_len { Some(0) => None, Some(id_len) => { let id_end = id_start + id_len; Some((&s[id_start..id_end], id_end + close.len())) } _ => None, } } fn is_id_char(c: char) -> bool { c.is_alphanumeric() || c == '_' } fn is_digit(b: u8) -> bool { b'0' <= b && b <= b'9' } fn is_hex_digit(b: u8) -> bool { is_digit(b) || (b'a' <= (b | 32) && (b | 32) <= b'f') } pub(crate) fn make_literal(s: &str) -> Expr { Expr::Literal { val: String::from(s), casei: false, } } #[cfg(test)] mod tests { use crate::parse::{make_literal, parse_id}; use crate::Expr; use crate::LookAround::*; use std::usize; fn p(s: &str) -> Expr { Expr::parse_tree(s).unwrap().expr } #[cfg_attr(feature = "track_caller", track_caller)] fn fail(s: &str) { assert!(Expr::parse_tree(s).is_err()); } #[cfg_attr(feature = "track_caller", track_caller)] fn assert_error(re: &str, expected_error: &str) { let result = Expr::parse_tree(re); assert!(result.is_err()); assert_eq!(&format!("{}", result.err().unwrap()), expected_error); } #[test] fn empty() { assert_eq!(p(""), Expr::Empty); } #[test] fn any() { assert_eq!(p("."), Expr::Any { newline: false }); assert_eq!(p("(?s:.)"), Expr::Any { newline: true }); } #[test] fn start_text() { assert_eq!(p("^"), Expr::StartText); } #[test] fn end_text() { assert_eq!(p("$"), Expr::EndText); } #[test] fn literal() { assert_eq!(p("a"), make_literal("a")); } #[test] fn literal_special() { assert_eq!(p("}"), make_literal("}")); assert_eq!(p("]"), make_literal("]")); } #[test] fn parse_id_test() { assert_eq!(parse_id("foo.", "", ""), Some(("foo", 3))); 
assert_eq!(parse_id("{foo}", "{", "}"), Some(("foo", 5))); assert_eq!(parse_id("{foo.", "{", "}"), None); assert_eq!(parse_id("{foo", "{", "}"), None); assert_eq!(parse_id("{}", "{", "}"), None); assert_eq!(parse_id("", "", ""), None); } #[test] fn literal_unescaped_opening_curly() { // `{` in position where quantifier is not allowed results in literal `{` assert_eq!(p("{"), make_literal("{")); assert_eq!(p("({)"), Expr::Group(Box::new(make_literal("{"),))); assert_eq!( p("a|{"), Expr::Alt(vec![make_literal("a"), make_literal("{"),]) ); assert_eq!( p("{{2}"), Expr::Repeat { child: Box::new(make_literal("{")), lo: 2, hi: 2, greedy: true } ); } #[test] fn literal_escape() { assert_eq!(p("\\'"), make_literal("'")); assert_eq!(p("\\\""), make_literal("\"")); assert_eq!(p("\\ "), make_literal(" ")); assert_eq!(p("\\xA0"), make_literal("\u{A0}")); assert_eq!(p("\\x{1F4A9}"), make_literal("\u{1F4A9}")); assert_eq!(p("\\x{000000B7}"), make_literal("\u{B7}")); assert_eq!(p("\\u21D2"), make_literal("\u{21D2}")); assert_eq!(p("\\u{21D2}"), make_literal("\u{21D2}")); assert_eq!(p("\\u21D2x"), p("\u{21D2}x")); assert_eq!(p("\\U0001F60A"), make_literal("\u{1F60A}")); assert_eq!(p("\\U{0001F60A}"), make_literal("\u{1F60A}")); } #[test] fn hex_escape() { assert_eq!( p("\\h"), Expr::Delegate { inner: String::from("[0-9A-Fa-f]"), size: 1, casei: false } ); assert_eq!( p("\\H"), Expr::Delegate { inner: String::from("[^0-9A-Fa-f]"), size: 1, casei: false } ); } #[test] fn invalid_escape() { assert_error("\\", "Backslash without following character"); assert_error("\\q", "Invalid escape: \\q"); assert_error("\\xAG", "Invalid hex escape"); assert_error("\\xA", "Invalid hex escape"); assert_error("\\x{}", "Invalid hex escape"); assert_error("\\x{AG}", "Invalid hex escape"); assert_error("\\x{42", "Invalid hex escape"); assert_error("\\x{D800}", "Invalid codepoint for hex or unicode escape"); assert_error("\\x{110000}", "Invalid codepoint for hex or unicode escape"); 
assert_error("\\u123", "Invalid hex escape"); assert_error("\\u123x", "Invalid hex escape"); assert_error("\\u{}", "Invalid hex escape"); assert_error("\\U1234567", "Invalid hex escape"); assert_error("\\U{}", "Invalid hex escape"); } #[test] fn concat() { assert_eq!( p("ab"), Expr::Concat(vec![make_literal("a"), make_literal("b"),]) ); } #[test] fn alt() { assert_eq!( p("a|b"), Expr::Alt(vec![make_literal("a"), make_literal("b"),]) ); } #[test] fn group() { assert_eq!(p("(a)"), Expr::Group(Box::new(make_literal("a"),))); } #[test] fn group_repeat() { assert_eq!( p("(a){2}"), Expr::Repeat { child: Box::new(Expr::Group(Box::new(make_literal("a")))), lo: 2, hi: 2, greedy: true } ); } #[test] fn repeat() { assert_eq!( p("a{2,42}"), Expr::Repeat { child: Box::new(make_literal("a")), lo: 2, hi: 42, greedy: true } ); assert_eq!( p("a{2,}"), Expr::Repeat { child: Box::new(make_literal("a")), lo: 2, hi: usize::MAX, greedy: true } ); assert_eq!( p("a{2}"), Expr::Repeat { child: Box::new(make_literal("a")), lo: 2, hi: 2, greedy: true } ); assert_eq!( p("a{,2}"), Expr::Repeat { child: Box::new(make_literal("a")), lo: 0, hi: 2, greedy: true } ); assert_eq!( p("a{2,42}?"), Expr::Repeat { child: Box::new(make_literal("a")), lo: 2, hi: 42, greedy: false } ); assert_eq!( p("a{2,}?"), Expr::Repeat { child: Box::new(make_literal("a")), lo: 2, hi: usize::MAX, greedy: false } ); assert_eq!( p("a{2}?"), Expr::Repeat { child: Box::new(make_literal("a")), lo: 2, hi: 2, greedy: false } ); assert_eq!( p("a{,2}?"), Expr::Repeat { child: Box::new(make_literal("a")), lo: 0, hi: 2, greedy: false } ); } #[test] fn invalid_repeat() { // Invalid repeat syntax results in literal assert_eq!( p("a{"), Expr::Concat(vec![make_literal("a"), make_literal("{"),]) ); assert_eq!( p("a{6"), Expr::Concat(vec![ make_literal("a"), make_literal("{"), make_literal("6"), ]) ); assert_eq!( p("a{6,"), Expr::Concat(vec![ make_literal("a"), make_literal("{"), make_literal("6"), make_literal(","), ]) ); } #[test] fn 
delegate_zero() { assert_eq!( p("\\b"), Expr::Delegate { inner: String::from("\\b"), size: 0, casei: false } ); assert_eq!( p("\\B"), Expr::Delegate { inner: String::from("\\B"), size: 0, casei: false } ); } #[test] fn delegate_named_group() { assert_eq!( p("\\p{Greek}"), Expr::Delegate { inner: String::from("\\p{Greek}"), size: 1, casei: false } ); assert_eq!( p("\\pL"), Expr::Delegate { inner: String::from("\\pL"), size: 1, casei: false } ); assert_eq!( p("\\P{Greek}"), Expr::Delegate { inner: String::from("\\P{Greek}"), size: 1, casei: false } ); assert_eq!( p("\\PL"), Expr::Delegate { inner: String::from("\\PL"), size: 1, casei: false } ); assert_eq!( p("(?i)\\p{Ll}"), Expr::Delegate { inner: String::from("\\p{Ll}"), size: 1, casei: true } ); } #[test] fn backref() { assert_eq!( p("(.)\\1"), Expr::Concat(vec![ Expr::Group(Box::new(Expr::Any { newline: false })), Expr::Backref(1), ]) ); } #[test] fn named_backref() { assert_eq!( p("(?.)\\k"), Expr::Concat(vec![ Expr::Group(Box::new(Expr::Any { newline: false })), Expr::Backref(1), ]) ); } #[test] fn lookaround() { assert_eq!( p("(?=a)"), Expr::LookAround(Box::new(make_literal("a")), LookAhead) ); assert_eq!( p("(?!a)"), Expr::LookAround(Box::new(make_literal("a")), LookAheadNeg) ); assert_eq!( p("(?<=a)"), Expr::LookAround(Box::new(make_literal("a")), LookBehind) ); assert_eq!( p("(?a)"), Expr::AtomicGroup(Box::new(make_literal("a")))); } #[test] fn possessive() { assert_eq!( p("a++"), Expr::AtomicGroup(Box::new(Expr::Repeat { child: Box::new(make_literal("a")), lo: 1, hi: usize::MAX, greedy: true })) ); assert_eq!( p("a*+"), Expr::AtomicGroup(Box::new(Expr::Repeat { child: Box::new(make_literal("a")), lo: 0, hi: usize::MAX, greedy: true })) ); assert_eq!( p("a?+"), Expr::AtomicGroup(Box::new(Expr::Repeat { child: Box::new(make_literal("a")), lo: 0, hi: 1, greedy: true })) ); } #[test] fn invalid_backref() { // only syntactic tests; see similar test in analyze module fail(".\\12345678"); // unreasonably large 
number fail(".\\c"); // not decimal } #[test] fn invalid_group_name_backref() { assert_error( "\\k(?.)", "Invalid group name in back reference: id", ); } #[test] fn named_backref_only() { assert_error("(?.)\\1", "Numbered backref/call not allowed because named group was used, use a named backref instead"); assert_error("(a)\\1(?b)", "Numbered backref/call not allowed because named group was used, use a named backref instead"); } #[test] fn invalid_group_name() { assert_error("(?)", "Could not parse group name"); assert_error("(?<#>)", "Could not parse group name"); assert_error("\\kxxx", "Could not parse group name"); } #[test] fn unknown_flag() { assert_error("(?-:a)", "Unknown group flag: (?-:"); assert_error("(?)", "Unknown group flag: (?)"); assert_error("(?--)", "Unknown group flag: (?--"); // Check that we don't split on char boundary assert_error("(?\u{1F60A})", "Unknown group flag: (?\u{1F60A}"); } #[test] fn no_quantifiers_on_lookarounds() { assert_error("(?=hello)+", "Target of repeat operator is invalid"); assert_error("(? String` (or any /// `FnMut(&Captures) -> T` where `T: AsRef`), which covers most use cases. pub trait Replacer { /// Appends text to `dst` to replace the current match. /// /// The current match is represented by `caps`, which is guaranteed to /// have a match at capture group `0`. /// /// For example, a no-op replacement would be /// `dst.push_str(caps.get(0).unwrap().as_str())`. fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String); /// Return a fixed unchanging replacement string. /// /// When doing replacements, if access to `Captures` is not needed (e.g., /// the replacement byte string does not need `$` expansion), then it can /// be beneficial to avoid finding sub-captures. /// /// In general, this is called once for every call to `replacen`. fn no_expansion(&mut self) -> Option> { None } /// Return a `Replacer` that borrows and wraps this `Replacer`. 
/// /// This is useful when you want to take a generic `Replacer` (which might /// not be cloneable) and use it without consuming it, so it can be used /// more than once. /// /// # Example /// /// ``` /// use fancy_regex::{Regex, Replacer}; /// /// fn replace_all_twice( /// re: Regex, /// src: &str, /// mut rep: R, /// ) -> String { /// let dst = re.replace_all(src, rep.by_ref()); /// let dst = re.replace_all(&dst, rep.by_ref()); /// dst.into_owned() /// } /// ``` fn by_ref(&mut self) -> ReplacerRef { ReplacerRef(self) } } /// By-reference adaptor for a `Replacer` /// /// Returned by [`Replacer::by_ref`](trait.Replacer.html#method.by_ref). #[derive(Debug)] pub struct ReplacerRef<'a, R: ?Sized>(&'a mut R); impl<'a, R: Replacer + ?Sized + 'a> Replacer for ReplacerRef<'a, R> { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.0.replace_append(caps, dst) } fn no_expansion(&mut self) -> Option> { self.0.no_expansion() } } impl<'a> Replacer for &'a str { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { caps.expand(*self, dst); } fn no_expansion(&mut self) -> Option> { no_expansion(self) } } impl<'a> Replacer for &'a String { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.as_str().replace_append(caps, dst) } fn no_expansion(&mut self) -> Option> { no_expansion(self) } } impl Replacer for String { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.as_str().replace_append(caps, dst) } fn no_expansion(&mut self) -> Option> { no_expansion(self) } } impl<'a> Replacer for Cow<'a, str> { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.as_ref().replace_append(caps, dst) } fn no_expansion(&mut self) -> Option> { no_expansion(self) } } impl<'a> Replacer for &'a Cow<'a, str> { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { self.as_ref().replace_append(caps, dst) } fn no_expansion(&mut self) -> Option> { no_expansion(self) } } fn 
no_expansion>(t: &T) -> Option> { let s = t.as_ref(); if s.contains('$') { None } else { Some(Cow::Borrowed(s)) } } impl Replacer for F where F: FnMut(&Captures<'_>) -> T, T: AsRef, { fn replace_append(&mut self, caps: &Captures<'_>, dst: &mut String) { dst.push_str((*self)(caps).as_ref()); } } /// `NoExpand` indicates literal string replacement. /// /// It can be used with `replace` and `replace_all` to do a literal string /// replacement without expanding `$name` to their corresponding capture /// groups. This can be both convenient (to avoid escaping `$`, for example) /// and performant (since capture groups don't need to be found). /// /// `'t` is the lifetime of the literal text. #[derive(Clone, Debug)] pub struct NoExpand<'t>(pub &'t str); impl<'t> Replacer for NoExpand<'t> { fn replace_append(&mut self, _: &Captures<'_>, dst: &mut String) { dst.push_str(self.0); } fn no_expansion(&mut self) -> Option> { Some(Cow::Borrowed(self.0)) } } fancy-regex-0.7.1/src/vm.rs000064400000000000000000000735000000000000000136020ustar 00000000000000// Copyright 2016 The Fancy Regex Authors. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. //! Backtracking VM for implementing fancy regexes. //! //! Read https://swtch.com/~rsc/regexp/regexp2.html for a good introduction for how this works. //! //! The VM executes a sequence of instructions (a program) against an input string. It keeps track //! of a program counter (PC) and an index into the string (IX). Execution can have one or more //! threads. //! //! One of the basic instructions is `Lit`, which matches a string against the input. If it matches, //! the PC advances to the next instruction and the IX to the position after the matched string. //! If not, the current thread is stopped because it failed. //! //! If execution reaches an `End` instruction, the program is successful because a match was found. //! If there are no more threads to execute, the program has failed to match. //! //! A very simple program for the regex `a`: //! //! ```text //! 0: Lit("a") //! 1: End //! ``` //! //! The `Split` instruction causes execution to split into two threads. The first thread is executed //! with the current string index. If it fails, we reset the string index and resume execution with //! the second thread. That is what "backtracking" refers to. In order to do that, we keep a stack //! of threads (PC and IX) to try. //! //! Example program for the regex `ab|ac`: //! //! ```text //! 0: Split(1, 4) //! 1: Lit("a") //! 2: Lit("b") //! 3: Jmp(6) //! 4: Lit("a") //! 5: Lit("c") //! 6: End //! ``` //! //! The `Jmp` instruction causes execution to jump to the specified instruction. In the example it //! is needed to separate the two threads. //! //! Let's step through execution with that program for the input `ac`: //! //! 1. We're at PC 0 and IX 0 //! 2. 
`Split(1, 4)` means we save a thread with PC 4 and IX 0 for trying later //! 3. Continue at `Lit("a")` which matches, so we advance IX to 1 //! 4. `Lit("b")` doesn't match at IX 1 (`"b" != "c"`), so the thread fails //! 5. We continue with the previously saved thread at PC 4 and IX 0 (backtracking) //! 6. Both `Lit("a")` and `Lit("c")` match and we reach `End` -> successful match (index 0 to 2) use regex::Regex; use std::collections::BTreeSet; use std::usize; use crate::prev_codepoint_ix; use crate::Error; use crate::Result; use crate::{codepoint_len, RegexOptions}; const OPTION_TRACE: u32 = 1; // TODO: make configurable const MAX_STACK: usize = 1_000_000; /// Instruction of the VM. #[derive(Debug, Clone)] pub enum Insn { /// Successful end of program End, /// Match any character (including newline) Any, /// Match any character (not including newline) AnyNoNL, /// Match the literal string at the current index Lit(String), // should be cow? /// Split execution into two threads. The two fields are positions of instructions. Execution /// first tries the first thread. If that fails, the second position is tried. 
Split(usize, usize), /// Jump to instruction at position Jmp(usize), /// Save the current string index into the specified slot Save(usize), /// Save `0` into the specified slot Save0(usize), /// Set the string index to the value that was saved in the specified slot Restore(usize), /// Repeat greedily (match as much as possible) RepeatGr { /// Minimum number of matches lo: usize, /// Maximum number of matches hi: usize, /// The instruction after the repeat next: usize, /// The slot for keeping track of the number of repetitions repeat: usize, }, /// Repeat non-greedily (prefer matching as little as possible) RepeatNg { /// Minimum number of matches lo: usize, /// Maximum number of matches hi: usize, /// The instruction after the repeat next: usize, /// The slot for keeping track of the number of repetitions repeat: usize, }, /// Repeat greedily and prevent infinite loops from empty matches RepeatEpsilonGr { /// Minimum number of matches lo: usize, /// The instruction after the repeat next: usize, /// The slot for keeping track of the number of repetitions repeat: usize, /// The slot for saving the previous IX to check if we had an empty match check: usize, }, /// Repeat non-greedily and prevent infinite loops from empty matches RepeatEpsilonNg { /// Minimum number of matches lo: usize, /// The instruction after the repeat next: usize, /// The slot for keeping track of the number of repetitions repeat: usize, /// The slot for saving the previous IX to check if we had an empty match check: usize, }, /// Negative look-around failed FailNegativeLookAround, /// Set IX back by the specified number of characters GoBack(usize), /// Back reference to a group number to check Backref(usize), /// Begin of atomic group BeginAtomic, /// End of atomic group EndAtomic, /// Delegate matching to the regex crate for a fixed size DelegateSized(Box, usize), /// Delegate matching to the regex crate Delegate { /// The regex inner: Box, /// The same regex but matching an additional 
character on the left. /// /// E.g. if `inner` is `^\b`, `inner1` is `^(?s:.)\b`. Why do we need this? Because `\b` /// needs to know the previous character to work correctly. Let's say we're currently at the /// second character of the string `xy`. Should `\b` match there? No. But if we'd run `^\b` /// against `y`, it would match (incorrect). To do the right thing, we run `^(?s:.)\b` /// against `xy`, which does not match. /// /// We only need this for regexes that "look left", i.e. need to know what the previous /// character was. inner1: Option>, /// The first group number that this regex captures (if it contains groups) start_group: usize, /// The last group number end_group: usize, }, } /// Sequence of instructions for the VM to execute. #[derive(Debug, Clone)] pub struct Prog { /// Instructions of the program pub body: Vec, n_saves: usize, } impl Prog { pub(crate) fn new(body: Vec, n_saves: usize) -> Prog { Prog { body, n_saves } } #[doc(hidden)] pub(crate) fn debug_print(&self) { for (i, insn) in self.body.iter().enumerate() { println!("{:3}: {:?}", i, insn); } } } #[derive(Debug)] struct Branch { pc: usize, ix: usize, nsave: usize, } #[derive(Debug)] struct Save { slot: usize, value: usize, } struct State { /// Saved values indexed by slot. Mostly indices to s, but can be repeat values etc. /// Always contains the saves of the current state. saves: Vec, /// Stack of backtrack branches. stack: Vec, /// Old saves (slot, value) oldsave: Vec, /// Number of saves at the end of `oldsave` that need to be restored to `saves` on pop nsave: usize, explicit_sp: usize, /// Maximum size of the stack. If the size would be exceeded during execution, a `StackOverflow` /// error is raised. max_stack: usize, options: u32, } // Each element in the stack conceptually represents the entire state // of the machine: the pc (index into prog), the index into the // string, and the entire vector of saves. 
However, copying the save // vector on every push/pop would be inefficient, so instead we use a // copy-on-write approach for each slot within the save vector. The // top `nsave` elements in `oldsave` represent the delta from the // current machine state to the top of stack. impl State { fn new(n_saves: usize, max_stack: usize, options: u32) -> State { State { saves: vec![usize::MAX; n_saves], stack: Vec::new(), oldsave: Vec::new(), nsave: 0, explicit_sp: n_saves, max_stack, options, } } // push a backtrack branch fn push(&mut self, pc: usize, ix: usize) -> Result<()> { if self.stack.len() < self.max_stack { let nsave = self.nsave; self.stack.push(Branch { pc, ix, nsave }); self.nsave = 0; self.trace_stack("push"); Ok(()) } else { Err(Error::StackOverflow) } } // pop a backtrack branch fn pop(&mut self) -> (usize, usize) { for _ in 0..self.nsave { let Save { slot, value } = self.oldsave.pop().unwrap(); self.saves[slot] = value; } let Branch { pc, ix, nsave } = self.stack.pop().unwrap(); self.nsave = nsave; self.trace_stack("pop"); (pc, ix) } fn save(&mut self, slot: usize, val: usize) { for i in 0..self.nsave { // could avoid this iteration with some overhead; worth it? if self.oldsave[self.oldsave.len() - i - 1].slot == slot { // already saved, just update self.saves[slot] = val; return; } } self.oldsave.push(Save { slot, value: self.saves[slot], }); self.nsave += 1; self.saves[slot] = val; if self.options & OPTION_TRACE != 0 { println!("saves: {:?}", self.saves); } } fn get(&self, slot: usize) -> usize { self.saves[slot] } // push a value onto the explicit stack; note: the entire contents of // the explicit stack is saved and restored on backtrack. 
fn stack_push(&mut self, val: usize) { if self.saves.len() == self.explicit_sp { self.saves.push(self.explicit_sp + 1); } let explicit_sp = self.explicit_sp; let sp = self.get(explicit_sp); if self.saves.len() == sp { self.saves.push(val); } else { self.save(sp, val); } self.save(explicit_sp, sp + 1); } // pop a value from the explicit stack fn stack_pop(&mut self) -> usize { let explicit_sp = self.explicit_sp; let sp = self.get(explicit_sp) - 1; let result = self.get(sp); self.save(explicit_sp, sp); result } /// Get the current number of backtrack branches fn backtrack_count(&self) -> usize { self.stack.len() } /// Discard backtrack branches that were pushed since the call to `backtrack_count`. /// /// What we want: /// * Keep the current `saves` as they are /// * Only keep `count` backtrack branches on `stack`, discard the rest /// * Keep the first `oldsave` for each slot, discard the rest (multiple pushes might have /// happened with saves to the same slot) fn backtrack_cut(&mut self, count: usize) { if self.stack.len() == count { // no backtrack branches to discard, all good return; } // start and end indexes of old saves for the branch we're cutting to let (oldsave_start, oldsave_end) = { let mut end = self.oldsave.len() - self.nsave; for &Branch { nsave, .. } in &self.stack[count + 1..] { end -= nsave; } let start = end - self.stack[count].nsave; (start, end) }; let mut saved = BTreeSet::new(); // keep all the old saves of our branch (they're all for different slots) for &Save { slot, .. } in &self.oldsave[oldsave_start..oldsave_end] { saved.insert(slot); } let mut oldsave_ix = oldsave_end; // for other old saves, keep them only if they're for a slot that we haven't saved yet for ix in oldsave_end..self.oldsave.len() { let Save { slot, .. 
} = self.oldsave[ix]; let new_slot = saved.insert(slot); if new_slot { // put the save we want to keep (ix) after the ones we already have (oldsave_ix) // note that it's fine if the indexes are the same (then swapping is a no-op) self.oldsave.swap(oldsave_ix, ix); oldsave_ix += 1; } } self.stack.truncate(count); self.oldsave.truncate(oldsave_ix); self.nsave = oldsave_ix - oldsave_start; } #[inline] fn trace_stack(&self, operation: &str) { if self.options & OPTION_TRACE != 0 { println!("stack after {}: {:?}", operation, self.stack); } } } fn codepoint_len_at(s: &str, ix: usize) -> usize { codepoint_len(s.as_bytes()[ix]) } #[inline] fn matches_literal(s: &str, ix: usize, end: usize, literal: &str) -> bool { // Compare as bytes because the literal might be a single byte char whereas ix // points to a multibyte char. Comparing with str would result in an error like // "byte index N is not a char boundary". end <= s.len() && &s.as_bytes()[ix..end] == literal.as_bytes() } /// Run the program with trace printing for debugging. pub fn run_trace(prog: &Prog, s: &str, pos: usize) -> Result>> { run(prog, s, pos, OPTION_TRACE, &RegexOptions::default()) } /// Run the program with default options. pub fn run_default(prog: &Prog, s: &str, pos: usize) -> Result>> { run(prog, s, pos, 0, &RegexOptions::default()) } /// Run the program with options. 
#[allow(clippy::cognitive_complexity)] pub(crate) fn run( prog: &Prog, s: &str, pos: usize, option_flags: u32, options: &RegexOptions, ) -> Result>> { let mut state = State::new(prog.n_saves, MAX_STACK, option_flags); if option_flags & OPTION_TRACE != 0 { println!("pos\tinstruction"); } let mut backtrack_count = 0; let mut pc = 0; let mut ix = pos; loop { // break from this loop to fail, causes stack to pop 'fail: loop { if option_flags & OPTION_TRACE != 0 { println!("{}\t{} {:?}", ix, pc, prog.body[pc]); } match prog.body[pc] { Insn::End => { // save of end position into slot 1 is now done // with an explicit group; we might want to // optimize that. //state.saves[1] = ix; if option_flags & OPTION_TRACE != 0 { println!("saves: {:?}", state.saves); } return Ok(Some(state.saves)); } Insn::Any => { if ix < s.len() { ix += codepoint_len_at(s, ix); } else { break 'fail; } } Insn::AnyNoNL => { if ix < s.len() && s.as_bytes()[ix] != b'\n' { ix += codepoint_len_at(s, ix); } else { break 'fail; } } Insn::Lit(ref val) => { let ix_end = ix + val.len(); if !matches_literal(s, ix, ix_end, val) { break 'fail; } ix = ix_end; } Insn::Split(x, y) => { state.push(y, ix)?; pc = x; continue; } Insn::Jmp(target) => { pc = target; continue; } Insn::Save(slot) => state.save(slot, ix), Insn::Save0(slot) => state.save(slot, 0), Insn::Restore(slot) => ix = state.get(slot), Insn::RepeatGr { lo, hi, next, repeat, } => { let repcount = state.get(repeat); if repcount == hi { pc = next; continue; } state.save(repeat, repcount + 1); if repcount >= lo { state.push(next, ix)?; } } Insn::RepeatNg { lo, hi, next, repeat, } => { let repcount = state.get(repeat); if repcount == hi { pc = next; continue; } state.save(repeat, repcount + 1); if repcount >= lo { state.push(pc + 1, ix)?; pc = next; continue; } } Insn::RepeatEpsilonGr { lo, next, repeat, check, } => { let repcount = state.get(repeat); if repcount > lo && state.get(check) == ix { // prevent zero-length match on repeat break 'fail; } 
state.save(repeat, repcount + 1); if repcount >= lo { state.save(check, ix); state.push(next, ix)?; } } Insn::RepeatEpsilonNg { lo, next, repeat, check, } => { let repcount = state.get(repeat); if repcount > lo && state.get(check) == ix { // prevent zero-length match on repeat break 'fail; } state.save(repeat, repcount + 1); if repcount >= lo { state.save(check, ix); state.push(pc + 1, ix)?; pc = next; continue; } } Insn::GoBack(count) => { for _ in 0..count { if ix == 0 { break 'fail; } ix = prev_codepoint_ix(s, ix); } } Insn::FailNegativeLookAround => { // Reaching this instruction means that the body of the // look-around matched. Because it's a *negative* look-around, // that means the look-around itself should fail (not match). // But before, we need to discard all the states that have // been pushed with the look-around, because we don't want to // explore them. loop { let (popped_pc, _) = state.pop(); if popped_pc == pc + 1 { // We've reached the state that would jump us to // after the look-around (in case the look-around // succeeded). That means we popped enough states. break; } } break 'fail; } Insn::Backref(slot) => { let lo = state.get(slot); if lo == usize::MAX { // Referenced group hasn't matched, so the backref doesn't match either break 'fail; } let hi = state.get(slot + 1); let ref_text = &s[lo..hi]; let ix_end = ix + ref_text.len(); if !matches_literal(s, ix, ix_end, ref_text) { break 'fail; } ix = ix_end; } Insn::BeginAtomic => { let count = state.backtrack_count(); state.stack_push(count); } Insn::EndAtomic => { let count = state.stack_pop(); state.backtrack_cut(count); } Insn::DelegateSized(ref inner, size) => { if inner.is_match(&s[ix..]) { // We could analyze for ascii-only, and ix += size in // that case. Unlikely to be speed-limiting though. 
for _ in 0..size { ix += codepoint_len_at(s, ix); } } else { break 'fail; } } Insn::Delegate { ref inner, ref inner1, start_group, end_group, } => { // Note: Why can't we use `find_at` or `captures_read_at` here instead of the // `inner1` regex? We only want to match at the current location, so our regexes // need to have an anchor: `^foo` (without `^`, it would match `foo` anywhere). // But regex like `^foo` won't match in `bar foo` with `find_at(s, 4)` because // `^` only matches at the beginning of the text. let re = match *inner1 { Some(ref inner1) if ix > 0 => { ix = prev_codepoint_ix(s, ix); inner1 } _ => inner, }; if start_group == end_group { // No groups, so we can use `find` which is faster than `captures_read` match re.find(&s[ix..]) { Some(m) => ix += m.end(), _ => break 'fail, } } else { let mut locations = re.capture_locations(); if let Some(m) = re.captures_read(&mut locations, &s[ix..]) { for i in 0..(end_group - start_group) { let slot = (start_group + i) * 2; if let Some((start, end)) = locations.get(i + 1) { state.save(slot, ix + start); state.save(slot + 1, ix + end); } else { state.save(slot, usize::MAX); state.save(slot + 1, usize::MAX); } } ix += m.end(); } else { break 'fail; } } } } pc += 1; } if option_flags & OPTION_TRACE != 0 { println!("fail"); } // "break 'fail" goes here if state.stack.is_empty() { return Ok(None); } backtrack_count += 1; if backtrack_count > options.backtrack_limit { return Err(Error::BacktrackLimitExceeded); } let (newpc, newix) = state.pop(); pc = newpc; ix = newix; } } #[cfg(test)] mod tests { use super::*; use quickcheck::{quickcheck, Arbitrary, Gen}; #[test] fn state_push_pop() { let mut state = State::new(1, MAX_STACK, 0); state.push(0, 0).unwrap(); state.push(1, 1).unwrap(); assert_eq!(state.pop(), (1, 1)); assert_eq!(state.pop(), (0, 0)); assert!(state.stack.is_empty()); state.push(2, 2).unwrap(); assert_eq!(state.pop(), (2, 2)); assert!(state.stack.is_empty()); } #[test] fn state_save_override() { let mut 
state = State::new(1, MAX_STACK, 0); state.save(0, 10); state.push(0, 0).unwrap(); state.save(0, 20); assert_eq!(state.pop(), (0, 0)); assert_eq!(state.get(0), 10); } #[test] fn state_save_override_twice() { let mut state = State::new(1, MAX_STACK, 0); state.save(0, 10); state.push(0, 0).unwrap(); state.save(0, 20); state.push(1, 1).unwrap(); state.save(0, 30); assert_eq!(state.get(0), 30); assert_eq!(state.pop(), (1, 1)); assert_eq!(state.get(0), 20); assert_eq!(state.pop(), (0, 0)); assert_eq!(state.get(0), 10); } #[test] fn state_explicit_stack() { let mut state = State::new(1, MAX_STACK, 0); state.stack_push(11); state.stack_push(12); state.push(100, 101).unwrap(); state.stack_push(13); assert_eq!(state.stack_pop(), 13); state.stack_push(14); assert_eq!(state.pop(), (100, 101)); // Note: 14 is not there because it was pushed as part of the backtrack branch assert_eq!(state.stack_pop(), 12); assert_eq!(state.stack_pop(), 11); } #[test] fn state_backtrack_cut_simple() { let mut state = State::new(2, MAX_STACK, 0); state.save(0, 1); state.save(1, 2); let count = state.backtrack_count(); state.push(0, 0).unwrap(); state.save(0, 3); assert_eq!(state.backtrack_count(), 1); state.backtrack_cut(count); assert_eq!(state.backtrack_count(), 0); assert_eq!(state.get(0), 3); assert_eq!(state.get(1), 2); } #[test] fn state_backtrack_cut_complex() { let mut state = State::new(2, MAX_STACK, 0); state.save(0, 1); state.save(1, 2); state.push(0, 0).unwrap(); state.save(0, 3); let count = state.backtrack_count(); state.push(1, 1).unwrap(); state.save(0, 4); state.push(2, 2).unwrap(); state.save(1, 5); assert_eq!(state.backtrack_count(), 3); state.backtrack_cut(count); assert_eq!(state.backtrack_count(), 1); assert_eq!(state.get(0), 4); assert_eq!(state.get(1), 5); state.pop(); assert_eq!(state.backtrack_count(), 0); // Check that oldsave were set correctly assert_eq!(state.get(0), 1); assert_eq!(state.get(1), 2); } #[derive(Clone, Debug)] enum Operation { Push, Pop, Save(usize, 
usize), } impl Arbitrary for Operation { fn arbitrary(g: &mut Gen) -> Self { match g.choose(&[0, 1, 2]) { Some(0) => Operation::Push, Some(1) => Operation::Pop, _ => Operation::Save( *g.choose(&[0usize, 1, 2, 3, 4]).unwrap(), usize::arbitrary(g), ), } } } fn check_saves_for_operations(operations: Vec) -> bool { let slots = operations .iter() .map(|o| match o { &Operation::Save(slot, _) => slot + 1, _ => 0, }) .max() .unwrap_or(0); if slots == 0 { // No point checking if there's no save instructions return true; } // Stack with the complete VM state (including saves) let mut stack = Vec::new(); let mut saves = vec![usize::MAX; slots]; let mut state = State::new(slots, MAX_STACK, 0); let mut expected = Vec::new(); let mut actual = Vec::new(); for operation in operations { match operation { Operation::Push => { // We're not checking pc and ix later, so don't bother // putting in random values. stack.push((0, 0, saves.clone())); state.push(0, 0).unwrap(); } Operation::Pop => { // Note that because we generate the operations randomly // there might be more pops than pushes. So ignore a pop // if the stack was empty. if let Some((_, _, previous_saves)) = stack.pop() { saves = previous_saves; state.pop(); } } Operation::Save(slot, value) => { saves[slot] = value; state.save(slot, value); } } // Remember state of saves for checking later expected.push(saves.clone()); let mut actual_saves = vec![usize::MAX; slots]; for i in 0..slots { actual_saves[i] = state.get(i); } actual.push(actual_saves); } expected == actual } quickcheck! 
{ fn state_save_quickcheck(operations: Vec) -> bool { check_saves_for_operations(operations) } } } fancy-regex-0.7.1/tests/captures.rs000064400000000000000000000246160000000000000153650ustar 00000000000000use fancy_regex::{Captures, Error, Expander, Match, Result}; use std::borrow::Cow; use std::ops::Index; mod common; #[test] fn capture_names() { let regex = common::regex("(?)()(?P)"); let capture_names = regex.capture_names().collect::>(); assert_eq!(capture_names, vec![None, Some("foo"), None, Some("bar")]); } #[test] fn captures_fancy() { let captures = captures(r"\s*(\w+)(?=\.)", "foo bar."); assert_eq!(captures.len(), 2); assert_match(captures.get(0), " bar", 3, 7); assert_match(captures.get(1), "bar", 4, 7); assert!(captures.get(2).is_none()); } #[test] fn captures_fancy_named() { let captures = captures(r"\s*(?\w+)(?=\.)", "foo bar."); assert_eq!(captures.len(), 2); assert_match(captures.get(0), " bar", 3, 7); assert_match(captures.name("name"), "bar", 4, 7); assert_eq!(captures.index(0), " bar"); assert_eq!(captures.index("name"), "bar"); assert!(captures.get(2).is_none()); } #[test] fn captures_fancy_unmatched_group() { let captures = captures(r"(\w+)(?=\.)|(\w+)(?=!)", "foo! 
bar."); assert_eq!(captures.len(), 3); assert_match(captures.get(0), "foo", 0, 3); assert!(captures.get(1).is_none()); assert_match(captures.get(2), "foo", 0, 3); } #[test] fn captures_after_lookbehind() { let captures = captures( r"\s*(?<=[() ])(@\w+)(\([^)]*\))?\s*", " @another(foo bar) ", ); assert_match(captures.get(1), "@another", 1, 9); assert_match(captures.get(2), "(foo bar)", 9, 18); } #[test] fn captures_iter() { let text = "11 21 33"; for (i, captures) in common::regex(r"(?P\d)\d") .captures_iter(text) .enumerate() { let captures = captures.unwrap(); match i { 0 => { assert_eq!(captures.len(), 2); assert_match(captures.get(0), "11", 0, 2); assert_match(captures.name("num"), "1", 0, 1); } 1 => { assert_eq!(captures.len(), 2); assert_match(captures.get(0), "21", 3, 5); assert_match(captures.name("num"), "2", 3, 4); } 2 => { assert_eq!(captures.len(), 2); assert_match(captures.get(0), "33", 6, 8); assert_match(captures.name("num"), "3", 6, 7); } i => panic!("Expected 3 captures, got {}", i + 1), } } } #[test] fn captures_iter_attributes() { let text = "11 21 33"; let regex = common::regex(r"(?P\d)\d"); let all_captures = regex.captures_iter(text); assert_eq!(all_captures.text(), text); assert_eq!(regex.as_str(), all_captures.regex().as_str()); } #[test] fn captures_from_pos() { let text = "11 21 33"; let regex = common::regex(r"(\d)\d"); let captures = assert_captures(regex.captures_from_pos(text, 3)); assert_eq!(captures.len(), 2); assert_match(captures.get(0), "21", 3, 5); assert_match(captures.get(1), "2", 3, 4); let matches: Vec<_> = captures.iter().collect(); assert_eq!(matches.len(), 2); assert_match(matches[0], "21", 3, 5); assert_match(matches[1], "2", 3, 4); let regex = common::regex(r"(\d+)\1"); let captures = assert_captures(regex.captures_from_pos(text, 3)); assert_eq!(captures.len(), 2); assert_match(captures.get(0), "33", 6, 8); assert_match(captures.get(1), "3", 6, 7); let matches: Vec<_> = captures.iter().collect(); assert_eq!(matches.len(), 
2); assert_match(matches[0], "33", 6, 8); assert_match(matches[1], "3", 6, 7); let regex = common::regex(r"(?P\d+)\k"); let captures = assert_captures(regex.captures_from_pos(text, 3)); assert_eq!(captures.len(), 2); assert_match(captures.get(0), "33", 6, 8); assert_match(captures.name("foo"), "3", 6, 7); let matches: Vec<_> = captures.iter().collect(); assert_eq!(matches.len(), 2); assert_match(matches[0], "33", 6, 8); assert_match(matches[1], "3", 6, 7); let regex = common::regex(r"(?P\d+)(?P=foo)"); let captures = assert_captures(regex.captures_from_pos(text, 3)); assert_eq!(captures.len(), 2); assert_match(captures.get(0), "33", 6, 8); assert_match(captures.name("foo"), "3", 6, 7); let matches: Vec<_> = captures.iter().collect(); assert_eq!(matches.len(), 2); assert_match(matches[0], "33", 6, 8); assert_match(matches[1], "3", 6, 7); } #[test] fn captures_from_pos_looking_left() { let regex = common::regex(r"\b(\w)"); // This should *not* match because `\b` doesn't match between a and x let result = regex.captures_from_pos("ax", 1).unwrap(); assert!(result.is_none()); let captures = assert_captures(regex.captures_from_pos(".x", 1)); assert_eq!(captures.len(), 2); assert_match(captures.get(0), "x", 1, 2); assert_match(captures.get(1), "x", 1, 2); } #[cfg_attr(feature = "track_caller", track_caller)] fn captures<'a>(re: &str, text: &'a str) -> Captures<'a> { let regex = common::regex(re); let result = regex.captures(text); assert_captures(result) } #[cfg_attr(feature = "track_caller", track_caller)] fn assert_captures(result: Result>>) -> Captures<'_> { assert!( result.is_ok(), "Expected captures to succeed, but was {:?}", result ); let captures = result.unwrap(); assert!( captures.is_some(), "Expected captures, but was {:?}", captures ); captures.unwrap() } #[cfg_attr(feature = "track_caller", track_caller)] fn assert_match(m: Option>, expected_text: &str, start: usize, end: usize) { assert!(m.is_some(), "Expected match, but was {:?}", m); let m = m.unwrap(); 
assert_eq!(m.as_str(), expected_text); assert_eq!(m.start(), start); assert_eq!(m.end(), end); } #[test] fn expand() { let regex = common::regex("(a)(b)(?<π>c)(?Pd)"); let cap = regex.captures("abcd").unwrap().expect("matched"); assert_expansion(&cap, "$0", "abcd"); assert_expansion(&cap, "$1", "a"); assert_expansion(&cap, "$2", "b"); assert_expansion(&cap, "$3", "c"); assert_expansion(&cap, "$4", "d"); assert_expansion(&cap, "$π", "c"); assert_expansion(&cap, "$x", "d"); assert_expansion(&cap, "$0π", ""); assert_expansion(&cap, "$1π", ""); assert_expansion(&cap, "$2π", ""); assert_expansion(&cap, "$3π", ""); assert_expansion(&cap, "$4π", ""); assert_expansion(&cap, "$ππ", ""); assert_expansion(&cap, "$xπ", ""); assert_expansion(&cap, "${0}π", "abcdπ"); assert_expansion(&cap, "${1}π", "aπ"); assert_expansion(&cap, "${2}π", "bπ"); assert_expansion(&cap, "${3}π", "cπ"); assert_expansion(&cap, "${4}π", "dπ"); assert_expansion(&cap, "${π}π", "cπ"); assert_expansion(&cap, "${x}π", "dπ"); assert_expansion(&cap, "$", "$"); assert_expansion(&cap, "$π√", "c√"); assert_expansion(&cap, "$x√", "d√"); assert_expansion(&cap, "$$π", "$π"); assert_expansion(&cap, "${π", "${π"); assert_python_expansion(&cap, "\\0", "abcd"); assert_python_expansion(&cap, "\\1", "a"); assert_python_expansion(&cap, "\\2", "b"); assert_python_expansion(&cap, "\\3", "c"); assert_python_expansion(&cap, "\\4", "d"); assert_python_expansion(&cap, "\\π", "\\π"); assert_python_expansion(&cap, "\\x", "\\x"); assert_python_expansion(&cap, "\\0π", "abcdπ"); assert_python_expansion(&cap, "\\1π", "aπ"); assert_python_expansion(&cap, "\\2π", "bπ"); assert_python_expansion(&cap, "\\3π", "cπ"); assert_python_expansion(&cap, "\\4π", "dπ"); assert_python_expansion(&cap, "\\ππ", "\\ππ"); assert_python_expansion(&cap, "\\xπ", "\\xπ"); assert_python_expansion(&cap, "\\g<0>π", "abcdπ"); assert_python_expansion(&cap, "\\g<1>π", "aπ"); assert_python_expansion(&cap, "\\g<2>π", "bπ"); assert_python_expansion(&cap, "\\g<3>π", 
"cπ"); assert_python_expansion(&cap, "\\g<4>π", "dπ"); assert_python_expansion(&cap, "\\g<π>π", "cπ"); assert_python_expansion(&cap, "\\gπ", "dπ"); assert_python_expansion(&cap, "\\", "\\"); assert_python_expansion(&cap, "\\\\π", "\\π"); assert_python_expansion(&cap, "\\g<π", "\\g<π"); } #[cfg_attr(feature = "track_caller", track_caller)] fn assert_expansion(cap: &Captures, replacement: &str, text: &str) { let mut buf = "before".to_string(); cap.expand(replacement, &mut buf); assert_eq!(buf, format!("before{}", text)); } #[cfg_attr(feature = "track_caller", track_caller)] fn assert_python_expansion(cap: &Captures, replacement: &str, text: &str) { assert_eq!(Expander::python().expansion(replacement, cap), text); } #[test] fn expander_escape() { match Expander::default().escape("hello") { Cow::Borrowed(s) => assert_eq!(s, "hello"), _ => panic!("string should be borrowed"), } assert_eq!(Expander::default().escape("a$b\\c"), "a$$b\\c"); assert_eq!(Expander::python().escape("a$b\\c"), "a$b\\\\c"); } #[test] fn expander_errors() { let with_names = common::regex("(?a)"); let without_names = common::regex("(a)"); let exp = Expander::default(); macro_rules! assert_err { ($expr:expr, $err:pat) => { match $expr { Err($err) => {} x => panic!("wrong result: {:?}", x), } }; } // Substitution char at end of template. assert_err!(exp.check("$", &with_names), Error::ParseError); // Substitution char not followed by a name or number. assert_err!(exp.check("$.", &with_names), Error::ParseError); // Empty delimiter pair. assert_err!(exp.check("${}", &with_names), Error::ParseError); // Unterminated delimiter pair. assert_err!(exp.check("${", &with_names), Error::ParseError); // Group 0 is always OK. assert!(exp.check("$0", &with_names).is_ok()); assert!(exp.check("$0", &without_names).is_ok()); // Can't use numbers with named groups. 
assert_err!(exp.check("$1", &with_names), Error::NamedBackrefOnly); assert_err!(exp.check("${1}", &with_names), Error::NamedBackrefOnly); // Unmatched group number. assert_err!(exp.check("$2", &without_names), Error::InvalidBackref); assert_err!(exp.check("${2}", &without_names), Error::InvalidBackref); // Unmatched group name. assert_err!(exp.check("$xx", &with_names), Error::InvalidBackref); assert_err!(exp.check("${xx}", &with_names), Error::InvalidBackref); } fancy-regex-0.7.1/tests/common/mod.rs000064400000000000000000000004340000000000000155760ustar 00000000000000use fancy_regex::Regex; pub fn regex(re: &str) -> Regex { let parse_result = Regex::new(re); assert!( parse_result.is_ok(), "Expected regex '{}' to be compiled successfully, got {:?}", re, parse_result.err() ); parse_result.unwrap() } fancy-regex-0.7.1/tests/finding.rs000064400000000000000000000154400000000000000151500ustar 00000000000000mod common; use fancy_regex::{Match, Regex}; use std::ops::Range; #[test] fn match_api() { let m = find_match(r"(\w+)", "... test").unwrap(); assert_eq!(m.range(), (4..8)); assert_eq!(Range::from(m), (4..8)); assert_eq!(m.as_str(), "test"); } #[test] fn find_wrap() { assert_eq!(find(r"(\w+)", "... test"), Some((4, 8))); assert_eq!(find(r"(?m)^yes$", "foo\nyes\n"), Some((4, 7))); } #[test] fn find_fancy_case_insensitive() { assert_eq!(find(r"(x|xy)\1", "XX"), None); assert_eq!(find(r"(x|xy)\1", "xx"), Some((0, 2))); assert_eq!(find(r"((?i:x|xy))\1", "XX"), Some((0, 2))); } #[test] fn lookahead_grouping_single_expression() { // These would fail if the delegate expression was `^x|a` (if we didn't // group as `^(?:x|a)`). assert_eq!(find(r"(?=x|a)", "a"), Some((0, 0))); assert_eq!(find(r"(?=x|a)", "bbba"), Some((3, 3))); } #[test] fn lookahead_grouping_multiple_expressions() { // These would fail if the delegate expression was `^ab|Bc` (if we didn't // preserve grouping of `(?:b|B)`). 
assert_eq!(find(r"(?=(?!x)a(?:b|B)c)", "aBc"), Some((0, 0))); assert_eq!(find(r"(?=(?!x)a(?:b|B)c)", "Bc"), None); } #[test] fn lookbehind_grouping_single_expression() { assert_eq!(find(r"(?<=x|a)", "a"), Some((1, 1))); assert_eq!(find(r"(?<=x|a)", "ba"), Some((2, 2))); assert_eq!(find(r"(?<=^a)", "a"), Some((1, 1))); assert_eq!(find(r"(?<=^a)", "ba"), None); } #[test] fn lookbehind_variable_sized_alt() { assert_eq!(find(r"(?<=a|bc)", "xxa"), Some((3, 3))); assert_eq!(find(r"(?<=a|bc)", "xxbc"), Some((4, 4))); assert_eq!(find(r"(?<=a|bc)", "xx"), None); assert_eq!(find(r"(?<=a|bc)", "xxb"), None); assert_eq!(find(r"(?<=a|bc)", "xxc"), None); assert!(Regex::new(r"(?<=a(?:b|cd))").is_err()); assert!(Regex::new(r"(?<=a+b+))").is_err()); } #[test] fn negative_lookbehind_variable_sized_alt() { assert_eq!(find(r"(? assert_eq!((mat.start(), mat.end()), (0, 2)), 1 => assert_eq!((mat.start(), mat.end()), (3, 5)), 2 => assert_eq!((mat.start(), mat.end()), (6, 8)), i => panic!("Expected 3 captures, got {}", i + 1), } } } #[test] fn find_iter_overlapping_lookahead() { let text = "abcdef"; for (i, mat) in common::regex(r"[a-z]{2}(?=[a-z])") .find_iter(text) .enumerate() { let mat = mat.unwrap(); match i { 0 => assert_eq!((mat.start(), mat.end()), (0, 2)), 1 => assert_eq!((mat.start(), mat.end()), (2, 4)), i => panic!("Expected 2 captures, got {}", i + 1), } } } #[test] fn find_iter_zero_length() { let text = "ab1c2"; for (i, mat) in common::regex(r"\d*(?=[a-z])").find_iter(text).enumerate() { let mat = mat.unwrap(); match i { 0 => assert_eq!((mat.start(), mat.end()), (0, 0)), 1 => assert_eq!((mat.start(), mat.end()), (1, 1)), 2 => assert_eq!((mat.start(), mat.end()), (2, 3)), i => panic!("Expected 3 captures, got {}", i + 1), } } } #[test] fn find_iter_zero_length_longer_codepoint() { let text = "é1é"; for (i, mat) in common::regex(r"\d*(?=é)").find_iter(text).enumerate() { let mat = mat.unwrap(); match i { 0 => assert_eq!((mat.start(), mat.end()), (0, 0)), 1 => 
assert_eq!((mat.start(), mat.end()), (2, 3)), i => panic!("Expected 2 captures, got {}", i + 1), } } } #[test] fn find_iter_attributes() { let text = "ab1c2"; let regex = common::regex(r"\d*(?=[a-z])"); let matches = regex.find_iter(text); assert_eq!(matches.text(), text); assert_eq!(regex.as_str(), matches.regex().as_str()); } fn find(re: &str, text: &str) -> Option<(usize, usize)> { find_match(re, text).map(|m| (m.start(), m.end())) } fn find_match<'t>(re: &str, text: &'t str) -> Option> { let regex = common::regex(re); let result = regex.find(text); assert!( result.is_ok(), "Expected find to succeed, but was {:?}", result ); result.unwrap() } #[test] fn incomplete_escape_sequences() { // See GH-76 assert!(Regex::new("\\u").is_err()); assert!(Regex::new("\\U").is_err()); assert!(Regex::new("\\x").is_err()); } fancy-regex-0.7.1/tests/matching.rs000064400000000000000000000072340000000000000153260ustar 00000000000000use fancy_regex::{Error, RegexBuilder}; mod common; #[test] fn control_character_escapes() { assert_match(r"\a", "\x07"); assert_match(r"\e", "\x1B"); assert_match(r"\f", "\x0C"); assert_match(r"\n", "\x0A"); assert_match(r"\r", "\x0D"); assert_match(r"\t", "\x09"); assert_match(r"\v", "\x0B"); } #[test] fn character_class_escapes() { assert_match(r"[\[]", "["); assert_match(r"[\^]", "^"); // The regex crate would reject the following because it's not necessary to escape them. // Other engines allow to escape any non-alphanumeric character. 
assert_match(r"[\<]", "<"); assert_match(r"[\>]", ">"); assert_match(r"[\.]", "."); assert_match(r"[\ ]", " "); // Character class escape assert_match(r"[\d]", "1"); // Control characters assert_match(r"[\e]", "\x1B"); assert_match(r"[\n]", "\x0A"); // `]` can be unescaped if it's right after `[` assert_match(r"[]]", "]"); // `]` can be unescaped even after `[^` assert_match(r"[^]]", "a"); } #[test] fn character_class_nested() { assert_match(r"[[a][bc]]", "c"); assert_match(r"[a[^b]]", "c"); } #[test] fn character_class_intersection() { assert_match(r"[\w&&a-c]", "c"); assert_no_match(r"[\w&&a-c]", "d"); assert_match(r"[[0-9]&&[^4]]", "1"); assert_no_match(r"[[0-9]&&[^4]]", "4"); } #[test] fn alternation_with_empty_arm() { assert_match(r"^(a|)$", "a"); assert_match(r"^(a|)$", ""); assert_match(r"^(|a)$", "a"); assert_match(r"^(|a)$", ""); assert_match(r"a|", "a"); assert_match(r"a|", ""); assert_match(r"|a", "a"); assert_match(r"|a", ""); assert_no_match(r"^(a|)$", "b"); } #[test] fn case_insensitive_character_class() { assert_match(r"^(?i)[a-z]+$", "aB"); } #[test] fn case_insensitive_escape() { // `\x61` is lowercase `a` assert_match(r"(?i)\x61", "A"); // `\p{Ll}` is the "Letter, lowercase" category assert_match(r"(?i)\p{Ll}", "A"); } #[test] fn atomic_group() { assert_match(r"^a(?>bc|b)c$", "abcc"); assert_no_match(r"^a(?>bc|b)c$", "abc"); // Look-ahead forces use of VM assert_match(r"^a(bc(?=d)|b)cd$", "abcd"); assert_no_match(r"^a(?>bc(?=d)|b)cd$", "abcd"); } #[test] fn backtrack_limit() { let re = RegexBuilder::new("(?i)(a|b|ab)*(?=c)") .backtrack_limit(100_000) .build() .unwrap(); let s = "abababababababababababababababababababababababababababab"; let result = re.is_match(s); assert!(result.is_err()); match result.err() { Some(Error::BacktrackLimitExceeded) => {} _ => panic!("Expected Error::BacktrackLimitExceeded"), } } #[test] fn end_of_hard_expression_cannot_be_delegated() { assert_match(r"(?!x)(?:a|ab)c", "abc"); // If `(?:a|ab)` is delegated, there's no 
backtracking and `a` matches and `ab` is never tried. assert_match(r"((?!x)(?:a|ab))c", "abc"); } #[cfg_attr(feature = "track_caller", track_caller)] fn assert_match(re: &str, text: &str) { let result = match_text(re, text); assert_eq!( result, true, "Expected regex '{}' to match text '{}'", re, text ); } #[cfg_attr(feature = "track_caller", track_caller)] fn assert_no_match(re: &str, text: &str) { let result = match_text(re, text); assert_eq!( result, false, "Expected regex '{}' to not match text '{}'", re, text ); } #[cfg_attr(feature = "track_caller", track_caller)] fn match_text(re: &str, text: &str) -> bool { let regex = common::regex(re); let result = regex.is_match(text); assert!( result.is_ok(), "Expected match to succeed, but was {:?}", result ); result.unwrap() } fancy-regex-0.7.1/tests/oniguruma.rs000064400000000000000000000200600000000000000155320ustar 00000000000000//! Run tests from Oniguruma's test suite, see `oniguruma/README.md` use std::collections::HashMap; use std::panic; use regex::Regex; use fancy_regex::Regex as FancyRegex; #[derive(Debug, Eq, Hash, PartialEq)] struct Test { source: String, pattern: String, text: String, assertion: Assertion, } #[derive(Debug, Eq, Hash, PartialEq)] enum Assertion { Match { group: usize, start: usize, end: usize, }, NoMatch, } /// Extract tests from the C source file (or the ignore file). /// /// Returns a vec of tuple of the test data and the comment for the test. 
fn parse_tests(test_source: &str) -> Vec<(Test, String)> { let mut tests = Vec::new(); let c_string = r#""((?:\\\\|\\"|[^"])*)""#; let re = Regex::new(&format!( r"(?m)((?:^ //.*\n)*)^\s*((x2|x3|n)\({},\s*{},?([^\)]+)\);)", c_string, c_string )) .unwrap(); for caps in re.captures_iter(test_source) { let comment = caps .get(1) .unwrap() .as_str() .replace(" // ", "") .trim() .to_string(); let source = caps.get(2).unwrap().as_str().to_string(); let kind = caps.get(3).unwrap().as_str(); let pattern = unescape(caps.get(4).unwrap().as_str()); let text = unescape(caps.get(5).unwrap().as_str()); let args: Vec = caps .get(6) .unwrap() .as_str() .split(",") .map(|s| s.trim().parse().unwrap()) .collect(); let assertion = match kind { "x2" => Assertion::Match { start: args[0], end: args[1], group: 0, }, "x3" => Assertion::Match { start: args[0], end: args[1], group: args[2], }, "n" => Assertion::NoMatch, _ => { panic!("Unexpected test type {}", kind); } }; let test = Test { source, pattern, text, assertion, }; tests.push((test, comment)); } tests } /// Unescape a string as it appears in C source. This is probably not a perfect implementation, but /// it's good enough for these tests. fn unescape(escaped: &str) -> String { let mut s: Vec = Vec::new(); let mut chars = escaped.chars(); while let Some(c) = chars.next() { match c { '\\' => { let next = chars.next().expect("Expected character after backslash"); match next { '\\' => { s.push(b'\\'); } '"' => { s.push(b'"'); } '?' => { // '?' has to be escaped in C to avoid trigraphs s.push(b'?'); } 'n' => { s.push(b'\n'); } 'r' => { s.push(b'\r'); } '0' => { // octal escape, e.g. \001 let mut octal = String::new(); octal.push(chars.next().expect("Expected character after \\0")); octal.push(chars.next().expect("Expected second character after \\0")); let num = u8::from_str_radix(&octal, 8).expect("Error parsing octal number"); s.push(num); } 'x' => { // hex escape, e.g. 
\x1f let mut hex = String::new(); hex.push(chars.next().expect("Expected character after \\x")); hex.push(chars.next().expect("Expected second character after \\x")); let num = u8::from_str_radix(&hex, 16).expect("Error parsing hex number"); s.push(num); } _ => { unimplemented!("Unknown escaped character {} in {}", next, escaped); } } } _ => { s.append(&mut c.to_string().into_bytes()); } } } // Some strings in the test are invalid UTF-8. We handle them via ignores. String::from_utf8_lossy(&s).to_string() } fn run_test(test: &Test) -> Option { let Test { pattern, text, assertion, .. } = test; let compile_result = FancyRegex::new(&pattern); if compile_result.is_err() { let error = format!("{:?}", compile_result.unwrap_err()); return Some(format!("Compile failed: {}", error)); } match *assertion { Assertion::Match { group, start, end } => { let result = panic::catch_unwind(|| { // compile regex again instead of using above, otherwise: // "may not be safely transferrable across a catch_unwind boundary" let regex = FancyRegex::new(&pattern).unwrap(); regex.captures(&text).unwrap() }); if let Ok(captures_result) = result { if let Some(captures) = captures_result { let m = captures.get(group).expect("Expected group to exist"); if m.start() != start || m.end() != end { Some(format!( "Match found at start {} and end {} (expected {} and {})", m.start(), m.end(), start, end )) } else { None } } else { Some("No match found".to_string()) } } else { Some("Panic while matching".to_string()) } } Assertion::NoMatch => { let regex = FancyRegex::new(&pattern).unwrap(); let result = regex.find(&text).unwrap(); if result.is_some() { Some("Match found".to_string()) } else { // We expected it not to match and it didn't -> good None } } } } #[test] fn oniguruma() { let tests: Vec = parse_tests(include_str!("oniguruma/test_utf8.c")) .into_iter() .map(|(test, _comment)| test) .collect(); let ignore: HashMap = parse_tests(include_str!("oniguruma/test_utf8_ignore.c")) .into_iter() .collect(); 
    // Counters for the summary printed at the end.
    let mut ignored = 0;
    let mut success = 0;

    for test in tests {
        let result = run_test(&test);
        if let Some(expected_failure) = ignore.get(&test) {
            assert!(result.is_some(), "Expected ignored test to fail, but it succeeded. Remove it from the ignore file: {}", &test.source);
            let failure = result.unwrap();
            assert!(failure.starts_with(expected_failure),
                "Expected failure differed for test, change it in the ignore file: {}\nExpected: {}\nActual : {}\n",
                &test.source,
                &expected_failure,
                &failure
            );
            ignored += 1;
        } else {
            if let Some(failure) = result {
                // This is a weird way to do the assertions, but the nice thing about it is that we
                // can run the tests without an "ignore" file and instead of failing, print the
                // content for the ignore file. To do that, disable the assert and enable the print:
                // println!(" // {}\n {}\n", failure, test.source);
                assert!(false, "Test {} failed: {}", &test.source, failure);
            } else {
                // println!("Success: {}", test.source);
                success += 1;
            }
        }
    }

    println!(
        "{} successful Oniguruma tests, {} ignored",
        success, ignored
    );
}
fancy-regex-0.7.1/tests/replace.rs000064400000000000000000000045210000000000000151430ustar 00000000000000use fancy_regex::{Captures, NoExpand};
use std::borrow::Cow;

// Test helper module; provides the `common::regex` constructor used below.
mod common;

/// `Replacer` impls for the string types (`&str`, `&String`, `String`).
#[test]
fn replacer_string() {
    let regex = common::regex(
        r"\b([sS])uc(?:cs|s?)e(ed(?:ed|ing|s?)|ss(?:es|ful(?:ly)?|i(?:ons?|ve(?:ly)?)|ors?)?)\b",
    );

    // Replacer impl for &str
    let result = regex.replace("a sucessful b", "${1}ucce$2");
    assert_eq!(result, "a successful b");

    // Replacer impl for &String
    let repl_string = "${1}ucce$2".to_string();
    let result = regex.replace("a Suceeded b", &repl_string);
    assert_eq!(result, "a Succeeded b");

    // Replacer impl for String
    let result = regex.replace("a sucessor b", repl_string);
    assert_eq!(result, "a successor b");
}

/// `Replacer` impls for `Cow` replacement strings.
#[test]
fn replacer_cow() {
    let regex = common::regex(r"\b([oO])mmi(?=t)t?(t(?:ed|ing)|s)\b");

    // Replacer impl for &Cow
    let result = regex.replace("a ommiting b", &Cow::from("${1}mit$2"));
    assert_eq!(result, "a omitting b");

    // Replacer for Cow::Borrowed
    let result = regex.replace("a ommited b", Cow::Borrowed("${1}mit$2"));
    assert_eq!(result, "a omitted b");

    // Replacer for Cow::Owned
    let result = regex.replace("a Ommits b", Cow::Owned("${1}mit$2".to_string()));
    assert_eq!(result, "a Omits b");
}

/// `NoExpand` suppresses `$group` expansion in the replacement.
#[test]
fn replacer_noexpand() {
    let regex = common::regex(r"\b([aA])n+ull(ar|ments?|s?)\b");

    // Replacer impl for NoExpand
    let result = regex.replace("a anullment b", NoExpand("${1}nnul$2"));
    assert_eq!(result, "a ${1}nnul$2 b");
}

/// A closure can be used as a `Replacer`.
#[test]
fn replacer_callback() {
    let regex = common::regex(r"\b([aA])p(?:p[or]|ro)x\.?(?=[ \)\n])");

    // Replacer impl for FnMut(&Captures)
    let result = regex.replace("a Aprox b", |cap: &Captures| {
        format!("{}pprox.", cap.get(1).unwrap().as_str())
    });
    assert_eq!(result, "a Approx. b");
}

/// `replace()` does only one replacement
#[test]
fn replace_one() {
    let regex = common::regex("bla");
    assert_eq!(regex.replace("blabla", "foo"), "foobla");
}

/// `replace_all()` replaces all non-overlapping matches
#[test]
fn replace_all() {
    let regex = common::regex("aa");
    assert_eq!(regex.replace_all("aaaa aaa aa a", "xx"), "xxxx xxa xx a");
}

/// `replacen()` replaces predefined number of times
#[test]
fn replacen() {
    let regex = common::regex("bla");
    assert_eq!(regex.replacen("blablabla", 2, "foo"), "foofoobla");
}